1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76 #include "expmed.h"
77
78 /* This file should be included last. */
79 #include "target-def.h"
80
81 /* Defined for convenience. */
82 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
83
84 /* Information about a legitimate vector immediate operand. */
85 struct simd_immediate_info
86 {
87 enum insn_type { MOV, MVN, INDEX, PTRUE };
88 enum modifier_type { LSL, MSL };
89
90 simd_immediate_info () {}
91 simd_immediate_info (scalar_float_mode, rtx);
92 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
93 insn_type = MOV, modifier_type = LSL,
94 unsigned int = 0);
95 simd_immediate_info (scalar_mode, rtx, rtx);
96 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
97
98 /* The mode of the elements. */
99 scalar_mode elt_mode;
100
101 /* The instruction to use to move the immediate into a vector. */
102 insn_type insn;
103
104 union
105 {
106 /* For MOV and MVN. */
107 struct
108 {
109 /* The value of each element. */
110 rtx value;
111
112 /* The kind of shift modifier to use, and the number of bits to shift.
113 This is (LSL, 0) if no shift is needed. */
114 modifier_type modifier;
115 unsigned int shift;
116 } mov;
117
118 /* For INDEX. */
119 struct
120 {
121 /* The value of the first element and the step to be added for each
122 subsequent element. */
123 rtx base, step;
124 } index;
125
126 /* For PTRUE. */
127 aarch64_svpattern pattern;
128 } u;
129 };
130
131 /* Construct a floating-point immediate in which each element has mode
132 ELT_MODE_IN and value VALUE_IN. */
133 inline simd_immediate_info
134 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
135 : elt_mode (elt_mode_in), insn (MOV)
136 {
137 u.mov.value = value_in;
138 u.mov.modifier = LSL;
139 u.mov.shift = 0;
140 }
141
142 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
143 and value VALUE_IN. The other parameters are as for the structure
144 fields. */
145 inline simd_immediate_info
146 ::simd_immediate_info (scalar_int_mode elt_mode_in,
147 unsigned HOST_WIDE_INT value_in,
148 insn_type insn_in, modifier_type modifier_in,
149 unsigned int shift_in)
150 : elt_mode (elt_mode_in), insn (insn_in)
151 {
152 u.mov.value = gen_int_mode (value_in, elt_mode_in);
153 u.mov.modifier = modifier_in;
154 u.mov.shift = shift_in;
155 }
156
157 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
158 and where element I is equal to BASE_IN + I * STEP_IN. */
159 inline simd_immediate_info
160 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
161 : elt_mode (elt_mode_in), insn (INDEX)
162 {
163 u.index.base = base_in;
164 u.index.step = step_in;
165 }
166
167 /* Construct a predicate that controls elements of mode ELT_MODE_IN
168 and has PTRUE pattern PATTERN_IN. */
169 inline simd_immediate_info
170 ::simd_immediate_info (scalar_int_mode elt_mode_in,
171 aarch64_svpattern pattern_in)
172 : elt_mode (elt_mode_in), insn (PTRUE)
173 {
174 u.pattern = pattern_in;
175 }
176
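/* Illustrative sketch (not part of the original source; the concrete values
   are hypothetical examples): how the constructors above might be filled in.

     simd_immediate_info splat (QImode, 0x2a);                // MOV-style splat
     simd_immediate_info series (SImode, base_rtx, step_rtx); // INDEX base, step
     simd_immediate_info all_true (SImode, AARCH64_SV_ALL);   // PTRUE ..., all

   where base_rtx and step_rtx stand for previously created CONST_INT rtxes.  */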
177 /* The current code model. */
178 enum aarch64_code_model aarch64_cmodel;
179
180 /* The number of 64-bit elements in an SVE vector. */
181 poly_uint16 aarch64_sve_vg;
182
183 #ifdef HAVE_AS_TLS
184 #undef TARGET_HAVE_TLS
185 #define TARGET_HAVE_TLS 1
186 #endif
187
188 static bool aarch64_composite_type_p (const_tree, machine_mode);
189 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
190 const_tree,
191 machine_mode *, int *,
192 bool *);
193 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
194 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
195 static void aarch64_override_options_after_change (void);
196 static bool aarch64_vector_mode_supported_p (machine_mode);
197 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
198 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
199 const_tree type,
200 int misalignment,
201 bool is_packed);
202 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
203 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
204 aarch64_addr_query_type);
205 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
206
207 /* Major revision number of the ARM Architecture implemented by the target. */
208 unsigned aarch64_architecture_version;
209
210 /* The processor for which instructions should be scheduled. */
211 enum aarch64_processor aarch64_tune = cortexa53;
212
213 /* Mask to specify which instruction scheduling options should be used. */
214 uint64_t aarch64_tune_flags = 0;
215
216 /* Global flag for PC-relative literal loads. */
217 bool aarch64_pcrelative_literal_loads;
218
219 /* Global flag for whether frame pointer is enabled. */
220 bool aarch64_use_frame_pointer;
221
222 #define BRANCH_PROTECT_STR_MAX 255
223 char *accepted_branch_protection_string = NULL;
224
225 static enum aarch64_parse_opt_result
226 aarch64_parse_branch_protection (const char*, char**);
227
228 /* Support for command line parsing of boolean flags in the tuning
229 structures. */
230 struct aarch64_flag_desc
231 {
232 const char* name;
233 unsigned int flag;
234 };
235
236 #define AARCH64_FUSION_PAIR(name, internal_name) \
237 { name, AARCH64_FUSE_##internal_name },
238 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
239 {
240 { "none", AARCH64_FUSE_NOTHING },
241 #include "aarch64-fusion-pairs.def"
242 { "all", AARCH64_FUSE_ALL },
243 { NULL, AARCH64_FUSE_NOTHING }
244 };
245
246 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
247 { name, AARCH64_EXTRA_TUNE_##internal_name },
248 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
249 {
250 { "none", AARCH64_EXTRA_TUNE_NONE },
251 #include "aarch64-tuning-flags.def"
252 { "all", AARCH64_EXTRA_TUNE_ALL },
253 { NULL, AARCH64_EXTRA_TUNE_NONE }
254 };
255
256 /* Tuning parameters. */
257
258 static const struct cpu_addrcost_table generic_addrcost_table =
259 {
260 {
261 1, /* hi */
262 0, /* si */
263 0, /* di */
264 1, /* ti */
265 },
266 0, /* pre_modify */
267 0, /* post_modify */
268 0, /* register_offset */
269 0, /* register_sextend */
270 0, /* register_zextend */
271 0 /* imm_offset */
272 };
273
274 static const struct cpu_addrcost_table exynosm1_addrcost_table =
275 {
276 {
277 0, /* hi */
278 0, /* si */
279 0, /* di */
280 2, /* ti */
281 },
282 0, /* pre_modify */
283 0, /* post_modify */
284 1, /* register_offset */
285 1, /* register_sextend */
286 2, /* register_zextend */
287 0, /* imm_offset */
288 };
289
290 static const struct cpu_addrcost_table xgene1_addrcost_table =
291 {
292 {
293 1, /* hi */
294 0, /* si */
295 0, /* di */
296 1, /* ti */
297 },
298 1, /* pre_modify */
299 1, /* post_modify */
300 0, /* register_offset */
301 1, /* register_sextend */
302 1, /* register_zextend */
303 0, /* imm_offset */
304 };
305
306 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
307 {
308 {
309 1, /* hi */
310 1, /* si */
311 1, /* di */
312 2, /* ti */
313 },
314 0, /* pre_modify */
315 0, /* post_modify */
316 2, /* register_offset */
317 3, /* register_sextend */
318 3, /* register_zextend */
319 0, /* imm_offset */
320 };
321
322 static const struct cpu_addrcost_table tsv110_addrcost_table =
323 {
324 {
325 1, /* hi */
326 0, /* si */
327 0, /* di */
328 1, /* ti */
329 },
330 0, /* pre_modify */
331 0, /* post_modify */
332 0, /* register_offset */
333 1, /* register_sextend */
334 1, /* register_zextend */
335 0, /* imm_offset */
336 };
337
338 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
339 {
340 {
341 1, /* hi */
342 1, /* si */
343 1, /* di */
344 2, /* ti */
345 },
346 1, /* pre_modify */
347 1, /* post_modify */
348 3, /* register_offset */
349 3, /* register_sextend */
350 3, /* register_zextend */
351 2, /* imm_offset */
352 };
353
354 static const struct cpu_regmove_cost generic_regmove_cost =
355 {
356 1, /* GP2GP */
357 /* Avoid the use of slow int<->fp moves for spilling by setting
358 their cost higher than memmov_cost. */
359 5, /* GP2FP */
360 5, /* FP2GP */
361 2 /* FP2FP */
362 };
363
364 static const struct cpu_regmove_cost cortexa57_regmove_cost =
365 {
366 1, /* GP2GP */
367 /* Avoid the use of slow int<->fp moves for spilling by setting
368 their cost higher than memmov_cost. */
369 5, /* GP2FP */
370 5, /* FP2GP */
371 2 /* FP2FP */
372 };
373
374 static const struct cpu_regmove_cost cortexa53_regmove_cost =
375 {
376 1, /* GP2GP */
377 /* Avoid the use of slow int<->fp moves for spilling by setting
378 their cost higher than memmov_cost. */
379 5, /* GP2FP */
380 5, /* FP2GP */
381 2 /* FP2FP */
382 };
383
384 static const struct cpu_regmove_cost exynosm1_regmove_cost =
385 {
386 1, /* GP2GP */
387 /* Avoid the use of slow int<->fp moves for spilling by setting
388 their cost higher than memmov_cost (actual costs: 4 and 9). */
389 9, /* GP2FP */
390 9, /* FP2GP */
391 1 /* FP2FP */
392 };
393
394 static const struct cpu_regmove_cost thunderx_regmove_cost =
395 {
396 2, /* GP2GP */
397 2, /* GP2FP */
398 6, /* FP2GP */
399 4 /* FP2FP */
400 };
401
402 static const struct cpu_regmove_cost xgene1_regmove_cost =
403 {
404 1, /* GP2GP */
405 /* Avoid the use of slow int<->fp moves for spilling by setting
406 their cost higher than memmov_cost. */
407 8, /* GP2FP */
408 8, /* FP2GP */
409 2 /* FP2FP */
410 };
411
412 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
413 {
414 2, /* GP2GP */
415 /* Avoid the use of int<->fp moves for spilling. */
416 6, /* GP2FP */
417 6, /* FP2GP */
418 4 /* FP2FP */
419 };
420
421 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
422 {
423 1, /* GP2GP */
424 /* Avoid the use of int<->fp moves for spilling. */
425 8, /* GP2FP */
426 8, /* FP2GP */
427 4 /* FP2FP */
428 };
429
430 static const struct cpu_regmove_cost tsv110_regmove_cost =
431 {
432 1, /* GP2GP */
433 /* Avoid the use of slow int<->fp moves for spilling by setting
434 their cost higher than memmov_cost. */
435 2, /* GP2FP */
436 3, /* FP2GP */
437 2 /* FP2FP */
438 };
439
440 /* Generic costs for vector insn classes. */
441 static const struct cpu_vector_cost generic_vector_cost =
442 {
443 1, /* scalar_int_stmt_cost */
444 1, /* scalar_fp_stmt_cost */
445 1, /* scalar_load_cost */
446 1, /* scalar_store_cost */
447 1, /* vec_int_stmt_cost */
448 1, /* vec_fp_stmt_cost */
449 2, /* vec_permute_cost */
450 1, /* vec_to_scalar_cost */
451 1, /* scalar_to_vec_cost */
452 1, /* vec_align_load_cost */
453 1, /* vec_unalign_load_cost */
454 1, /* vec_unalign_store_cost */
455 1, /* vec_store_cost */
456 3, /* cond_taken_branch_cost */
457 1 /* cond_not_taken_branch_cost */
458 };
459
460 /* QDF24XX costs for vector insn classes. */
461 static const struct cpu_vector_cost qdf24xx_vector_cost =
462 {
463 1, /* scalar_int_stmt_cost */
464 1, /* scalar_fp_stmt_cost */
465 1, /* scalar_load_cost */
466 1, /* scalar_store_cost */
467 1, /* vec_int_stmt_cost */
468 3, /* vec_fp_stmt_cost */
469 2, /* vec_permute_cost */
470 1, /* vec_to_scalar_cost */
471 1, /* scalar_to_vec_cost */
472 1, /* vec_align_load_cost */
473 1, /* vec_unalign_load_cost */
474 1, /* vec_unalign_store_cost */
475 1, /* vec_store_cost */
476 3, /* cond_taken_branch_cost */
477 1 /* cond_not_taken_branch_cost */
478 };
479
480 /* ThunderX costs for vector insn classes. */
481 static const struct cpu_vector_cost thunderx_vector_cost =
482 {
483 1, /* scalar_int_stmt_cost */
484 1, /* scalar_fp_stmt_cost */
485 3, /* scalar_load_cost */
486 1, /* scalar_store_cost */
487 4, /* vec_int_stmt_cost */
488 1, /* vec_fp_stmt_cost */
489 4, /* vec_permute_cost */
490 2, /* vec_to_scalar_cost */
491 2, /* scalar_to_vec_cost */
492 3, /* vec_align_load_cost */
493 5, /* vec_unalign_load_cost */
494 5, /* vec_unalign_store_cost */
495 1, /* vec_store_cost */
496 3, /* cond_taken_branch_cost */
497 3 /* cond_not_taken_branch_cost */
498 };
499
500 static const struct cpu_vector_cost tsv110_vector_cost =
501 {
502 1, /* scalar_int_stmt_cost */
503 1, /* scalar_fp_stmt_cost */
504 5, /* scalar_load_cost */
505 1, /* scalar_store_cost */
506 2, /* vec_int_stmt_cost */
507 2, /* vec_fp_stmt_cost */
508 2, /* vec_permute_cost */
509 3, /* vec_to_scalar_cost */
510 2, /* scalar_to_vec_cost */
511 5, /* vec_align_load_cost */
512 5, /* vec_unalign_load_cost */
513 1, /* vec_unalign_store_cost */
514 1, /* vec_store_cost */
515 1, /* cond_taken_branch_cost */
516 1 /* cond_not_taken_branch_cost */
517 };
518
519 /* Costs for vector insn classes for Cortex-A57. */
520 static const struct cpu_vector_cost cortexa57_vector_cost =
521 {
522 1, /* scalar_int_stmt_cost */
523 1, /* scalar_fp_stmt_cost */
524 4, /* scalar_load_cost */
525 1, /* scalar_store_cost */
526 2, /* vec_int_stmt_cost */
527 2, /* vec_fp_stmt_cost */
528 3, /* vec_permute_cost */
529 8, /* vec_to_scalar_cost */
530 8, /* scalar_to_vec_cost */
531 4, /* vec_align_load_cost */
532 4, /* vec_unalign_load_cost */
533 1, /* vec_unalign_store_cost */
534 1, /* vec_store_cost */
535 1, /* cond_taken_branch_cost */
536 1 /* cond_not_taken_branch_cost */
537 };
538
539 static const struct cpu_vector_cost exynosm1_vector_cost =
540 {
541 1, /* scalar_int_stmt_cost */
542 1, /* scalar_fp_stmt_cost */
543 5, /* scalar_load_cost */
544 1, /* scalar_store_cost */
545 3, /* vec_int_stmt_cost */
546 3, /* vec_fp_stmt_cost */
547 3, /* vec_permute_cost */
548 3, /* vec_to_scalar_cost */
549 3, /* scalar_to_vec_cost */
550 5, /* vec_align_load_cost */
551 5, /* vec_unalign_load_cost */
552 1, /* vec_unalign_store_cost */
553 1, /* vec_store_cost */
554 1, /* cond_taken_branch_cost */
555 1 /* cond_not_taken_branch_cost */
556 };
557
558 /* Costs for vector insn classes for X-Gene 1. */
559 static const struct cpu_vector_cost xgene1_vector_cost =
560 {
561 1, /* scalar_int_stmt_cost */
562 1, /* scalar_fp_stmt_cost */
563 5, /* scalar_load_cost */
564 1, /* scalar_store_cost */
565 2, /* vec_int_stmt_cost */
566 2, /* vec_fp_stmt_cost */
567 2, /* vec_permute_cost */
568 4, /* vec_to_scalar_cost */
569 4, /* scalar_to_vec_cost */
570 10, /* vec_align_load_cost */
571 10, /* vec_unalign_load_cost */
572 2, /* vec_unalign_store_cost */
573 2, /* vec_store_cost */
574 2, /* cond_taken_branch_cost */
575 1 /* cond_not_taken_branch_cost */
576 };
577
578 /* Costs for vector insn classes for Vulcan (ThunderX2 T99). */
579 static const struct cpu_vector_cost thunderx2t99_vector_cost =
580 {
581 1, /* scalar_int_stmt_cost */
582 6, /* scalar_fp_stmt_cost */
583 4, /* scalar_load_cost */
584 1, /* scalar_store_cost */
585 5, /* vec_int_stmt_cost */
586 6, /* vec_fp_stmt_cost */
587 3, /* vec_permute_cost */
588 6, /* vec_to_scalar_cost */
589 5, /* scalar_to_vec_cost */
590 8, /* vec_align_load_cost */
591 8, /* vec_unalign_load_cost */
592 4, /* vec_unalign_store_cost */
593 4, /* vec_store_cost */
594 2, /* cond_taken_branch_cost */
595 1 /* cond_not_taken_branch_cost */
596 };
597
598 /* Generic costs for branch instructions. */
599 static const struct cpu_branch_cost generic_branch_cost =
600 {
601 1, /* Predictable. */
602 3 /* Unpredictable. */
603 };
604
605 /* Generic approximation modes. */
606 static const cpu_approx_modes generic_approx_modes =
607 {
608 AARCH64_APPROX_NONE, /* division */
609 AARCH64_APPROX_NONE, /* sqrt */
610 AARCH64_APPROX_NONE /* recip_sqrt */
611 };
612
613 /* Approximation modes for Exynos M1. */
614 static const cpu_approx_modes exynosm1_approx_modes =
615 {
616 AARCH64_APPROX_NONE, /* division */
617 AARCH64_APPROX_ALL, /* sqrt */
618 AARCH64_APPROX_ALL /* recip_sqrt */
619 };
620
621 /* Approximation modes for X-Gene 1. */
622 static const cpu_approx_modes xgene1_approx_modes =
623 {
624 AARCH64_APPROX_NONE, /* division */
625 AARCH64_APPROX_NONE, /* sqrt */
626 AARCH64_APPROX_ALL /* recip_sqrt */
627 };
628
629 /* Generic prefetch settings (which disable prefetch). */
630 static const cpu_prefetch_tune generic_prefetch_tune =
631 {
632 0, /* num_slots */
633 -1, /* l1_cache_size */
634 -1, /* l1_cache_line_size */
635 -1, /* l2_cache_size */
636 true, /* prefetch_dynamic_strides */
637 -1, /* minimum_stride */
638 -1 /* default_opt_level */
639 };
640
641 static const cpu_prefetch_tune exynosm1_prefetch_tune =
642 {
643 0, /* num_slots */
644 -1, /* l1_cache_size */
645 64, /* l1_cache_line_size */
646 -1, /* l2_cache_size */
647 true, /* prefetch_dynamic_strides */
648 -1, /* minimum_stride */
649 -1 /* default_opt_level */
650 };
651
652 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
653 {
654 4, /* num_slots */
655 32, /* l1_cache_size */
656 64, /* l1_cache_line_size */
657 512, /* l2_cache_size */
658 false, /* prefetch_dynamic_strides */
659 2048, /* minimum_stride */
660 3 /* default_opt_level */
661 };
662
663 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
664 {
665 8, /* num_slots */
666 32, /* l1_cache_size */
667 128, /* l1_cache_line_size */
668 16*1024, /* l2_cache_size */
669 true, /* prefetch_dynamic_strides */
670 -1, /* minimum_stride */
671 3 /* default_opt_level */
672 };
673
674 static const cpu_prefetch_tune thunderx_prefetch_tune =
675 {
676 8, /* num_slots */
677 32, /* l1_cache_size */
678 128, /* l1_cache_line_size */
679 -1, /* l2_cache_size */
680 true, /* prefetch_dynamic_strides */
681 -1, /* minimum_stride */
682 -1 /* default_opt_level */
683 };
684
685 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
686 {
687 8, /* num_slots */
688 32, /* l1_cache_size */
689 64, /* l1_cache_line_size */
690 256, /* l2_cache_size */
691 true, /* prefetch_dynamic_strides */
692 -1, /* minimum_stride */
693 -1 /* default_opt_level */
694 };
695
696 static const cpu_prefetch_tune tsv110_prefetch_tune =
697 {
698 0, /* num_slots */
699 64, /* l1_cache_size */
700 64, /* l1_cache_line_size */
701 512, /* l2_cache_size */
702 true, /* prefetch_dynamic_strides */
703 -1, /* minimum_stride */
704 -1 /* default_opt_level */
705 };
706
707 static const cpu_prefetch_tune xgene1_prefetch_tune =
708 {
709 8, /* num_slots */
710 32, /* l1_cache_size */
711 64, /* l1_cache_line_size */
712 256, /* l2_cache_size */
713 true, /* prefetch_dynamic_strides */
714 -1, /* minimum_stride */
715 -1 /* default_opt_level */
716 };
717
718 static const struct tune_params generic_tunings =
719 {
720 &cortexa57_extra_costs,
721 &generic_addrcost_table,
722 &generic_regmove_cost,
723 &generic_vector_cost,
724 &generic_branch_cost,
725 &generic_approx_modes,
726 SVE_NOT_IMPLEMENTED, /* sve_width */
727 4, /* memmov_cost */
728 2, /* issue_rate */
729 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
730 "16:12", /* function_align. */
731 "4", /* jump_align. */
732 "8", /* loop_align. */
733 2, /* int_reassoc_width. */
734 4, /* fp_reassoc_width. */
735 1, /* vec_reassoc_width. */
736 2, /* min_div_recip_mul_sf. */
737 2, /* min_div_recip_mul_df. */
738 0, /* max_case_values. */
739 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
740 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
741 &generic_prefetch_tune
742 };
743
744 static const struct tune_params cortexa35_tunings =
745 {
746 &cortexa53_extra_costs,
747 &generic_addrcost_table,
748 &cortexa53_regmove_cost,
749 &generic_vector_cost,
750 &generic_branch_cost,
751 &generic_approx_modes,
752 SVE_NOT_IMPLEMENTED, /* sve_width */
753 4, /* memmov_cost */
754 1, /* issue_rate */
755 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
756 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
757 "16", /* function_align. */
758 "4", /* jump_align. */
759 "8", /* loop_align. */
760 2, /* int_reassoc_width. */
761 4, /* fp_reassoc_width. */
762 1, /* vec_reassoc_width. */
763 2, /* min_div_recip_mul_sf. */
764 2, /* min_div_recip_mul_df. */
765 0, /* max_case_values. */
766 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
767 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
768 &generic_prefetch_tune
769 };
770
771 static const struct tune_params cortexa53_tunings =
772 {
773 &cortexa53_extra_costs,
774 &generic_addrcost_table,
775 &cortexa53_regmove_cost,
776 &generic_vector_cost,
777 &generic_branch_cost,
778 &generic_approx_modes,
779 SVE_NOT_IMPLEMENTED, /* sve_width */
780 4, /* memmov_cost */
781 2, /* issue_rate */
782 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
783 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
784 "16", /* function_align. */
785 "4", /* jump_align. */
786 "8", /* loop_align. */
787 2, /* int_reassoc_width. */
788 4, /* fp_reassoc_width. */
789 1, /* vec_reassoc_width. */
790 2, /* min_div_recip_mul_sf. */
791 2, /* min_div_recip_mul_df. */
792 0, /* max_case_values. */
793 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
794 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
795 &generic_prefetch_tune
796 };
797
798 static const struct tune_params cortexa57_tunings =
799 {
800 &cortexa57_extra_costs,
801 &generic_addrcost_table,
802 &cortexa57_regmove_cost,
803 &cortexa57_vector_cost,
804 &generic_branch_cost,
805 &generic_approx_modes,
806 SVE_NOT_IMPLEMENTED, /* sve_width */
807 4, /* memmov_cost */
808 3, /* issue_rate */
809 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
810 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
811 "16", /* function_align. */
812 "4", /* jump_align. */
813 "8", /* loop_align. */
814 2, /* int_reassoc_width. */
815 4, /* fp_reassoc_width. */
816 1, /* vec_reassoc_width. */
817 2, /* min_div_recip_mul_sf. */
818 2, /* min_div_recip_mul_df. */
819 0, /* max_case_values. */
820 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
821 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
822 &generic_prefetch_tune
823 };
824
825 static const struct tune_params cortexa72_tunings =
826 {
827 &cortexa57_extra_costs,
828 &generic_addrcost_table,
829 &cortexa57_regmove_cost,
830 &cortexa57_vector_cost,
831 &generic_branch_cost,
832 &generic_approx_modes,
833 SVE_NOT_IMPLEMENTED, /* sve_width */
834 4, /* memmov_cost */
835 3, /* issue_rate */
836 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
837 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
838 "16", /* function_align. */
839 "4", /* jump_align. */
840 "8", /* loop_align. */
841 2, /* int_reassoc_width. */
842 4, /* fp_reassoc_width. */
843 1, /* vec_reassoc_width. */
844 2, /* min_div_recip_mul_sf. */
845 2, /* min_div_recip_mul_df. */
846 0, /* max_case_values. */
847 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
848 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
849 &generic_prefetch_tune
850 };
851
852 static const struct tune_params cortexa73_tunings =
853 {
854 &cortexa57_extra_costs,
855 &generic_addrcost_table,
856 &cortexa57_regmove_cost,
857 &cortexa57_vector_cost,
858 &generic_branch_cost,
859 &generic_approx_modes,
860 SVE_NOT_IMPLEMENTED, /* sve_width */
861 4, /* memmov_cost. */
862 2, /* issue_rate. */
863 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
864 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
865 "16", /* function_align. */
866 "4", /* jump_align. */
867 "8", /* loop_align. */
868 2, /* int_reassoc_width. */
869 4, /* fp_reassoc_width. */
870 1, /* vec_reassoc_width. */
871 2, /* min_div_recip_mul_sf. */
872 2, /* min_div_recip_mul_df. */
873 0, /* max_case_values. */
874 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
875 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
876 &generic_prefetch_tune
877 };
878
879
880
881 static const struct tune_params exynosm1_tunings =
882 {
883 &exynosm1_extra_costs,
884 &exynosm1_addrcost_table,
885 &exynosm1_regmove_cost,
886 &exynosm1_vector_cost,
887 &generic_branch_cost,
888 &exynosm1_approx_modes,
889 SVE_NOT_IMPLEMENTED, /* sve_width */
890 4, /* memmov_cost */
891 3, /* issue_rate */
892 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
893 "4", /* function_align. */
894 "4", /* jump_align. */
895 "4", /* loop_align. */
896 2, /* int_reassoc_width. */
897 4, /* fp_reassoc_width. */
898 1, /* vec_reassoc_width. */
899 2, /* min_div_recip_mul_sf. */
900 2, /* min_div_recip_mul_df. */
901 48, /* max_case_values. */
902 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
903 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
904 &exynosm1_prefetch_tune
905 };
906
907 static const struct tune_params thunderxt88_tunings =
908 {
909 &thunderx_extra_costs,
910 &generic_addrcost_table,
911 &thunderx_regmove_cost,
912 &thunderx_vector_cost,
913 &generic_branch_cost,
914 &generic_approx_modes,
915 SVE_NOT_IMPLEMENTED, /* sve_width */
916 6, /* memmov_cost */
917 2, /* issue_rate */
918 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
919 "8", /* function_align. */
920 "8", /* jump_align. */
921 "8", /* loop_align. */
922 2, /* int_reassoc_width. */
923 4, /* fp_reassoc_width. */
924 1, /* vec_reassoc_width. */
925 2, /* min_div_recip_mul_sf. */
926 2, /* min_div_recip_mul_df. */
927 0, /* max_case_values. */
928 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
929 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
930 &thunderxt88_prefetch_tune
931 };
932
933 static const struct tune_params thunderx_tunings =
934 {
935 &thunderx_extra_costs,
936 &generic_addrcost_table,
937 &thunderx_regmove_cost,
938 &thunderx_vector_cost,
939 &generic_branch_cost,
940 &generic_approx_modes,
941 SVE_NOT_IMPLEMENTED, /* sve_width */
942 6, /* memmov_cost */
943 2, /* issue_rate */
944 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
945 "8", /* function_align. */
946 "8", /* jump_align. */
947 "8", /* loop_align. */
948 2, /* int_reassoc_width. */
949 4, /* fp_reassoc_width. */
950 1, /* vec_reassoc_width. */
951 2, /* min_div_recip_mul_sf. */
952 2, /* min_div_recip_mul_df. */
953 0, /* max_case_values. */
954 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
955 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
956 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
957 &thunderx_prefetch_tune
958 };
959
960 static const struct tune_params tsv110_tunings =
961 {
962 &tsv110_extra_costs,
963 &tsv110_addrcost_table,
964 &tsv110_regmove_cost,
965 &tsv110_vector_cost,
966 &generic_branch_cost,
967 &generic_approx_modes,
968 SVE_NOT_IMPLEMENTED, /* sve_width */
969 4, /* memmov_cost */
970 4, /* issue_rate */
971 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
972 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
973 "16", /* function_align. */
974 "4", /* jump_align. */
975 "8", /* loop_align. */
976 2, /* int_reassoc_width. */
977 4, /* fp_reassoc_width. */
978 1, /* vec_reassoc_width. */
979 2, /* min_div_recip_mul_sf. */
980 2, /* min_div_recip_mul_df. */
981 0, /* max_case_values. */
982 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
983 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
984 &tsv110_prefetch_tune
985 };
986
987 static const struct tune_params xgene1_tunings =
988 {
989 &xgene1_extra_costs,
990 &xgene1_addrcost_table,
991 &xgene1_regmove_cost,
992 &xgene1_vector_cost,
993 &generic_branch_cost,
994 &xgene1_approx_modes,
995 SVE_NOT_IMPLEMENTED, /* sve_width */
996 6, /* memmov_cost */
997 4, /* issue_rate */
998 AARCH64_FUSE_NOTHING, /* fusible_ops */
999 "16", /* function_align. */
1000 "16", /* jump_align. */
1001 "16", /* loop_align. */
1002 2, /* int_reassoc_width. */
1003 4, /* fp_reassoc_width. */
1004 1, /* vec_reassoc_width. */
1005 2, /* min_div_recip_mul_sf. */
1006 2, /* min_div_recip_mul_df. */
1007 17, /* max_case_values. */
1008 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1009 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1010 &xgene1_prefetch_tune
1011 };
1012
1013 static const struct tune_params emag_tunings =
1014 {
1015 &xgene1_extra_costs,
1016 &xgene1_addrcost_table,
1017 &xgene1_regmove_cost,
1018 &xgene1_vector_cost,
1019 &generic_branch_cost,
1020 &xgene1_approx_modes,
1021 SVE_NOT_IMPLEMENTED,
1022 6, /* memmov_cost */
1023 4, /* issue_rate */
1024 AARCH64_FUSE_NOTHING, /* fusible_ops */
1025 "16", /* function_align. */
1026 "16", /* jump_align. */
1027 "16", /* loop_align. */
1028 2, /* int_reassoc_width. */
1029 4, /* fp_reassoc_width. */
1030 1, /* vec_reassoc_width. */
1031 2, /* min_div_recip_mul_sf. */
1032 2, /* min_div_recip_mul_df. */
1033 17, /* max_case_values. */
1034 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1035 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1036 &xgene1_prefetch_tune
1037 };
1038
1039 static const struct tune_params qdf24xx_tunings =
1040 {
1041 &qdf24xx_extra_costs,
1042 &qdf24xx_addrcost_table,
1043 &qdf24xx_regmove_cost,
1044 &qdf24xx_vector_cost,
1045 &generic_branch_cost,
1046 &generic_approx_modes,
1047 SVE_NOT_IMPLEMENTED, /* sve_width */
1048 4, /* memmov_cost */
1049 4, /* issue_rate */
1050 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1051 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1052 "16", /* function_align. */
1053 "8", /* jump_align. */
1054 "16", /* loop_align. */
1055 2, /* int_reassoc_width. */
1056 4, /* fp_reassoc_width. */
1057 1, /* vec_reassoc_width. */
1058 2, /* min_div_recip_mul_sf. */
1059 2, /* min_div_recip_mul_df. */
1060 0, /* max_case_values. */
1061 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1062 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1063 &qdf24xx_prefetch_tune
1064 };
1065
1066 /* Tuning structure for the Qualcomm Saphira core. Default to generic values
1067 for now. */
1068 static const struct tune_params saphira_tunings =
1069 {
1070 &generic_extra_costs,
1071 &generic_addrcost_table,
1072 &generic_regmove_cost,
1073 &generic_vector_cost,
1074 &generic_branch_cost,
1075 &generic_approx_modes,
1076 SVE_NOT_IMPLEMENTED, /* sve_width */
1077 4, /* memmov_cost */
1078 4, /* issue_rate */
1079 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1080 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1081 "16", /* function_align. */
1082 "8", /* jump_align. */
1083 "16", /* loop_align. */
1084 2, /* int_reassoc_width. */
1085 4, /* fp_reassoc_width. */
1086 1, /* vec_reassoc_width. */
1087 2, /* min_div_recip_mul_sf. */
1088 2, /* min_div_recip_mul_df. */
1089 0, /* max_case_values. */
1090 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1091 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1092 &generic_prefetch_tune
1093 };
1094
1095 static const struct tune_params thunderx2t99_tunings =
1096 {
1097 &thunderx2t99_extra_costs,
1098 &thunderx2t99_addrcost_table,
1099 &thunderx2t99_regmove_cost,
1100 &thunderx2t99_vector_cost,
1101 &generic_branch_cost,
1102 &generic_approx_modes,
1103 SVE_NOT_IMPLEMENTED, /* sve_width */
1104 4, /* memmov_cost. */
1105 4, /* issue_rate. */
1106 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1107 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1108 "16", /* function_align. */
1109 "8", /* jump_align. */
1110 "16", /* loop_align. */
1111 3, /* int_reassoc_width. */
1112 2, /* fp_reassoc_width. */
1113 2, /* vec_reassoc_width. */
1114 2, /* min_div_recip_mul_sf. */
1115 2, /* min_div_recip_mul_df. */
1116 0, /* max_case_values. */
1117 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1118 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1119 &thunderx2t99_prefetch_tune
1120 };
1121
1122 static const struct tune_params neoversen1_tunings =
1123 {
1124 &cortexa57_extra_costs,
1125 &generic_addrcost_table,
1126 &generic_regmove_cost,
1127 &cortexa57_vector_cost,
1128 &generic_branch_cost,
1129 &generic_approx_modes,
1130 SVE_NOT_IMPLEMENTED, /* sve_width */
1131 4, /* memmov_cost */
1132 3, /* issue_rate */
1133 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1134 "32:16", /* function_align. */
1135 "32:16", /* jump_align. */
1136 "32:16", /* loop_align. */
1137 2, /* int_reassoc_width. */
1138 4, /* fp_reassoc_width. */
1139 2, /* vec_reassoc_width. */
1140 2, /* min_div_recip_mul_sf. */
1141 2, /* min_div_recip_mul_df. */
1142 0, /* max_case_values. */
1143 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1144 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1145 &generic_prefetch_tune
1146 };
1147
1148 /* Support for fine-grained override of the tuning structures. */
1149 struct aarch64_tuning_override_function
1150 {
1151 const char* name;
1152 void (*parse_override)(const char*, struct tune_params*);
1153 };
1154
1155 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1156 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1157 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1158
1159 static const struct aarch64_tuning_override_function
1160 aarch64_tuning_override_functions[] =
1161 {
1162 { "fuse", aarch64_parse_fuse_string },
1163 { "tune", aarch64_parse_tune_string },
1164 { "sve_width", aarch64_parse_sve_width_string },
1165 { NULL, NULL }
1166 };
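/* Illustrative sketch, assuming the comma-separated name=value syntax that
   -moverride uses for these overrides (the concrete values below are
   hypothetical examples):

     -moverride=tune=no_ldp_stp_qregs,sve_width=256

   is split into "tune=no_ldp_stp_qregs" and "sve_width=256", and the table
   above dispatches each token to aarch64_parse_tune_string and
   aarch64_parse_sve_width_string respectively.  */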
1167
1168 /* A processor implementing AArch64. */
1169 struct processor
1170 {
1171 const char *const name;
1172 enum aarch64_processor ident;
1173 enum aarch64_processor sched_core;
1174 enum aarch64_arch arch;
1175 unsigned architecture_version;
1176 const uint64_t flags;
1177 const struct tune_params *const tune;
1178 };
1179
1180 /* Architectures implementing AArch64. */
1181 static const struct processor all_architectures[] =
1182 {
1183 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1184 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1185 #include "aarch64-arches.def"
1186 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1187 };
1188
1189 /* Processor cores implementing AArch64. */
1190 static const struct processor all_cores[] =
1191 {
1192 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1193 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1194 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1195 FLAGS, &COSTS##_tunings},
1196 #include "aarch64-cores.def"
1197 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1198 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1199 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1200 };
1201
1202
1203 /* Target specification. These are populated by the -march, -mtune, -mcpu
1204 handling code or by target attributes. */
1205 static const struct processor *selected_arch;
1206 static const struct processor *selected_cpu;
1207 static const struct processor *selected_tune;
1208
1209 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1210
1211 /* The current tuning set. */
1212 struct tune_params aarch64_tune_params = generic_tunings;
1213
1214 /* Table of machine attributes. */
1215 static const struct attribute_spec aarch64_attribute_table[] =
1216 {
1217 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1218 affects_type_identity, handler, exclude } */
1219 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL },
1220 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1221 };
1222
1223 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1224
1225 /* An ISA extension in the co-processor and main instruction set space. */
1226 struct aarch64_option_extension
1227 {
1228 const char *const name;
1229 const unsigned long flags_on;
1230 const unsigned long flags_off;
1231 };
1232
1233 typedef enum aarch64_cond_code
1234 {
1235 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1236 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1237 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1238 }
1239 aarch64_cc;
1240
1241 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
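/* Worked example: AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) is AARCH64_NE
   and AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) is AARCH64_LT, because the
   enumeration above places each condition next to its inverse (0/1, 2/3, ...),
   so flipping the low bit inverts the condition.  */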
1242
1243 struct aarch64_branch_protect_type
1244 {
1245 /* The type's name that the user passes to the branch-protection option
1246 string. */
1247 const char* name;
1248 /* Function to handle the protection type and set global variables.
1249 First argument is the string token corresponding to this type and the
1250 second argument is the next token in the option string.
1251 Return values:
1252 * AARCH64_PARSE_OK: Handling was successful.
1253 * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the caller
1254 should print an error.
1255 * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler prints its
1256 own error. */
1257 enum aarch64_parse_opt_result (*handler)(char*, char*);
1258 /* A list of types that can follow this type in the option string. */
1259 const aarch64_branch_protect_type* subtypes;
1260 unsigned int num_subtypes;
1261 };
1262
1263 static enum aarch64_parse_opt_result
1264 aarch64_handle_no_branch_protection (char* str, char* rest)
1265 {
1266 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1267 aarch64_enable_bti = 0;
1268 if (rest)
1269 {
1270 error ("unexpected %<%s%> after %<%s%>", rest, str);
1271 return AARCH64_PARSE_INVALID_FEATURE;
1272 }
1273 return AARCH64_PARSE_OK;
1274 }
1275
1276 static enum aarch64_parse_opt_result
1277 aarch64_handle_standard_branch_protection (char* str, char* rest)
1278 {
1279 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1280 aarch64_ra_sign_key = AARCH64_KEY_A;
1281 aarch64_enable_bti = 1;
1282 if (rest)
1283 {
1284 error ("unexpected %<%s%> after %<%s%>", rest, str);
1285 return AARCH64_PARSE_INVALID_FEATURE;
1286 }
1287 return AARCH64_PARSE_OK;
1288 }
1289
1290 static enum aarch64_parse_opt_result
1291 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1292 char* rest ATTRIBUTE_UNUSED)
1293 {
1294 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1295 aarch64_ra_sign_key = AARCH64_KEY_A;
1296 return AARCH64_PARSE_OK;
1297 }
1298
1299 static enum aarch64_parse_opt_result
1300 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1301 char* rest ATTRIBUTE_UNUSED)
1302 {
1303 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1304 return AARCH64_PARSE_OK;
1305 }
1306
1307 static enum aarch64_parse_opt_result
1308 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1309 char* rest ATTRIBUTE_UNUSED)
1310 {
1311 aarch64_ra_sign_key = AARCH64_KEY_B;
1312 return AARCH64_PARSE_OK;
1313 }
1314
1315 static enum aarch64_parse_opt_result
1316 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1317 char* rest ATTRIBUTE_UNUSED)
1318 {
1319 aarch64_enable_bti = 1;
1320 return AARCH64_PARSE_OK;
1321 }
1322
1323 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1324 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1325 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1326 { NULL, NULL, NULL, 0 }
1327 };
1328
1329 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1330 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1331 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1332 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1333 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1334 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1335 { NULL, NULL, NULL, 0 }
1336 };
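/* Illustrative example of how the tables above are used: for
   -mbranch-protection=pac-ret+leaf, the "pac-ret" handler enables return
   address signing for non-leaf functions with key A, and the "leaf" subtype
   handler then widens the scope to all functions.  "standard" behaves like
   enabling both pac-ret (key A, non-leaf) and bti.  */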
1337
1338 /* The condition codes of the processor, and the inverse function. */
1339 static const char * const aarch64_condition_codes[] =
1340 {
1341 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1342 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1343 };
1344
1345 /* The preferred condition codes for SVE conditions. */
1346 static const char *const aarch64_sve_condition_codes[] =
1347 {
1348 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1349 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1350 };
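/* For example, condition AARCH64_HI (index 8) is printed as "hi" for scalar
   flags and as its SVE alias "pmore" for predicate results; both tables are
   indexed by the aarch64_cc values defined above.  */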
1351
1352 /* Return the assembly token for svpattern value PATTERN. */
1353
1354 static const char *
1355 svpattern_token (enum aarch64_svpattern pattern)
1356 {
1357 switch (pattern)
1358 {
1359 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1360 AARCH64_FOR_SVPATTERN (CASE)
1361 #undef CASE
1362 case AARCH64_NUM_SVPATTERNS:
1363 break;
1364 }
1365 gcc_unreachable ();
1366 }
1367
1368 /* Generate code for conditional branches in functions over 1 MiB, where the target can be beyond the +/-1 MiB range of a single conditional branch. */
1369 const char *
1370 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1371 const char * branch_format)
1372 {
1373 rtx_code_label * tmp_label = gen_label_rtx ();
1374 char label_buf[256];
1375 char buffer[128];
1376 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1377 CODE_LABEL_NUMBER (tmp_label));
1378 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1379 rtx dest_label = operands[pos_label];
1380 operands[pos_label] = tmp_label;
1381
1382 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1383 output_asm_insn (buffer, operands);
1384
1385 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1386 operands[pos_label] = dest_label;
1387 output_asm_insn (buffer, operands);
1388 return "";
1389 }
1390
1391 void
1392 aarch64_err_no_fpadvsimd (machine_mode mode)
1393 {
1394 if (TARGET_GENERAL_REGS_ONLY)
1395 if (FLOAT_MODE_P (mode))
1396 error ("%qs is incompatible with the use of floating-point types",
1397 "-mgeneral-regs-only");
1398 else
1399 error ("%qs is incompatible with the use of vector types",
1400 "-mgeneral-regs-only");
1401 else
1402 if (FLOAT_MODE_P (mode))
1403 error ("%qs feature modifier is incompatible with the use of"
1404 " floating-point types", "+nofp");
1405 else
1406 error ("%qs feature modifier is incompatible with the use of"
1407 " vector types", "+nofp");
1408 }
1409
1410 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1411 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1412 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1413 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1414 and GENERAL_REGS is lower than the memory cost (in this case the best class
1415 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1416 cost results in bad allocations with many redundant int<->FP moves which
1417 are expensive on various cores.
1418 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1419 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1420 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1421 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1422 The result of this is that it is no longer inefficient to have a higher
1423 memory move cost than the register move cost.
1424 */
1425
1426 static reg_class_t
1427 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1428 reg_class_t best_class)
1429 {
1430 machine_mode mode;
1431
1432 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1433 || !reg_class_subset_p (FP_REGS, allocno_class))
1434 return allocno_class;
1435
1436 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1437 || !reg_class_subset_p (FP_REGS, best_class))
1438 return best_class;
1439
1440 mode = PSEUDO_REGNO_MODE (regno);
1441 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1442 }
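/* Minimal worked example of the hook above: for a DFmode pseudo whose allocno
   class and best class are both POINTER_AND_FP_REGS (so neither early return
   triggers), the mode decides and FP_REGS is chosen; an SImode pseudo in the
   same situation gets GENERAL_REGS.  */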
1443
1444 static unsigned int
1445 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1446 {
1447 if (GET_MODE_UNIT_SIZE (mode) == 4)
1448 return aarch64_tune_params.min_div_recip_mul_sf;
1449 return aarch64_tune_params.min_div_recip_mul_df;
1450 }
1451
1452 /* Return the reassociation width of treeop OPC with mode MODE. */
1453 static int
1454 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1455 {
1456 if (VECTOR_MODE_P (mode))
1457 return aarch64_tune_params.vec_reassoc_width;
1458 if (INTEGRAL_MODE_P (mode))
1459 return aarch64_tune_params.int_reassoc_width;
1460 /* Avoid reassociating floating point addition so we emit more FMAs. */
1461 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1462 return aarch64_tune_params.fp_reassoc_width;
1463 return 1;
1464 }
1465
1466 /* Provide a mapping from GCC register numbers to DWARF register numbers. */
1467 unsigned
1468 aarch64_dbx_register_number (unsigned regno)
1469 {
1470 if (GP_REGNUM_P (regno))
1471 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1472 else if (regno == SP_REGNUM)
1473 return AARCH64_DWARF_SP;
1474 else if (FP_REGNUM_P (regno))
1475 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1476 else if (PR_REGNUM_P (regno))
1477 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1478 else if (regno == VG_REGNUM)
1479 return AARCH64_DWARF_VG;
1480
1481 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1482 equivalent DWARF register. */
1483 return DWARF_FRAME_REGISTERS;
1484 }
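/* For instance, assuming the AAPCS64 DWARF numbering in which AARCH64_DWARF_R0
   is 0 and AARCH64_DWARF_V0 is 64, x5 maps to DWARF register 5 and v3 maps to
   DWARF register 67; registers with no DWARF equivalent fall through to the
   DWARF_FRAME_REGISTERS sentinel value.  */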
1485
1486 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1487 integer, otherwise return X unmodified. */
1488 static rtx
1489 aarch64_bit_representation (rtx x)
1490 {
1491 if (CONST_DOUBLE_P (x))
1492 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
1493 return x;
1494 }
1495
1496 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1497 static bool
1498 aarch64_advsimd_struct_mode_p (machine_mode mode)
1499 {
1500 return (TARGET_SIMD
1501 && (mode == OImode || mode == CImode || mode == XImode));
1502 }
1503
1504 /* Return true if MODE is an SVE predicate mode. */
1505 static bool
1506 aarch64_sve_pred_mode_p (machine_mode mode)
1507 {
1508 return (TARGET_SVE
1509 && (mode == VNx16BImode
1510 || mode == VNx8BImode
1511 || mode == VNx4BImode
1512 || mode == VNx2BImode));
1513 }
1514
1515 /* Three mutually-exclusive flags describing a vector or predicate type. */
1516 const unsigned int VEC_ADVSIMD = 1;
1517 const unsigned int VEC_SVE_DATA = 2;
1518 const unsigned int VEC_SVE_PRED = 4;
1519 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1520 a structure of 2, 3 or 4 vectors. */
1521 const unsigned int VEC_STRUCT = 8;
1522 /* Useful combinations of the above. */
1523 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1524 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1525
1526 /* Return a set of flags describing the vector properties of mode MODE.
1527 Ignore modes that are not supported by the current target. */
1528 static unsigned int
1529 aarch64_classify_vector_mode (machine_mode mode)
1530 {
1531 if (aarch64_advsimd_struct_mode_p (mode))
1532 return VEC_ADVSIMD | VEC_STRUCT;
1533
1534 if (aarch64_sve_pred_mode_p (mode))
1535 return VEC_SVE_PRED;
1536
1537 /* Make the decision based on the mode's enum value rather than its
1538 properties, so that we keep the correct classification regardless
1539 of -msve-vector-bits. */
1540 switch (mode)
1541 {
1542 /* Single SVE vectors. */
1543 case E_VNx16QImode:
1544 case E_VNx8HImode:
1545 case E_VNx4SImode:
1546 case E_VNx2DImode:
1547 case E_VNx8HFmode:
1548 case E_VNx4SFmode:
1549 case E_VNx2DFmode:
1550 return TARGET_SVE ? VEC_SVE_DATA : 0;
1551
1552 /* x2 SVE vectors. */
1553 case E_VNx32QImode:
1554 case E_VNx16HImode:
1555 case E_VNx8SImode:
1556 case E_VNx4DImode:
1557 case E_VNx16HFmode:
1558 case E_VNx8SFmode:
1559 case E_VNx4DFmode:
1560 /* x3 SVE vectors. */
1561 case E_VNx48QImode:
1562 case E_VNx24HImode:
1563 case E_VNx12SImode:
1564 case E_VNx6DImode:
1565 case E_VNx24HFmode:
1566 case E_VNx12SFmode:
1567 case E_VNx6DFmode:
1568 /* x4 SVE vectors. */
1569 case E_VNx64QImode:
1570 case E_VNx32HImode:
1571 case E_VNx16SImode:
1572 case E_VNx8DImode:
1573 case E_VNx32HFmode:
1574 case E_VNx16SFmode:
1575 case E_VNx8DFmode:
1576 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1577
1578 /* 64-bit Advanced SIMD vectors. */
1579 case E_V8QImode:
1580 case E_V4HImode:
1581 case E_V2SImode:
1582 /* ...E_V1DImode doesn't exist. */
1583 case E_V4HFmode:
1584 case E_V2SFmode:
1585 case E_V1DFmode:
1586 /* 128-bit Advanced SIMD vectors. */
1587 case E_V16QImode:
1588 case E_V8HImode:
1589 case E_V4SImode:
1590 case E_V2DImode:
1591 case E_V8HFmode:
1592 case E_V4SFmode:
1593 case E_V2DFmode:
1594 return TARGET_SIMD ? VEC_ADVSIMD : 0;
1595
1596 default:
1597 return 0;
1598 }
1599 }
1600
1601 /* Return true if MODE is any of the data vector modes, including
1602 structure modes. */
1603 static bool
1604 aarch64_vector_data_mode_p (machine_mode mode)
1605 {
1606 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1607 }
1608
1609 /* Return true if MODE is an SVE data vector mode; either a single vector
1610 or a structure of vectors. */
1611 static bool
1612 aarch64_sve_data_mode_p (machine_mode mode)
1613 {
1614 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1615 }
1616
1617 /* Implement target hook TARGET_ARRAY_MODE. */
1618 static opt_machine_mode
1619 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1620 {
1621 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1622 && IN_RANGE (nelems, 2, 4))
1623 return mode_for_vector (GET_MODE_INNER (mode),
1624 GET_MODE_NUNITS (mode) * nelems);
1625
1626 return opt_machine_mode ();
1627 }
1628
1629 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1630 static bool
1631 aarch64_array_mode_supported_p (machine_mode mode,
1632 unsigned HOST_WIDE_INT nelems)
1633 {
1634 if (TARGET_SIMD
1635 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1636 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1637 && (nelems >= 2 && nelems <= 4))
1638 return true;
1639
1640 return false;
1641 }
1642
1643 /* Return the SVE predicate mode to use for elements that have
1644 ELEM_NBYTES bytes, if such a mode exists. */
1645
1646 opt_machine_mode
1647 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1648 {
1649 if (TARGET_SVE)
1650 {
1651 if (elem_nbytes == 1)
1652 return VNx16BImode;
1653 if (elem_nbytes == 2)
1654 return VNx8BImode;
1655 if (elem_nbytes == 4)
1656 return VNx4BImode;
1657 if (elem_nbytes == 8)
1658 return VNx2BImode;
1659 }
1660 return opt_machine_mode ();
1661 }
1662
1663 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1664
1665 static opt_machine_mode
1666 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1667 {
1668 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1669 {
1670 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1671 machine_mode pred_mode;
1672 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1673 return pred_mode;
1674 }
1675
1676 return default_get_mask_mode (nunits, nbytes);
1677 }
1678
1679 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1680
1681 static opt_machine_mode
1682 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
1683 {
1684 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
1685 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
1686 machine_mode mode;
1687 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1688 if (inner_mode == GET_MODE_INNER (mode)
1689 && known_eq (nunits, GET_MODE_NUNITS (mode))
1690 && aarch64_sve_data_mode_p (mode))
1691 return mode;
1692 return opt_machine_mode ();
1693 }
1694
1695 /* Return the integer element mode associated with SVE mode MODE. */
1696
1697 static scalar_int_mode
1698 aarch64_sve_element_int_mode (machine_mode mode)
1699 {
1700 unsigned int elt_bits = vector_element_size (BITS_PER_SVE_VECTOR,
1701 GET_MODE_NUNITS (mode));
1702 return int_mode_for_size (elt_bits, 0).require ();
1703 }
1704
1705 /* Return the integer vector mode associated with SVE mode MODE.
1706 Unlike mode_for_int_vector, this can handle the case in which
1707 MODE is a predicate (and thus has a different total size). */
1708
1709 static machine_mode
1710 aarch64_sve_int_mode (machine_mode mode)
1711 {
1712 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
1713 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
1714 }
1715
1716 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1717 prefer to use the first arithmetic operand as the else value if
1718 the else value doesn't matter, since that exactly matches the SVE
1719 destructive merging form. For ternary operations we could either
1720 pick the first operand and use FMAD-like instructions or the last
1721 operand and use FMLA-like instructions; the latter seems more
1722 natural. */
1723
1724 static tree
1725 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1726 {
1727 return nops == 3 ? ops[2] : ops[0];
1728 }
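/* As a concrete illustration of the comment above: for a conditional FMA with
   operands {a, b, c} (computing a * b + c), returning ops[2] makes the addend
   the else value, which matches the accumulator operand that an FMLA-style
   destructive instruction leaves untouched in inactive lanes, so no separate
   select is needed.  */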
1729
1730 /* Implement TARGET_HARD_REGNO_NREGS. */
1731
1732 static unsigned int
1733 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1734 {
1735 /* ??? Logically we should only need to provide a value when
1736 HARD_REGNO_MODE_OK says that the combination is valid,
1737 but at the moment we need to handle all modes. Just ignore
1738 any runtime parts for registers that can't store them. */
1739 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1740 switch (aarch64_regno_regclass (regno))
1741 {
1742 case FP_REGS:
1743 case FP_LO_REGS:
1744 case FP_LO8_REGS:
1745 if (aarch64_sve_data_mode_p (mode))
1746 return exact_div (GET_MODE_SIZE (mode),
1747 BYTES_PER_SVE_VECTOR).to_constant ();
1748 return CEIL (lowest_size, UNITS_PER_VREG);
1749 case PR_REGS:
1750 case PR_LO_REGS:
1751 case PR_HI_REGS:
1752 return 1;
1753 default:
1754 return CEIL (lowest_size, UNITS_PER_WORD);
1755 }
1756 gcc_unreachable ();
1757 }
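/* Small worked example (UNITS_PER_WORD is 8 and UNITS_PER_VREG is 16 on this
   target): a 16-byte V4SImode value needs one FP/SIMD register but two GP
   registers, and a single SVE data vector always counts as exactly one FP
   register regardless of -msve-vector-bits.  */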
1758
1759 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1760
1761 static bool
1762 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1763 {
1764 if (GET_MODE_CLASS (mode) == MODE_CC)
1765 return regno == CC_REGNUM;
1766
1767 if (regno == VG_REGNUM)
1768 /* This must have the same size as _Unwind_Word. */
1769 return mode == DImode;
1770
1771 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1772 if (vec_flags & VEC_SVE_PRED)
1773 return PR_REGNUM_P (regno);
1774
1775 if (PR_REGNUM_P (regno))
1776 return false;
1777
1778 if (regno == SP_REGNUM)
1779 /* The comparison with ptr_mode supports global register variables
1780 that are bound to the stack pointer register with the syntax
1781 register asm ("wsp") in ILP32. */
1782 return mode == Pmode || mode == ptr_mode;
1783
1784 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1785 return mode == Pmode;
1786
1787 if (GP_REGNUM_P (regno))
1788 {
1789 if (known_le (GET_MODE_SIZE (mode), 8))
1790 return true;
1791 else if (known_le (GET_MODE_SIZE (mode), 16))
1792 return (regno & 1) == 0;
1793 }
1794 else if (FP_REGNUM_P (regno))
1795 {
1796 if (vec_flags & VEC_STRUCT)
1797 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1798 else
1799 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1800 }
1801
1802 return false;
1803 }
1804
1805 /* Return true if this is a definition of a vectorized simd function. */
1806
1807 static bool
1808 aarch64_simd_decl_p (tree fndecl)
1809 {
1810 tree fntype;
1811
1812 if (fndecl == NULL)
1813 return false;
1814 fntype = TREE_TYPE (fndecl);
1815 if (fntype == NULL)
1816 return false;
1817
1818 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1819 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1820 return true;
1821
1822 return false;
1823 }
1824
1825 /* Return the mode a register save/restore should use. DImode for integer
1826 registers, DFmode for FP registers in non-SIMD functions (they only save
1827 the bottom half of a 128-bit register), or TFmode for FP registers in
1828 SIMD functions. */
1829
1830 static machine_mode
1831 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1832 {
1833 return GP_REGNUM_P (regno)
1834 ? E_DImode
1835 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1836 }
1837
1838 /* Return true if the instruction is a call to a SIMD function, false
1839 if it is not a SIMD function or if we do not know anything about
1840 the function. */
1841
1842 static bool
1843 aarch64_simd_call_p (rtx_insn *insn)
1844 {
1845 rtx symbol;
1846 rtx call;
1847 tree fndecl;
1848
1849 gcc_assert (CALL_P (insn));
1850 call = get_call_rtx_from (insn);
1851 symbol = XEXP (XEXP (call, 0), 0);
1852 if (GET_CODE (symbol) != SYMBOL_REF)
1853 return false;
1854 fndecl = SYMBOL_REF_DECL (symbol);
1855 if (!fndecl)
1856 return false;
1857
1858 return aarch64_simd_decl_p (fndecl);
1859 }
1860
1861 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1862 a function that uses the SIMD ABI, take advantage of the extra
1863 call-preserved registers that the ABI provides. */
1864
1865 void
1866 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1867 HARD_REG_SET *return_set)
1868 {
1869 if (aarch64_simd_call_p (insn))
1870 {
1871 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1872 if (FP_SIMD_SAVED_REGNUM_P (regno))
1873 CLEAR_HARD_REG_BIT (*return_set, regno);
1874 }
1875 }
1876
1877 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1878 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1879 clobbers the top 64 bits when restoring the bottom 64 bits. */
1880
1881 static bool
1882 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
1883 machine_mode mode)
1884 {
1885 bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
1886 return FP_REGNUM_P (regno)
1887 && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
1888 }
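
/* As a rough illustration: a V2DImode value (16 bytes) in an FP register
   is partially clobbered by a normal call (16 > 8) but not by a call to a
   SIMD function (16 > 16 is false).  An SVE data mode such as VNx4SImode
   may exceed 16 bytes at runtime, so it is treated as partially clobbered
   even across SIMD calls.  */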
1889
1890 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1891
1892 rtx_insn *
1893 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
1894 {
1895 gcc_assert (CALL_P (call_1) && CALL_P (call_2));
1896
1897 if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
1898 return call_1;
1899 else
1900 return call_2;
1901 }
1902
1903 /* Implement REGMODE_NATURAL_SIZE. */
1904 poly_uint64
1905 aarch64_regmode_natural_size (machine_mode mode)
1906 {
1907 /* The natural size for SVE data modes is one SVE data vector,
1908 and similarly for predicates. We can't independently modify
1909 anything smaller than that. */
1910 /* ??? For now, only do this for variable-width SVE registers.
1911 Doing it for constant-sized registers breaks lower-subreg.c. */
1912 /* ??? And once that's fixed, we should probably have similar
1913 code for Advanced SIMD. */
1914 if (!aarch64_sve_vg.is_constant ())
1915 {
1916 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1917 if (vec_flags & VEC_SVE_PRED)
1918 return BYTES_PER_SVE_PRED;
1919 if (vec_flags & VEC_SVE_DATA)
1920 return BYTES_PER_SVE_VECTOR;
1921 }
1922 return UNITS_PER_WORD;
1923 }
1924
1925 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1926 machine_mode
1927 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1928 machine_mode mode)
1929 {
1930 /* The predicate mode determines which bits are significant and
1931 which are "don't care". Decreasing the number of lanes would
1932 lose data while increasing the number of lanes would make bits
1933 unnecessarily significant. */
1934 if (PR_REGNUM_P (regno))
1935 return mode;
1936 if (known_ge (GET_MODE_SIZE (mode), 4))
1937 return mode;
1938 else
1939 return SImode;
1940 }
1941
1942 /* Return true if I's bits are consecutive ones from the MSB. */
1943 bool
1944 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1945 {
1946 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1947 }
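
/* For example: I == -4096 (0xff...f000) gives -I == 4096 == 1 << 12, so
   the function returns true; I == 6 gives -I == -6, for which exact_log2
   returns -1, so the function returns false.  I == 0 is also rejected
   because exact_log2 (0) == -1.  */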
1948
1949 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1950 that strcpy from constants will be faster. */
1951
1952 static HOST_WIDE_INT
1953 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1954 {
1955 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1956 return MAX (align, BITS_PER_WORD);
1957 return align;
1958 }
1959
1960 /* Return true if calls to DECL should be treated as
1961 long-calls (i.e. called via a register). */
1962 static bool
1963 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1964 {
1965 return false;
1966 }
1967
1968 /* Return true if calls to symbol-ref SYM should be treated as
1969 long-calls (i.e. called via a register). */
1970 bool
1971 aarch64_is_long_call_p (rtx sym)
1972 {
1973 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1974 }
1975
1976 /* Return true if calls to symbol-ref SYM should not go through
1977 plt stubs. */
1978
1979 bool
1980 aarch64_is_noplt_call_p (rtx sym)
1981 {
1982 const_tree decl = SYMBOL_REF_DECL (sym);
1983
1984 if (flag_pic
1985 && decl
1986 && (!flag_plt
1987 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1988 && !targetm.binds_local_p (decl))
1989 return true;
1990
1991 return false;
1992 }
1993
1994 /* Return true if the offsets to a zero/sign-extract operation
1995 represent an expression that matches an extend operation. The
1996 operands represent the parameters from
1997
1998 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1999 bool
2000 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
2001 rtx extract_imm)
2002 {
2003 HOST_WIDE_INT mult_val, extract_val;
2004
2005 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
2006 return false;
2007
2008 mult_val = INTVAL (mult_imm);
2009 extract_val = INTVAL (extract_imm);
2010
2011 if (extract_val > 8
2012 && extract_val < GET_MODE_BITSIZE (mode)
2013 && exact_log2 (extract_val & ~7) > 0
2014 && (extract_val & 7) <= 4
2015 && mult_val == (1 << (extract_val & 7)))
2016 return true;
2017
2018 return false;
2019 }
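
/* A worked example (illustrative only): for DImode, MULT_IMM == 4 and
   EXTRACT_IMM == 34 pass all of the checks above, since 34 & ~7 == 32 is
   a power of two, 34 & 7 == 2, and 4 == 1 << 2.  This corresponds to
   extending the low 32 bits of the register and shifting the result left
   by 2.  MULT_IMM == 8 with the same EXTRACT_IMM is rejected by the
   final check.  */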
2020
2021 /* Emit an insn that's a simple single-set. Both the operands must be
2022 known to be valid. */
2023 inline static rtx_insn *
2024 emit_set_insn (rtx x, rtx y)
2025 {
2026 return emit_insn (gen_rtx_SET (x, y));
2027 }
2028
2029 /* X and Y are two things to compare using CODE. Emit the compare insn and
2030 return the rtx for register 0 in the proper mode. */
2031 rtx
2032 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2033 {
2034 machine_mode mode = SELECT_CC_MODE (code, x, y);
2035 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
2036
2037 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
2038 return cc_reg;
2039 }
2040
2041 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2042
2043 static rtx
2044 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2045 machine_mode y_mode)
2046 {
2047 if (y_mode == E_QImode || y_mode == E_HImode)
2048 {
2049 if (CONST_INT_P (y))
2050 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2051 else
2052 {
2053 rtx t, cc_reg;
2054 machine_mode cc_mode;
2055
2056 t = gen_rtx_ZERO_EXTEND (SImode, y);
2057 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2058 cc_mode = CC_SWPmode;
2059 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2060 emit_set_insn (cc_reg, t);
2061 return cc_reg;
2062 }
2063 }
2064
2065 return aarch64_gen_compare_reg (code, x, y);
2066 }
2067
2068 /* Build the SYMBOL_REF for __tls_get_addr. */
2069
2070 static GTY(()) rtx tls_get_addr_libfunc;
2071
2072 rtx
2073 aarch64_tls_get_addr (void)
2074 {
2075 if (!tls_get_addr_libfunc)
2076 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2077 return tls_get_addr_libfunc;
2078 }
2079
2080 /* Return the TLS model to use for ADDR. */
2081
2082 static enum tls_model
2083 tls_symbolic_operand_type (rtx addr)
2084 {
2085 enum tls_model tls_kind = TLS_MODEL_NONE;
2086 if (GET_CODE (addr) == CONST)
2087 {
2088 poly_int64 addend;
2089 rtx sym = strip_offset (addr, &addend);
2090 if (GET_CODE (sym) == SYMBOL_REF)
2091 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2092 }
2093 else if (GET_CODE (addr) == SYMBOL_REF)
2094 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2095
2096 return tls_kind;
2097 }
2098
2099 /* We allow lo_sum's in our legitimate addresses so that combine
2100 can take care of combining addresses where necessary, but for
2101 generation purposes we generate the address
2102 as:
2103 RTL Absolute
2104 tmp = hi (symbol_ref); adrp x1, foo
2105 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2106 nop
2107
2108 PIC TLS
2109 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2110 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2111 bl __tls_get_addr
2112 nop
2113
2114 Load TLS symbol, depending on TLS mechanism and TLS access model.
2115
2116 Global Dynamic - Traditional TLS:
2117 adrp tmp, :tlsgd:imm
2118 add dest, tmp, #:tlsgd_lo12:imm
2119 bl __tls_get_addr
2120
2121 Global Dynamic - TLS Descriptors:
2122 adrp dest, :tlsdesc:imm
2123 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2124 add dest, dest, #:tlsdesc_lo12:imm
2125 blr tmp
2126 mrs tp, tpidr_el0
2127 add dest, dest, tp
2128
2129 Initial Exec:
2130 mrs tp, tpidr_el0
2131 adrp tmp, :gottprel:imm
2132 ldr dest, [tmp, #:gottprel_lo12:imm]
2133 add dest, dest, tp
2134
2135 Local Exec:
2136 mrs tp, tpidr_el0
2137 add t0, tp, #:tprel_hi12:imm, lsl #12
2138 add t0, t0, #:tprel_lo12_nc:imm
2139 */
2140
2141 static void
2142 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2143 enum aarch64_symbol_type type)
2144 {
2145 switch (type)
2146 {
2147 case SYMBOL_SMALL_ABSOLUTE:
2148 {
2149 /* In ILP32, the mode of dest can be either SImode or DImode. */
2150 rtx tmp_reg = dest;
2151 machine_mode mode = GET_MODE (dest);
2152
2153 gcc_assert (mode == Pmode || mode == ptr_mode);
2154
2155 if (can_create_pseudo_p ())
2156 tmp_reg = gen_reg_rtx (mode);
2157
2158 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2159 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2160 return;
2161 }
2162
2163 case SYMBOL_TINY_ABSOLUTE:
2164 emit_insn (gen_rtx_SET (dest, imm));
2165 return;
2166
2167 case SYMBOL_SMALL_GOT_28K:
2168 {
2169 machine_mode mode = GET_MODE (dest);
2170 rtx gp_rtx = pic_offset_table_rtx;
2171 rtx insn;
2172 rtx mem;
2173
2174 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2175 here before RTL expansion. Tree IVOPTs generates RTL patterns to
2176 estimate rtx costs, in which case pic_offset_table_rtx is not
2177 initialized. In that case there is no need to generate the first
2178 adrp instruction, since the final cost of a global variable access
2179 is one instruction.
2180 if (gp_rtx != NULL)
2181 {
2182 /* -fpic for -mcmodel=small allows a 32K GOT table size (but because
2183 we use the page base as the GOT base, the first page may be wasted;
2184 in the worst case there is only 28K of space for the GOT).
2185
2186 The instruction sequence generated to access a global variable
2187 is:
2188
2189 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2190
2191 Only one instruction is needed, but pic_offset_table_rtx must be
2192 initialized properly. We generate an initialization insn for
2193 every global access and rely on CSE to remove the redundant ones.
2194
2195 The final instruction sequence for multiple global variable
2196 accesses will look like:
2197
2198 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2199
2200 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2201 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2202 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2203 ... */
2204
2205 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2206 crtl->uses_pic_offset_table = 1;
2207 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2208
2209 if (mode != GET_MODE (gp_rtx))
2210 gp_rtx = gen_lowpart (mode, gp_rtx);
2211
2212 }
2213
2214 if (mode == ptr_mode)
2215 {
2216 if (mode == DImode)
2217 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2218 else
2219 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2220
2221 mem = XVECEXP (SET_SRC (insn), 0, 0);
2222 }
2223 else
2224 {
2225 gcc_assert (mode == Pmode);
2226
2227 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2228 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2229 }
2230
2231 /* The operand is expected to be a MEM. Whenever the related insn
2232 pattern changes, the code above that computes MEM should be
2233 updated. */
2234 gcc_assert (GET_CODE (mem) == MEM);
2235 MEM_READONLY_P (mem) = 1;
2236 MEM_NOTRAP_P (mem) = 1;
2237 emit_insn (insn);
2238 return;
2239 }
2240
2241 case SYMBOL_SMALL_GOT_4G:
2242 {
2243 /* In ILP32, the mode of dest can be either SImode or DImode,
2244 while the got entry is always of SImode size. The mode of
2245 dest depends on how dest is used: if dest is assigned to a
2246 pointer (e.g. in the memory), it has SImode; it may have
2247 DImode if dest is dereferenced to access the memory.
2248 This is why we have to handle three different ldr_got_small
2249 patterns here (two patterns for ILP32). */
2250
2251 rtx insn;
2252 rtx mem;
2253 rtx tmp_reg = dest;
2254 machine_mode mode = GET_MODE (dest);
2255
2256 if (can_create_pseudo_p ())
2257 tmp_reg = gen_reg_rtx (mode);
2258
2259 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2260 if (mode == ptr_mode)
2261 {
2262 if (mode == DImode)
2263 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2264 else
2265 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2266
2267 mem = XVECEXP (SET_SRC (insn), 0, 0);
2268 }
2269 else
2270 {
2271 gcc_assert (mode == Pmode);
2272
2273 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2274 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2275 }
2276
2277 gcc_assert (GET_CODE (mem) == MEM);
2278 MEM_READONLY_P (mem) = 1;
2279 MEM_NOTRAP_P (mem) = 1;
2280 emit_insn (insn);
2281 return;
2282 }
2283
2284 case SYMBOL_SMALL_TLSGD:
2285 {
2286 rtx_insn *insns;
2287 machine_mode mode = GET_MODE (dest);
2288 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2289
2290 start_sequence ();
2291 if (TARGET_ILP32)
2292 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2293 else
2294 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2295 insns = get_insns ();
2296 end_sequence ();
2297
2298 RTL_CONST_CALL_P (insns) = 1;
2299 emit_libcall_block (insns, dest, result, imm);
2300 return;
2301 }
2302
2303 case SYMBOL_SMALL_TLSDESC:
2304 {
2305 machine_mode mode = GET_MODE (dest);
2306 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2307 rtx tp;
2308
2309 gcc_assert (mode == Pmode || mode == ptr_mode);
2310
2311 /* In ILP32, the got entry is always of SImode size. Unlike
2312 small GOT, the dest is fixed at reg 0. */
2313 if (TARGET_ILP32)
2314 emit_insn (gen_tlsdesc_small_si (imm));
2315 else
2316 emit_insn (gen_tlsdesc_small_di (imm));
2317 tp = aarch64_load_tp (NULL);
2318
2319 if (mode != Pmode)
2320 tp = gen_lowpart (mode, tp);
2321
2322 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2323 if (REG_P (dest))
2324 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2325 return;
2326 }
2327
2328 case SYMBOL_SMALL_TLSIE:
2329 {
2330 /* In ILP32, the mode of dest can be either SImode or DImode,
2331 while the got entry is always of SImode size. The mode of
2332 dest depends on how dest is used: if dest is assigned to a
2333 pointer (e.g. in the memory), it has SImode; it may have
2334 DImode if dest is dereferenced to access the memory.
2335 This is why we have to handle three different tlsie_small
2336 patterns here (two patterns for ILP32). */
2337 machine_mode mode = GET_MODE (dest);
2338 rtx tmp_reg = gen_reg_rtx (mode);
2339 rtx tp = aarch64_load_tp (NULL);
2340
2341 if (mode == ptr_mode)
2342 {
2343 if (mode == DImode)
2344 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2345 else
2346 {
2347 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2348 tp = gen_lowpart (mode, tp);
2349 }
2350 }
2351 else
2352 {
2353 gcc_assert (mode == Pmode);
2354 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2355 }
2356
2357 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2358 if (REG_P (dest))
2359 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2360 return;
2361 }
2362
2363 case SYMBOL_TLSLE12:
2364 case SYMBOL_TLSLE24:
2365 case SYMBOL_TLSLE32:
2366 case SYMBOL_TLSLE48:
2367 {
2368 machine_mode mode = GET_MODE (dest);
2369 rtx tp = aarch64_load_tp (NULL);
2370
2371 if (mode != Pmode)
2372 tp = gen_lowpart (mode, tp);
2373
2374 switch (type)
2375 {
2376 case SYMBOL_TLSLE12:
2377 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2378 (dest, tp, imm));
2379 break;
2380 case SYMBOL_TLSLE24:
2381 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2382 (dest, tp, imm));
2383 break;
2384 case SYMBOL_TLSLE32:
2385 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2386 (dest, imm));
2387 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2388 (dest, dest, tp));
2389 break;
2390 case SYMBOL_TLSLE48:
2391 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2392 (dest, imm));
2393 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2394 (dest, dest, tp));
2395 break;
2396 default:
2397 gcc_unreachable ();
2398 }
2399
2400 if (REG_P (dest))
2401 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2402 return;
2403 }
2404
2405 case SYMBOL_TINY_GOT:
2406 emit_insn (gen_ldr_got_tiny (dest, imm));
2407 return;
2408
2409 case SYMBOL_TINY_TLSIE:
2410 {
2411 machine_mode mode = GET_MODE (dest);
2412 rtx tp = aarch64_load_tp (NULL);
2413
2414 if (mode == ptr_mode)
2415 {
2416 if (mode == DImode)
2417 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2418 else
2419 {
2420 tp = gen_lowpart (mode, tp);
2421 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2422 }
2423 }
2424 else
2425 {
2426 gcc_assert (mode == Pmode);
2427 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2428 }
2429
2430 if (REG_P (dest))
2431 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2432 return;
2433 }
2434
2435 default:
2436 gcc_unreachable ();
2437 }
2438 }
2439
2440 /* Emit a move from SRC to DEST. Assume that the move expanders can
2441 handle all moves if !can_create_pseudo_p (). The distinction is
2442 important because, unlike emit_move_insn, the move expanders know
2443 how to force Pmode objects into the constant pool even when the
2444 constant pool address is not itself legitimate. */
2445 static rtx
2446 aarch64_emit_move (rtx dest, rtx src)
2447 {
2448 return (can_create_pseudo_p ()
2449 ? emit_move_insn (dest, src)
2450 : emit_move_insn_1 (dest, src));
2451 }
2452
2453 /* Apply UNOPTAB to OP and store the result in DEST. */
2454
2455 static void
2456 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2457 {
2458 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2459 if (dest != tmp)
2460 emit_move_insn (dest, tmp);
2461 }
2462
2463 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2464
2465 static void
2466 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2467 {
2468 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2469 OPTAB_DIRECT);
2470 if (dest != tmp)
2471 emit_move_insn (dest, tmp);
2472 }
2473
2474 /* Split a 128-bit move operation into two 64-bit move operations,
2475 taking care to handle partial overlap of register to register
2476 copies. Special cases are needed when moving between GP regs and
2477 FP regs. SRC can be a register, constant or memory; DST a register
2478 or memory. If either operand is memory it must not have any side
2479 effects. */
2480 void
2481 aarch64_split_128bit_move (rtx dst, rtx src)
2482 {
2483 rtx dst_lo, dst_hi;
2484 rtx src_lo, src_hi;
2485
2486 machine_mode mode = GET_MODE (dst);
2487
2488 gcc_assert (mode == TImode || mode == TFmode);
2489 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2490 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2491
2492 if (REG_P (dst) && REG_P (src))
2493 {
2494 int src_regno = REGNO (src);
2495 int dst_regno = REGNO (dst);
2496
2497 /* Handle FP <-> GP regs. */
2498 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2499 {
2500 src_lo = gen_lowpart (word_mode, src);
2501 src_hi = gen_highpart (word_mode, src);
2502
2503 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2504 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2505 return;
2506 }
2507 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2508 {
2509 dst_lo = gen_lowpart (word_mode, dst);
2510 dst_hi = gen_highpart (word_mode, dst);
2511
2512 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2513 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2514 return;
2515 }
2516 }
2517
2518 dst_lo = gen_lowpart (word_mode, dst);
2519 dst_hi = gen_highpart (word_mode, dst);
2520 src_lo = gen_lowpart (word_mode, src);
2521 src_hi = gen_highpart_mode (word_mode, mode, src);
2522
2523 /* At most one pairing may overlap. */
2524 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2525 {
2526 aarch64_emit_move (dst_hi, src_hi);
2527 aarch64_emit_move (dst_lo, src_lo);
2528 }
2529 else
2530 {
2531 aarch64_emit_move (dst_lo, src_lo);
2532 aarch64_emit_move (dst_hi, src_hi);
2533 }
2534 }
2535
2536 bool
2537 aarch64_split_128bit_move_p (rtx dst, rtx src)
2538 {
2539 return (! REG_P (src)
2540 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2541 }
2542
2543 /* Split a complex SIMD combine. */
2544
2545 void
2546 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2547 {
2548 machine_mode src_mode = GET_MODE (src1);
2549 machine_mode dst_mode = GET_MODE (dst);
2550
2551 gcc_assert (VECTOR_MODE_P (dst_mode));
2552 gcc_assert (register_operand (dst, dst_mode)
2553 && register_operand (src1, src_mode)
2554 && register_operand (src2, src_mode));
2555
2556 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2557 return;
2558 }
2559
2560 /* Split a complex SIMD move. */
2561
2562 void
2563 aarch64_split_simd_move (rtx dst, rtx src)
2564 {
2565 machine_mode src_mode = GET_MODE (src);
2566 machine_mode dst_mode = GET_MODE (dst);
2567
2568 gcc_assert (VECTOR_MODE_P (dst_mode));
2569
2570 if (REG_P (dst) && REG_P (src))
2571 {
2572 gcc_assert (VECTOR_MODE_P (src_mode));
2573 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2574 }
2575 }
2576
2577 bool
2578 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2579 machine_mode ymode, rtx y)
2580 {
2581 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2582 gcc_assert (r != NULL);
2583 return rtx_equal_p (x, r);
2584 }
2585
2586
2587 /* Return TARGET if it is nonnull and a register of mode MODE.
2588 Otherwise, return a fresh register of mode MODE if we can,
2589 or TARGET reinterpreted as MODE if we can't. */
2590
2591 static rtx
2592 aarch64_target_reg (rtx target, machine_mode mode)
2593 {
2594 if (target && REG_P (target) && GET_MODE (target) == mode)
2595 return target;
2596 if (!can_create_pseudo_p ())
2597 {
2598 gcc_assert (target);
2599 return gen_lowpart (mode, target);
2600 }
2601 return gen_reg_rtx (mode);
2602 }
2603
2604 /* Return a register that contains the constant in BUILDER, given that
2605 the constant is a legitimate move operand. Use TARGET as the register
2606 if it is nonnull and convenient. */
2607
2608 static rtx
2609 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
2610 {
2611 rtx src = builder.build ();
2612 target = aarch64_target_reg (target, GET_MODE (src));
2613 emit_insn (gen_rtx_SET (target, src));
2614 return target;
2615 }
2616
2617 static rtx
2618 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2619 {
2620 if (can_create_pseudo_p ())
2621 return force_reg (mode, value);
2622 else
2623 {
2624 gcc_assert (x);
2625 aarch64_emit_move (x, value);
2626 return x;
2627 }
2628 }
2629
2630 /* Return true if predicate value X is a constant in which every element
2631 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2632 value, i.e. as a predicate in which all bits are significant. */
2633
2634 static bool
2635 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
2636 {
2637 if (GET_CODE (x) != CONST_VECTOR)
2638 return false;
2639
2640 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
2641 GET_MODE_NUNITS (GET_MODE (x)));
2642 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
2643 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
2644 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
2645
2646 unsigned int nelts = const_vector_encoded_nelts (x);
2647 for (unsigned int i = 0; i < nelts; ++i)
2648 {
2649 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
2650 if (!CONST_INT_P (elt))
2651 return false;
2652
2653 builder.quick_push (elt);
2654 for (unsigned int j = 1; j < factor; ++j)
2655 builder.quick_push (const0_rtx);
2656 }
2657 builder.finalize ();
2658 return true;
2659 }
2660
2661 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2662 widest predicate element size it can have (that is, the largest size
2663 for which each element would still be 0 or 1). */
2664
2665 unsigned int
2666 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
2667 {
2668 /* Start with the most optimistic assumption: that we only need
2669 one bit per pattern. This is what we will use if only the first
2670 bit in each pattern is ever set. */
2671 unsigned int mask = GET_MODE_SIZE (DImode);
2672 mask |= builder.npatterns ();
2673
2674 /* Look for set bits. */
2675 unsigned int nelts = builder.encoded_nelts ();
2676 for (unsigned int i = 1; i < nelts; ++i)
2677 if (INTVAL (builder.elt (i)) != 0)
2678 {
2679 if (i & 1)
2680 return 1;
2681 mask |= i;
2682 }
2683 return mask & -mask;
2684 }
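
/* A worked example (illustrative only): for a VNx16BI constant with four
   patterns and one element per pattern whose encoded elements are
   1, 0, 0, 0, the initial mask is 8 | 4 == 12 and no further bits are
   set in the loop, so the result is 12 & -12 == 4: the predicate can be
   treated as having 4-byte elements.  If the element at index 2 were
   also set, the mask would become 14 and the result would be 2.  */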
2685
2686 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
2687 that the constant would have with predicate element size ELT_SIZE
2688 (ignoring the upper bits in each element) and return:
2689
2690 * -1 if all bits are set
2691 * N if the predicate has N leading set bits followed by all clear bits
2692 * 0 if the predicate does not have any of these forms. */
2693
2694 int
2695 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
2696 unsigned int elt_size)
2697 {
2698 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
2699 followed by set bits. */
2700 if (builder.nelts_per_pattern () == 3)
2701 return 0;
2702
2703 /* Skip over leading set bits. */
2704 unsigned int nelts = builder.encoded_nelts ();
2705 unsigned int i = 0;
2706 for (; i < nelts; i += elt_size)
2707 if (INTVAL (builder.elt (i)) == 0)
2708 break;
2709 unsigned int vl = i / elt_size;
2710
2711 /* Check for the all-true case. */
2712 if (i == nelts)
2713 return -1;
2714
2715 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
2716 repeating pattern of set bits followed by clear bits. */
2717 if (builder.nelts_per_pattern () != 2)
2718 return 0;
2719
2720 /* We have a "foreground" value and a duplicated "background" value.
2721 If the background might repeat and the last set bit belongs to it,
2722 we might have set bits followed by clear bits followed by set bits. */
2723 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
2724 return 0;
2725
2726 /* Make sure that the rest are all clear. */
2727 for (; i < nelts; i += elt_size)
2728 if (INTVAL (builder.elt (i)) != 0)
2729 return 0;
2730
2731 return vl;
2732 }
2733
2734 /* See if there is an svpattern that encodes an SVE predicate of mode
2735 PRED_MODE in which the first VL bits are set and the rest are clear.
2736 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
2737 A VL of -1 indicates an all-true vector. */
2738
2739 aarch64_svpattern
2740 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
2741 {
2742 if (vl < 0)
2743 return AARCH64_SV_ALL;
2744
2745 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
2746 return AARCH64_NUM_SVPATTERNS;
2747
2748 if (vl >= 1 && vl <= 8)
2749 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
2750
2751 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
2752 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
2753
2754 int max_vl;
2755 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
2756 {
2757 if (vl == (max_vl / 3) * 3)
2758 return AARCH64_SV_MUL3;
2759 /* These would only trigger for non-power-of-2 lengths. */
2760 if (vl == (max_vl & -4))
2761 return AARCH64_SV_MUL4;
2762 if (vl == (1 << floor_log2 (max_vl)))
2763 return AARCH64_SV_POW2;
2764 if (vl == max_vl)
2765 return AARCH64_SV_ALL;
2766 }
2767 return AARCH64_NUM_SVPATTERNS;
2768 }
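
/* Illustrative examples: VL == -1 maps to AARCH64_SV_ALL, VL == 3 to
   AARCH64_SV_VL3, and VL == 32 to AARCH64_SV_VL32 (a power of two in
   [16, 256]).  For a variable-length PRED_MODE, VL == 9 has no
   single-pattern encoding and the function returns
   AARCH64_NUM_SVPATTERNS.  */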
2769
2770 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
2771 bits has the lowest bit set and the upper bits clear. This is the
2772 VNx16BImode equivalent of a PTRUE for controlling elements of
2773 ELT_SIZE bytes. However, because the constant is VNx16BImode,
2774 all bits are significant, even the upper zeros. */
2775
2776 rtx
2777 aarch64_ptrue_all (unsigned int elt_size)
2778 {
2779 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
2780 builder.quick_push (const1_rtx);
2781 for (unsigned int i = 1; i < elt_size; ++i)
2782 builder.quick_push (const0_rtx);
2783 return builder.build ();
2784 }
2785
2786 /* Return an all-true predicate register of mode MODE. */
2787
2788 rtx
2789 aarch64_ptrue_reg (machine_mode mode)
2790 {
2791 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2792 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
2793 return gen_lowpart (mode, reg);
2794 }
2795
2796 /* Return an all-false predicate register of mode MODE. */
2797
2798 rtx
2799 aarch64_pfalse_reg (machine_mode mode)
2800 {
2801 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2802 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
2803 return gen_lowpart (mode, reg);
2804 }
2805
2806 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
2807 true, or alternatively if we know that the operation predicated by
2808 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is an
2809 aarch64_sve_gp_strictness operand that describes the operation
2810 predicated by PRED1[0]. */
2811
2812 bool
2813 aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
2814 {
2815 machine_mode mode = GET_MODE (pred2);
2816 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2817 && mode == GET_MODE (pred1[0])
2818 && aarch64_sve_gp_strictness (pred1[1], SImode));
2819 return (pred1[0] == CONSTM1_RTX (mode)
2820 || INTVAL (pred1[1]) == SVE_RELAXED_GP
2821 || rtx_equal_p (pred1[0], pred2));
2822 }
2823
2824 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
2825 for it. PRED2[0] is the predicate for the instruction whose result
2826 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
2827 for it. Return true if we can prove that the two predicates are
2828 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
2829 with PRED1[0] without changing behavior. */
2830
2831 bool
2832 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
2833 {
2834 machine_mode mode = GET_MODE (pred1[0]);
2835 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2836 && mode == GET_MODE (pred2[0])
2837 && aarch64_sve_ptrue_flag (pred1[1], SImode)
2838 && aarch64_sve_ptrue_flag (pred2[1], SImode));
2839
2840 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
2841 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
2842 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
2843 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
2844 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
2845 }
2846
2847 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
2848 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
2849 Use TARGET as the target register if nonnull and convenient. */
2850
2851 static rtx
2852 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
2853 machine_mode data_mode, rtx op1, rtx op2)
2854 {
2855 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
2856 expand_operand ops[5];
2857 create_output_operand (&ops[0], target, pred_mode);
2858 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
2859 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
2860 create_input_operand (&ops[3], op1, data_mode);
2861 create_input_operand (&ops[4], op2, data_mode);
2862 expand_insn (icode, 5, ops);
2863 return ops[0].value;
2864 }
2865
2866 /* Use a comparison to convert integer vector SRC into MODE, which is
2867 the corresponding SVE predicate mode. Use TARGET for the result
2868 if it's nonnull and convenient. */
2869
2870 static rtx
2871 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
2872 {
2873 machine_mode src_mode = GET_MODE (src);
2874 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
2875 src, CONST0_RTX (src_mode));
2876 }
2877
2878 /* Return true if we can move VALUE into a register using a single
2879 CNT[BHWD] instruction. */
2880
2881 static bool
2882 aarch64_sve_cnt_immediate_p (poly_int64 value)
2883 {
2884 HOST_WIDE_INT factor = value.coeffs[0];
2885 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2886 return (value.coeffs[1] == factor
2887 && IN_RANGE (factor, 2, 16 * 16)
2888 && (factor & 1) == 0
2889 && factor <= 16 * (factor & -factor));
2890 }
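
/* Worked examples (illustrative only): the poly_int64 (2, 2) is a single
   CNTD; (32, 32) can be loaded with "cntb ..., all, mul #2"; (34, 34)
   fails the final check because 34 > 16 * (34 & -34) == 32; and (3, 3)
   is rejected because the factor is odd.  */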
2891
2892 /* Likewise for rtx X. */
2893
2894 bool
2895 aarch64_sve_cnt_immediate_p (rtx x)
2896 {
2897 poly_int64 value;
2898 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2899 }
2900
2901 /* Return the asm string for an instruction with a CNT-like vector size
2902 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2903 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2904 first part of the operands template (the part that comes before the
2905 vector size itself). FACTOR is the number of quadwords.
2906 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2907 If it is zero, we can use any element size. */
2908
2909 static char *
2910 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2911 unsigned int factor,
2912 unsigned int nelts_per_vq)
2913 {
2914 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2915
2916 if (nelts_per_vq == 0)
2917 /* There is some overlap in the ranges of the four CNT instructions.
2918 Here we always use the smallest possible element size, so that the
2919 multiplier is 1 wherever possible. */
2920 nelts_per_vq = factor & -factor;
2921 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2922 gcc_assert (IN_RANGE (shift, 1, 4));
2923 char suffix = "dwhb"[shift - 1];
2924
2925 factor >>= shift;
2926 unsigned int written;
2927 if (factor == 1)
2928 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2929 prefix, suffix, operands);
2930 else
2931 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2932 prefix, suffix, operands, factor);
2933 gcc_assert (written < sizeof (buffer));
2934 return buffer;
2935 }
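
/* Worked examples (illustrative only): with PREFIX "cnt" and OPERANDS
   "%x0", FACTOR == 2 and NELTS_PER_VQ == 0 select the D suffix and a
   multiplier of 1, giving "cntd\t%x0"; FACTOR == 32 selects the B suffix
   and shifts the factor down to 2, giving "cntb\t%x0, all, mul #2".  */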
2936
2937 /* Return the asm string for an instruction with a CNT-like vector size
2938 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2939 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2940 first part of the operands template (the part that comes before the
2941 vector size itself). X is the value of the vector size operand,
2942 as a polynomial integer rtx. */
2943
2944 char *
2945 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2946 rtx x)
2947 {
2948 poly_int64 value = rtx_to_poly_int64 (x);
2949 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2950 return aarch64_output_sve_cnt_immediate (prefix, operands,
2951 value.coeffs[1], 0);
2952 }
2953
2954 /* Return true if we can add X using a single SVE INC or DEC instruction. */
2955
2956 bool
2957 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
2958 {
2959 poly_int64 value;
2960 return (poly_int_rtx_p (x, &value)
2961 && (aarch64_sve_cnt_immediate_p (value)
2962 || aarch64_sve_cnt_immediate_p (-value)));
2963 }
2964
2965 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
2966 operand 0. */
2967
2968 char *
2969 aarch64_output_sve_scalar_inc_dec (rtx offset)
2970 {
2971 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2972 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
2973 if (offset_value.coeffs[1] > 0)
2974 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2975 offset_value.coeffs[1], 0);
2976 else
2977 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2978 -offset_value.coeffs[1], 0);
2979 }
2980
2981 /* Return true if we can add VALUE to a register using a single ADDVL
2982 or ADDPL instruction. */
2983
2984 static bool
2985 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2986 {
2987 HOST_WIDE_INT factor = value.coeffs[0];
2988 if (factor == 0 || value.coeffs[1] != factor)
2989 return false;
2990 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2991 and a value of 16 is one vector width. */
2992 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2993 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2994 }
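
/* In concrete terms (illustrative only): ADDVL covers multiples of 16
   from -32 * 16 to 31 * 16 and ADDPL covers multiples of 2 from -32 * 2
   to 31 * 2.  So (32, 32) is valid (ADDVL #2), (6, 6) is valid
   (ADDPL #3), but (3, 3) is odd and (1600, 1600) is out of range for
   both forms.  */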
2995
2996 /* Likewise for rtx X. */
2997
2998 bool
2999 aarch64_sve_addvl_addpl_immediate_p (rtx x)
3000 {
3001 poly_int64 value;
3002 return (poly_int_rtx_p (x, &value)
3003 && aarch64_sve_addvl_addpl_immediate_p (value));
3004 }
3005
3006 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3007 to operand 1 and storing the result in operand 0. */
3008
3009 char *
3010 aarch64_output_sve_addvl_addpl (rtx offset)
3011 {
3012 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3013 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3014 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
3015
3016 int factor = offset_value.coeffs[1];
3017 if ((factor & 15) == 0)
3018 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
3019 else
3020 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
3021 return buffer;
3022 }
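
/* For example (illustrative only): an OFFSET of (32, 32) produces
   "addvl\t%x0, %x1, #2", while (6, 6) produces "addpl\t%x0, %x1, #3".  */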
3023
3024 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3025 instruction. If it is, store the number of elements in each vector
3026 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3027 factor in *FACTOR_OUT (if nonnull). */
3028
3029 bool
3030 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
3031 unsigned int *nelts_per_vq_out)
3032 {
3033 rtx elt;
3034 poly_int64 value;
3035
3036 if (!const_vec_duplicate_p (x, &elt)
3037 || !poly_int_rtx_p (elt, &value))
3038 return false;
3039
3040 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
3041 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
3042 /* There's no vector INCB. */
3043 return false;
3044
3045 HOST_WIDE_INT factor = value.coeffs[0];
3046 if (value.coeffs[1] != factor)
3047 return false;
3048
3049 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3050 if ((factor % nelts_per_vq) != 0
3051 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
3052 return false;
3053
3054 if (factor_out)
3055 *factor_out = factor;
3056 if (nelts_per_vq_out)
3057 *nelts_per_vq_out = nelts_per_vq;
3058 return true;
3059 }
3060
3061 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3062 instruction. */
3063
3064 bool
3065 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
3066 {
3067 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
3068 }
3069
3070 /* Return the asm template for an SVE vector INC or DEC instruction.
3071 OPERANDS gives the operands before the vector count and X is the
3072 value of the vector count operand itself. */
3073
3074 char *
3075 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
3076 {
3077 int factor;
3078 unsigned int nelts_per_vq;
3079 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3080 gcc_unreachable ();
3081 if (factor < 0)
3082 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
3083 nelts_per_vq);
3084 else
3085 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
3086 nelts_per_vq);
3087 }
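
/* A worked example (illustrative only): for a VNx4SI vector,
   NELTS_PER_VQ is 4, so a duplicated element value of (8, 8) passes the
   immediate check with FACTOR == 8 and the output routine prints
   "incw\t<operands>, all, mul #2".  A duplicated value of (-4, -4)
   would instead print "decw\t<operands>".  */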
3088
3089 static int
3090 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
3091 scalar_int_mode mode)
3092 {
3093 int i;
3094 unsigned HOST_WIDE_INT val, val2, mask;
3095 int one_match, zero_match;
3096 int num_insns;
3097
3098 val = INTVAL (imm);
3099
3100 if (aarch64_move_imm (val, mode))
3101 {
3102 if (generate)
3103 emit_insn (gen_rtx_SET (dest, imm));
3104 return 1;
3105 }
3106
3107 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3108 (with XXXX non-zero). In that case check to see if the move can be done in
3109 a smaller mode. */
3110 val2 = val & 0xffffffff;
3111 if (mode == DImode
3112 && aarch64_move_imm (val2, SImode)
3113 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
3114 {
3115 if (generate)
3116 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3117
3118 /* Check whether we have to emit a second instruction by testing
3119 whether any of the upper 32 bits of the original DImode value are set. */
3120 if (val == val2)
3121 return 1;
3122
3123 i = (val >> 48) ? 48 : 32;
3124
3125 if (generate)
3126 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3127 GEN_INT ((val >> i) & 0xffff)));
3128
3129 return 2;
3130 }
3131
3132 if ((val >> 32) == 0 || mode == SImode)
3133 {
3134 if (generate)
3135 {
3136 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
3137 if (mode == SImode)
3138 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
3139 GEN_INT ((val >> 16) & 0xffff)));
3140 else
3141 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
3142 GEN_INT ((val >> 16) & 0xffff)));
3143 }
3144 return 2;
3145 }
3146
3147 /* Remaining cases are all for DImode. */
3148
3149 mask = 0xffff;
3150 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
3151 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
3152 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
3153 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
3154
3155 if (zero_match != 2 && one_match != 2)
3156 {
3157 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3158 For a 64-bit bitmask try whether changing 16 bits to all ones or
3159 zeroes creates a valid bitmask. To check any repeated bitmask,
3160 try using 16 bits from the other 32-bit half of val. */
3161
3162 for (i = 0; i < 64; i += 16, mask <<= 16)
3163 {
3164 val2 = val & ~mask;
3165 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3166 break;
3167 val2 = val | mask;
3168 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3169 break;
3170 val2 = val2 & ~mask;
3171 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
3172 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3173 break;
3174 }
3175 if (i != 64)
3176 {
3177 if (generate)
3178 {
3179 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3180 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3181 GEN_INT ((val >> i) & 0xffff)));
3182 }
3183 return 2;
3184 }
3185 }
3186
3187 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3188 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3189 otherwise skip zero bits. */
3190
3191 num_insns = 1;
3192 mask = 0xffff;
3193 val2 = one_match > zero_match ? ~val : val;
3194 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
3195
3196 if (generate)
3197 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
3198 ? (val | ~(mask << i))
3199 : (val & (mask << i)))));
3200 for (i += 16; i < 64; i += 16)
3201 {
3202 if ((val2 & (mask << i)) == 0)
3203 continue;
3204 if (generate)
3205 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3206 GEN_INT ((val >> i) & 0xffff)));
3207 num_insns ++;
3208 }
3209
3210 return num_insns;
3211 }
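
/* A worked example (illustrative only): the DImode constant
   0x1234567800000000 is not a single MOV immediate and the 32-bit
   shortcut does not apply, but it has two zero halfwords
   (zero_match == 2), so the final loop above emits roughly:

	movz	x0, #0x5678, lsl #32
	movk	x0, #0x1234, lsl #48

   and the function returns 2.  */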
3212
3213 /* Return whether imm is a 128-bit immediate which is simple enough to
3214 expand inline. */
3215 bool
3216 aarch64_mov128_immediate (rtx imm)
3217 {
3218 if (GET_CODE (imm) == CONST_INT)
3219 return true;
3220
3221 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
3222
3223 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
3224 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
3225
3226 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
3227 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
3228 }
3229
3230
3231 /* Return the number of temporary registers that aarch64_add_offset_1
3232 would need to add OFFSET to a register. */
3233
3234 static unsigned int
3235 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
3236 {
3237 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
3238 }
3239
3240 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3241 a non-polynomial OFFSET. MODE is the mode of the addition.
3242 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3243 be set and CFA adjustments added to the generated instructions.
3244
3245 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3246 temporary if register allocation is already complete. This temporary
3247 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3248 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3249 the immediate again.
3250
3251 Since this function may be used to adjust the stack pointer, we must
3252 ensure that it cannot cause transient stack deallocation (for example
3253 by first incrementing SP and then decrementing when adjusting by a
3254 large immediate). */
3255
3256 static void
3257 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
3258 rtx src, HOST_WIDE_INT offset, rtx temp1,
3259 bool frame_related_p, bool emit_move_imm)
3260 {
3261 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3262 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3263
3264 HOST_WIDE_INT moffset = abs_hwi (offset);
3265 rtx_insn *insn;
3266
3267 if (!moffset)
3268 {
3269 if (!rtx_equal_p (dest, src))
3270 {
3271 insn = emit_insn (gen_rtx_SET (dest, src));
3272 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3273 }
3274 return;
3275 }
3276
3277 /* Single instruction adjustment. */
3278 if (aarch64_uimm12_shift (moffset))
3279 {
3280 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
3281 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3282 return;
3283 }
3284
3285 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3286 and either:
3287
3288 a) the offset cannot be loaded by a 16-bit move or
3289 b) there is no spare register into which we can move it. */
3290 if (moffset < 0x1000000
3291 && ((!temp1 && !can_create_pseudo_p ())
3292 || !aarch64_move_imm (moffset, mode)))
3293 {
3294 HOST_WIDE_INT low_off = moffset & 0xfff;
3295
3296 low_off = offset < 0 ? -low_off : low_off;
3297 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
3298 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3299 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
3300 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3301 return;
3302 }
3303
3304 /* Emit a move immediate if required and an addition/subtraction. */
3305 if (emit_move_imm)
3306 {
3307 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
3308 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
3309 }
3310 insn = emit_insn (offset < 0
3311 ? gen_sub3_insn (dest, src, temp1)
3312 : gen_add3_insn (dest, src, temp1));
3313 if (frame_related_p)
3314 {
3315 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3316 rtx adj = plus_constant (mode, src, offset);
3317 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
3318 }
3319 }
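
/* For example (a rough illustration): an OFFSET of 0x123456 is neither
   a shifted 12-bit immediate nor a MOV immediate, so the two-addition
   path above splits it into roughly:

	add	dest, src, #0x456
	add	dest, dest, #0x123000

   whereas an OFFSET of 0x2000000 is a MOV immediate and is loaded into
   a temporary before a single register-register addition.  */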
3320
3321 /* Return the number of temporary registers that aarch64_add_offset
3322 would need to move OFFSET into a register or add OFFSET to a register;
3323 ADD_P is true if we want the latter rather than the former. */
3324
3325 static unsigned int
3326 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
3327 {
3328 /* This follows the same structure as aarch64_add_offset. */
3329 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
3330 return 0;
3331
3332 unsigned int count = 0;
3333 HOST_WIDE_INT factor = offset.coeffs[1];
3334 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3335 poly_int64 poly_offset (factor, factor);
3336 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3337 /* Need one register for the ADDVL/ADDPL result. */
3338 count += 1;
3339 else if (factor != 0)
3340 {
3341 factor = abs (factor);
3342 if (factor > 16 * (factor & -factor))
3343 /* Need one register for the CNT result and one for the multiplication
3344 factor. If necessary, the second temporary can be reused for the
3345 constant part of the offset. */
3346 return 2;
3347 /* Need one register for the CNT result (which might then
3348 be shifted). */
3349 count += 1;
3350 }
3351 return count + aarch64_add_offset_1_temporaries (constant);
3352 }
3353
3354 /* If X can be represented as a poly_int64, return the number
3355 of temporaries that are required to add it to a register.
3356 Return -1 otherwise. */
3357
3358 int
3359 aarch64_add_offset_temporaries (rtx x)
3360 {
3361 poly_int64 offset;
3362 if (!poly_int_rtx_p (x, &offset))
3363 return -1;
3364 return aarch64_offset_temporaries (true, offset);
3365 }
3366
3367 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3368 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3369 be set and CFA adjustments added to the generated instructions.
3370
3371 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3372 temporary if register allocation is already complete. This temporary
3373 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3374 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3375 false to avoid emitting the immediate again.
3376
3377 TEMP2, if nonnull, is a second temporary register that doesn't
3378 overlap either DEST or SRC.
3379
3380 Since this function may be used to adjust the stack pointer, we must
3381 ensure that it cannot cause transient stack deallocation (for example
3382 by first incrementing SP and then decrementing when adjusting by a
3383 large immediate). */
3384
3385 static void
3386 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3387 poly_int64 offset, rtx temp1, rtx temp2,
3388 bool frame_related_p, bool emit_move_imm = true)
3389 {
3390 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3391 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3392 gcc_assert (temp1 == NULL_RTX
3393 || !frame_related_p
3394 || !reg_overlap_mentioned_p (temp1, dest));
3395 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3396
3397 /* Try using ADDVL or ADDPL to add the whole value. */
3398 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3399 {
3400 rtx offset_rtx = gen_int_mode (offset, mode);
3401 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3402 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3403 return;
3404 }
3405
3406 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3407 SVE vector register, over and above the minimum size of 128 bits.
3408 This is equivalent to half the value returned by CNTD with a
3409 vector shape of ALL. */
3410 HOST_WIDE_INT factor = offset.coeffs[1];
3411 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3412
3413 /* Try using ADDVL or ADDPL to add the VG-based part. */
3414 poly_int64 poly_offset (factor, factor);
3415 if (src != const0_rtx
3416 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3417 {
3418 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3419 if (frame_related_p)
3420 {
3421 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3422 RTX_FRAME_RELATED_P (insn) = true;
3423 src = dest;
3424 }
3425 else
3426 {
3427 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3428 src = aarch64_force_temporary (mode, temp1, addr);
3429 temp1 = temp2;
3430 temp2 = NULL_RTX;
3431 }
3432 }
3433 /* Otherwise use a CNT-based sequence. */
3434 else if (factor != 0)
3435 {
3436 /* Use a subtraction if we have a negative factor. */
3437 rtx_code code = PLUS;
3438 if (factor < 0)
3439 {
3440 factor = -factor;
3441 code = MINUS;
3442 }
3443
3444 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3445 into the multiplication. */
3446 rtx val;
3447 int shift = 0;
3448 if (factor & 1)
3449 /* Use a right shift by 1. */
3450 shift = -1;
3451 else
3452 factor /= 2;
3453 HOST_WIDE_INT low_bit = factor & -factor;
3454 if (factor <= 16 * low_bit)
3455 {
3456 if (factor > 16 * 8)
3457 {
3458 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3459 the value with the minimum multiplier and shift it into
3460 position. */
3461 int extra_shift = exact_log2 (low_bit);
3462 shift += extra_shift;
3463 factor >>= extra_shift;
3464 }
3465 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3466 }
3467 else
3468 {
3469 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
3470 directly, since that should increase the chances of being
3471 able to use a shift and add sequence. If LOW_BIT itself
3472 is out of range, just use CNTD. */
3473 if (low_bit <= 16 * 8)
3474 factor /= low_bit;
3475 else
3476 low_bit = 1;
3477
3478 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
3479 val = aarch64_force_temporary (mode, temp1, val);
3480
3481 if (can_create_pseudo_p ())
3482 {
3483 rtx coeff1 = gen_int_mode (factor, mode);
3484 val = expand_mult (mode, val, coeff1, NULL_RTX, false, true);
3485 }
3486 else
3487 {
3488 /* Go back to using a negative multiplication factor if we have
3489 no register from which to subtract. */
3490 if (code == MINUS && src == const0_rtx)
3491 {
3492 factor = -factor;
3493 code = PLUS;
3494 }
3495 rtx coeff1 = gen_int_mode (factor, mode);
3496 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3497 val = gen_rtx_MULT (mode, val, coeff1);
3498 }
3499 }
3500
3501 if (shift > 0)
3502 {
3503 /* Multiply by 1 << SHIFT. */
3504 val = aarch64_force_temporary (mode, temp1, val);
3505 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3506 }
3507 else if (shift == -1)
3508 {
3509 /* Divide by 2. */
3510 val = aarch64_force_temporary (mode, temp1, val);
3511 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3512 }
3513
3514 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3515 if (src != const0_rtx)
3516 {
3517 val = aarch64_force_temporary (mode, temp1, val);
3518 val = gen_rtx_fmt_ee (code, mode, src, val);
3519 }
3520 else if (code == MINUS)
3521 {
3522 val = aarch64_force_temporary (mode, temp1, val);
3523 val = gen_rtx_NEG (mode, val);
3524 }
3525
3526 if (constant == 0 || frame_related_p)
3527 {
3528 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3529 if (frame_related_p)
3530 {
3531 RTX_FRAME_RELATED_P (insn) = true;
3532 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3533 gen_rtx_SET (dest, plus_constant (Pmode, src,
3534 poly_offset)));
3535 }
3536 src = dest;
3537 if (constant == 0)
3538 return;
3539 }
3540 else
3541 {
3542 src = aarch64_force_temporary (mode, temp1, val);
3543 temp1 = temp2;
3544 temp2 = NULL_RTX;
3545 }
3546
3547 emit_move_imm = true;
3548 }
3549
3550 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3551 frame_related_p, emit_move_imm);
3552 }
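
/* A worked example (a rough sketch): adding two SVE vector lengths plus
   16 bytes, i.e. the poly_int64 (48, 32), to the stack pointer.  The
   whole offset is not an ADDVL/ADDPL immediate, but the VG-based part
   (32, 32) is, so the sequence is roughly:

	addvl	sp, sp, #2
	add	sp, sp, #16

   with aarch64_add_offset_1 handling the remaining constant of 16.  */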
3553
3554 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3555 than a poly_int64. */
3556
3557 void
3558 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3559 rtx offset_rtx, rtx temp1, rtx temp2)
3560 {
3561 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3562 temp1, temp2, false);
3563 }
3564
3565 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3566 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3567 if TEMP1 already contains abs (DELTA). */
3568
3569 static inline void
3570 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3571 {
3572 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3573 temp1, temp2, true, emit_move_imm);
3574 }
3575
3576 /* Subtract DELTA from the stack pointer, marking the instructions
3577 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3578 if nonnull. */
3579
3580 static inline void
3581 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3582 bool emit_move_imm = true)
3583 {
3584 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3585 temp1, temp2, frame_related_p, emit_move_imm);
3586 }
3587
3588 /* Set DEST to (vec_series BASE STEP). */
3589
3590 static void
3591 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3592 {
3593 machine_mode mode = GET_MODE (dest);
3594 scalar_mode inner = GET_MODE_INNER (mode);
3595
3596 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3597 if (!aarch64_sve_index_immediate_p (base))
3598 base = force_reg (inner, base);
3599 if (!aarch64_sve_index_immediate_p (step))
3600 step = force_reg (inner, step);
3601
3602 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3603 }
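
/* For example (illustrative only), BASE = 0 and STEP = 1 for an .S
   destination ultimately become the SVE INDEX instruction:

	index	z0.s, #0, #1

   giving lanes { 0, 1, 2, 3, ... } up to the vector length.  Operands
   outside [-16, 15] are first forced into scalar registers, giving the
   register form "index z0.s, w1, w2" instead.  */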
3604
3605 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
3606 register of mode MODE. Use TARGET for the result if it's nonnull
3607 and convenient.
3608
3609 The two vector modes must have the same element mode. The behavior
3610 is to duplicate architectural lane N of SRC into architectural lanes
3611 N + I * STEP of the result. On big-endian targets, architectural
3612 lane 0 of an Advanced SIMD vector is the last element of the vector
3613 in memory layout, so for big-endian targets this operation has the
3614 effect of reversing SRC before duplicating it. Callers need to
3615 account for this. */
3616
3617 rtx
3618 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
3619 {
3620 machine_mode src_mode = GET_MODE (src);
3621 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
3622 insn_code icode = (BYTES_BIG_ENDIAN
3623 ? code_for_aarch64_vec_duplicate_vq_be (mode)
3624 : code_for_aarch64_vec_duplicate_vq_le (mode));
3625
3626 unsigned int i = 0;
3627 expand_operand ops[3];
3628 create_output_operand (&ops[i++], target, mode);
3629 create_output_operand (&ops[i++], src, src_mode);
3630 if (BYTES_BIG_ENDIAN)
3631 {
3632 /* Create a PARALLEL describing the reversal of SRC. */
3633 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
3634 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
3635 nelts_per_vq - 1, -1);
3636 create_fixed_operand (&ops[i++], sel);
3637 }
3638 expand_insn (icode, i, ops);
3639 return ops[0].value;
3640 }
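
/* On little-endian targets the instruction behind this is typically the
   SVE indexed DUP (an illustrative sketch; the exact pattern depends on
   the element mode):

	dup	z0.q, z1.q[0]

   which copies the low 128 bits of z1 into every quadword of z0.  The
   big-endian pattern additionally performs the lane reversal described
   above.  */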
3641
3642 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
3643 the memory image into DEST. Return true on success. */
3644
3645 static bool
3646 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
3647 {
3648 src = force_const_mem (GET_MODE (src), src);
3649 if (!src)
3650 return false;
3651
3652 /* Make sure that the address is legitimate. */
3653 if (!aarch64_sve_ld1rq_operand_p (src))
3654 {
3655 rtx addr = force_reg (Pmode, XEXP (src, 0));
3656 src = replace_equiv_address (src, addr);
3657 }
3658
3659 machine_mode mode = GET_MODE (dest);
3660 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3661 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3662 rtx ptrue = aarch64_ptrue_reg (pred_mode);
3663 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
3664 return true;
3665 }
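
/* For example (illustrative), for an .S destination the emitted
   sequence is roughly:

	ptrue	p0.s
	ld1rqw	z0.s, p0/z, [x0]

   which replicates the 16-byte block at [x0] across the whole of z0.  */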
3666
3667 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
3668 SVE data mode and isn't a legitimate constant. Use TARGET for the
3669 result if convenient.
3670
3671 The returned register can have whatever mode seems most natural
3672 given the contents of SRC. */
3673
3674 static rtx
3675 aarch64_expand_sve_const_vector (rtx target, rtx src)
3676 {
3677 machine_mode mode = GET_MODE (src);
3678 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3679 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3680 scalar_mode elt_mode = GET_MODE_INNER (mode);
3681 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
3682 unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits;
3683
3684 if (nelts_per_pattern == 1 && encoded_bits == 128)
3685 {
3686 /* The constant is a duplicated quadword but can't be narrowed
3687 beyond a quadword. Get the memory image of the first quadword
3688 as a 128-bit vector and try using LD1RQ to load it from memory.
3689
3690 The effect for both endiannesses is to load memory lane N into
3691 architectural lanes N + I * STEP of the result. On big-endian
3692 targets, the layout of the 128-bit vector in an Advanced SIMD
3693 register would be different from its layout in an SVE register,
3694 but this 128-bit vector is a memory value only. */
3695 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3696 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
3697 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
3698 return target;
3699 }
3700
3701 if (nelts_per_pattern == 1 && encoded_bits < 128)
3702 {
3703 /* The vector is a repeating sequence of 64 bits or fewer.
3704 See if we can load them using an Advanced SIMD move and then
3705 duplicate it to fill a vector. This is better than using a GPR
3706 move because it keeps everything in the same register file. */
3707 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3708 rtx_vector_builder builder (vq_mode, npatterns, 1);
3709 for (unsigned int i = 0; i < npatterns; ++i)
3710 {
3711 /* We want memory lane N to go into architectural lane N,
3712 so reverse for big-endian targets. The DUP .Q pattern
3713 has a compensating reverse built-in. */
3714 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
3715 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
3716 }
3717 rtx vq_src = builder.build ();
3718 if (aarch64_simd_valid_immediate (vq_src, NULL))
3719 {
3720 vq_src = force_reg (vq_mode, vq_src);
3721 return aarch64_expand_sve_dupq (target, mode, vq_src);
3722 }
3723
3724 /* Get an integer representation of the repeating part of Advanced
3725 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
3726 which for big-endian targets is lane-swapped wrt a normal
3727 Advanced SIMD vector. This means that for both endiannesses,
3728 memory lane N of SVE vector SRC corresponds to architectural
3729 lane N of a register holding VQ_SRC. This in turn means that
3730 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
3731 as a single 128-bit value) and thus that memory lane 0 of SRC is
3732 in the lsb of the integer. Duplicating the integer therefore
3733 ensures that memory lane N of SRC goes into architectural lane
3734 N + I * STEP of the SVE register. */
3735 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
3736 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
3737 if (elt_value)
3738 {
3739 /* Pretend that we had a vector of INT_MODE to start with. */
3740 elt_mode = int_mode;
3741 mode = aarch64_full_sve_mode (int_mode).require ();
3742
3743 /* If the integer can be moved into a general register by a
3744 single instruction, do that and duplicate the result. */
3745 if (CONST_INT_P (elt_value)
3746 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
3747 {
3748 elt_value = force_reg (elt_mode, elt_value);
3749 return expand_vector_broadcast (mode, elt_value);
3750 }
3751 }
3752 else if (npatterns == 1)
3753 /* We're duplicating a single value, but can't do better than
3754 force it to memory and load from there. This handles things
3755 like symbolic constants. */
3756 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
3757
3758 if (elt_value)
3759 {
3760 /* Load the element from memory if we can, otherwise move it into
3761 a register and use a DUP. */
3762 rtx op = force_const_mem (elt_mode, elt_value);
3763 if (!op)
3764 op = force_reg (elt_mode, elt_value);
3765 return expand_vector_broadcast (mode, op);
3766 }
3767 }
3768
3769 /* Try using INDEX. */
3770 rtx base, step;
3771 if (const_vec_series_p (src, &base, &step))
3772 {
3773 aarch64_expand_vec_series (target, base, step);
3774 return target;
3775 }
3776
3777 /* From here on, it's better to force the whole constant to memory
3778 if we can. */
3779 if (GET_MODE_NUNITS (mode).is_constant ())
3780 return NULL_RTX;
3781
3782 /* Expand each pattern individually. */
3783 gcc_assert (npatterns > 1);
3784 rtx_vector_builder builder;
3785 auto_vec<rtx, 16> vectors (npatterns);
3786 for (unsigned int i = 0; i < npatterns; ++i)
3787 {
3788 builder.new_vector (mode, 1, nelts_per_pattern);
3789 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3790 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3791 vectors.quick_push (force_reg (mode, builder.build ()));
3792 }
3793
3794 /* Use permutes to interleave the separate vectors. */
3795 while (npatterns > 1)
3796 {
3797 npatterns /= 2;
3798 for (unsigned int i = 0; i < npatterns; ++i)
3799 {
3800 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
3801 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3802 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3803 vectors[i] = tmp;
3804 }
3805 }
3806 gcc_assert (vectors[0] == target);
3807 return target;
3808 }
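
/* Illustrative example of the interleaving step above: a constant with
   two patterns { X, Y, X, Y, ... } is first built as the two
   single-pattern vectors { X, X, ... } and { Y, Y, ... }, and a single

	zip1	z0.s, z1.s, z2.s

   interleaves them back into { X, Y, X, Y, ... }.  Four patterns need
   two rounds of ZIP1, and so on, halving NPATTERNS each time.  */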
3809
3810 /* Use WHILE to set a predicate register of mode MODE in which the first
3811 VL bits are set and the rest are clear. Use TARGET for the register
3812 if it's nonnull and convenient. */
3813
3814 static rtx
3815 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
3816 unsigned int vl)
3817 {
3818 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
3819 target = aarch64_target_reg (target, mode);
3820 emit_insn (gen_while_ult (DImode, mode, target, const0_rtx, limit));
3821 return target;
3822 }
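
/* For example (a sketch; register allocation will differ), setting the
   first three elements of a .S predicate becomes roughly:

	mov	x0, 3
	whilelo	p0.s, xzr, x0

   which sets lanes 0..2 of p0.s and clears the rest.  */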
3823
3824 static rtx
3825 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
3826
3827 /* BUILDER is a constant predicate in which the index of every set bit
3828 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3829 by inverting every element at a multiple of ELT_SIZE and EORing the
3830 result with an ELT_SIZE PTRUE.
3831
3832 Return a register that contains the constant on success, otherwise
3833 return null. Use TARGET as the register if it is nonnull and
3834 convenient. */
3835
3836 static rtx
3837 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
3838 unsigned int elt_size)
3839 {
3840 /* Invert every element at a multiple of ELT_SIZE, keeping the
3841 other bits zero. */
3842 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
3843 builder.nelts_per_pattern ());
3844 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
3845 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
3846 inv_builder.quick_push (const1_rtx);
3847 else
3848 inv_builder.quick_push (const0_rtx);
3849 inv_builder.finalize ();
3850
3851 /* See if we can load the constant cheaply. */
3852 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
3853 if (!inv)
3854 return NULL_RTX;
3855
3856 /* EOR the result with an ELT_SIZE PTRUE. */
3857 rtx mask = aarch64_ptrue_all (elt_size);
3858 mask = force_reg (VNx16BImode, mask);
3859 target = aarch64_target_reg (target, VNx16BImode);
3860 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
3861 return target;
3862 }
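
/* Worked example (illustrative): the .B predicate { 0, 1, 1, 1, ... }
   starts with a clear element, so it is not a partial PTRUE.  Its
   inversion { 1, 0, 0, 0, ... } is cheap to load (a VL1 PTRUE), and
   EORing that with an all-true .B predicate recovers the original
   constant.  */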
3863
3864 /* BUILDER is a constant predicate in which the index of every set bit
3865 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3866 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
3867 register on success, otherwise return null. Use TARGET as the register
3868 if nonnull and convenient. */
3869
3870 static rtx
3871 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
3872 unsigned int elt_size,
3873 unsigned int permute_size)
3874 {
3875 /* We're going to split the constant into two new constants A and B,
3876 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
3877 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
3878
3879 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
3880 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
3881
3882 where _ indicates elements that will be discarded by the permute.
3883
3884 First calculate the ELT_SIZEs for A and B. */
3885 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
3886 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
3887 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
3888 if (INTVAL (builder.elt (i)) != 0)
3889 {
3890 if (i & permute_size)
3891 b_elt_size |= i - permute_size;
3892 else
3893 a_elt_size |= i;
3894 }
3895 a_elt_size &= -a_elt_size;
3896 b_elt_size &= -b_elt_size;
3897
3898 /* Now construct the vectors themselves. */
3899 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
3900 builder.nelts_per_pattern ());
3901 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
3902 builder.nelts_per_pattern ());
3903 unsigned int nelts = builder.encoded_nelts ();
3904 for (unsigned int i = 0; i < nelts; ++i)
3905 if (i & (elt_size - 1))
3906 {
3907 a_builder.quick_push (const0_rtx);
3908 b_builder.quick_push (const0_rtx);
3909 }
3910 else if ((i & permute_size) == 0)
3911 {
3912 /* The A and B elements are significant. */
3913 a_builder.quick_push (builder.elt (i));
3914 b_builder.quick_push (builder.elt (i + permute_size));
3915 }
3916 else
3917 {
3918 /* The A and B elements are going to be discarded, so pick whatever
3919 is likely to give a nice constant. We are targeting element
3920 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
3921 with the aim of each being a sequence of ones followed by
3922 a sequence of zeros. So:
3923
3924 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
3925 duplicate the last X_ELT_SIZE element, to extend the
3926 current sequence of ones or zeros.
3927
3928 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
3929 zero, so that the constant really does have X_ELT_SIZE and
3930 not a smaller size. */
3931 if (a_elt_size > permute_size)
3932 a_builder.quick_push (const0_rtx);
3933 else
3934 a_builder.quick_push (a_builder.elt (i - a_elt_size));
3935 if (b_elt_size > permute_size)
3936 b_builder.quick_push (const0_rtx);
3937 else
3938 b_builder.quick_push (b_builder.elt (i - b_elt_size));
3939 }
3940 a_builder.finalize ();
3941 b_builder.finalize ();
3942
3943 /* Try loading A into a register. */
3944 rtx_insn *last = get_last_insn ();
3945 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
3946 if (!a)
3947 return NULL_RTX;
3948
3949 /* Try loading B into a register. */
3950 rtx b = a;
3951 if (a_builder != b_builder)
3952 {
3953 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
3954 if (!b)
3955 {
3956 delete_insns_since (last);
3957 return NULL_RTX;
3958 }
3959 }
3960
3961 /* Emit the TRN1 itself. */
3962 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
3963 target = aarch64_target_reg (target, mode);
3964 emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
3965 gen_lowpart (mode, a),
3966 gen_lowpart (mode, b)));
3967 return target;
3968 }
3969
3970 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
3971 constant in BUILDER into an SVE predicate register. Return the register
3972 on success, otherwise return null. Use TARGET for the register if
3973 nonnull and convenient.
3974
3975 ALLOW_RECURSE_P is true if we can use methods that would call this
3976 function recursively. */
3977
3978 static rtx
3979 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
3980 bool allow_recurse_p)
3981 {
3982 if (builder.encoded_nelts () == 1)
3983 /* A PFALSE or a PTRUE .B ALL. */
3984 return aarch64_emit_set_immediate (target, builder);
3985
3986 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
3987 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
3988 {
3989 /* If we can load the constant using PTRUE, use it as-is. */
3990 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
3991 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
3992 return aarch64_emit_set_immediate (target, builder);
3993
3994 /* Otherwise use WHILE to set the first VL bits. */
3995 return aarch64_sve_move_pred_via_while (target, mode, vl);
3996 }
3997
3998 if (!allow_recurse_p)
3999 return NULL_RTX;
4000
4001 /* Try inverting the vector in element size ELT_SIZE and then EORing
4002 the result with an ELT_SIZE PTRUE. */
4003 if (INTVAL (builder.elt (0)) == 0)
4004 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
4005 elt_size))
4006 return res;
4007
4008 /* Try using TRN1 to permute two simpler constants. */
4009 for (unsigned int i = elt_size; i <= 8; i *= 2)
4010 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
4011 elt_size, i))
4012 return res;
4013
4014 return NULL_RTX;
4015 }
4016
4017 /* Return an SVE predicate register that contains the VNx16BImode
4018 constant in BUILDER, without going through the move expanders.
4019
4020 The returned register can have whatever mode seems most natural
4021 given the contents of BUILDER. Use TARGET for the result if
4022 convenient. */
4023
4024 static rtx
4025 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
4026 {
4027 /* Try loading the constant using pure predicate operations. */
4028 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
4029 return res;
4030
4031 /* Try forcing the constant to memory. */
4032 if (builder.full_nelts ().is_constant ())
4033 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
4034 {
4035 target = aarch64_target_reg (target, VNx16BImode);
4036 emit_move_insn (target, mem);
4037 return target;
4038 }
4039
4040 /* The last resort is to load the constant as an integer and then
4041 compare it against zero. Use -1 for set bits in order to increase
4042 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
4043 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
4044 builder.nelts_per_pattern ());
4045 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4046 int_builder.quick_push (INTVAL (builder.elt (i))
4047 ? constm1_rtx : const0_rtx);
4048 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
4049 int_builder.build ());
4050 }
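
/* A sketch of the last-resort path (illustrative; the exact sequence
   depends on how the data vector itself is loaded):

	...load the 0/-1 byte vector into z0 (DUPM or literal pool)...
	ptrue	p1.b
	cmpne	p0.b, p1/z, z0.b, #0

   i.e. materialise the 0/-1 byte vector and turn it into a predicate
   by comparing against zero.  */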
4051
4052 /* Set DEST to immediate IMM. */
4053
4054 void
4055 aarch64_expand_mov_immediate (rtx dest, rtx imm)
4056 {
4057 machine_mode mode = GET_MODE (dest);
4058
4059 /* Check on what type of symbol it is. */
4060 scalar_int_mode int_mode;
4061 if ((GET_CODE (imm) == SYMBOL_REF
4062 || GET_CODE (imm) == LABEL_REF
4063 || GET_CODE (imm) == CONST
4064 || GET_CODE (imm) == CONST_POLY_INT)
4065 && is_a <scalar_int_mode> (mode, &int_mode))
4066 {
4067 rtx mem;
4068 poly_int64 offset;
4069 HOST_WIDE_INT const_offset;
4070 enum aarch64_symbol_type sty;
4071
4072 /* If we have (const (plus symbol offset)), separate out the offset
4073 before we start classifying the symbol. */
4074 rtx base = strip_offset (imm, &offset);
4075
4076 /* We must always add an offset involving VL separately, rather than
4077 folding it into the relocation. */
4078 if (!offset.is_constant (&const_offset))
4079 {
4080 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
4081 emit_insn (gen_rtx_SET (dest, imm));
4082 else
4083 {
4084 /* Do arithmetic on 32-bit values if the result is smaller
4085 than that. */
4086 if (partial_subreg_p (int_mode, SImode))
4087 {
4088 /* It is invalid to do symbol calculations in modes
4089 narrower than SImode. */
4090 gcc_assert (base == const0_rtx);
4091 dest = gen_lowpart (SImode, dest);
4092 int_mode = SImode;
4093 }
4094 if (base != const0_rtx)
4095 {
4096 base = aarch64_force_temporary (int_mode, dest, base);
4097 aarch64_add_offset (int_mode, dest, base, offset,
4098 NULL_RTX, NULL_RTX, false);
4099 }
4100 else
4101 aarch64_add_offset (int_mode, dest, base, offset,
4102 dest, NULL_RTX, false);
4103 }
4104 return;
4105 }
4106
4107 sty = aarch64_classify_symbol (base, const_offset);
4108 switch (sty)
4109 {
4110 case SYMBOL_FORCE_TO_MEM:
4111 if (const_offset != 0
4112 && targetm.cannot_force_const_mem (int_mode, imm))
4113 {
4114 gcc_assert (can_create_pseudo_p ());
4115 base = aarch64_force_temporary (int_mode, dest, base);
4116 aarch64_add_offset (int_mode, dest, base, const_offset,
4117 NULL_RTX, NULL_RTX, false);
4118 return;
4119 }
4120
4121 mem = force_const_mem (ptr_mode, imm);
4122 gcc_assert (mem);
4123
4124 /* If we aren't generating PC relative literals, then
4125 we need to expand the literal pool access carefully.
4126 This is something that needs to be done in a number
4127 of places, so could well live as a separate function. */
4128 if (!aarch64_pcrelative_literal_loads)
4129 {
4130 gcc_assert (can_create_pseudo_p ());
4131 base = gen_reg_rtx (ptr_mode);
4132 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
4133 if (ptr_mode != Pmode)
4134 base = convert_memory_address (Pmode, base);
4135 mem = gen_rtx_MEM (ptr_mode, base);
4136 }
4137
4138 if (int_mode != ptr_mode)
4139 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
4140
4141 emit_insn (gen_rtx_SET (dest, mem));
4142
4143 return;
4144
4145 case SYMBOL_SMALL_TLSGD:
4146 case SYMBOL_SMALL_TLSDESC:
4147 case SYMBOL_SMALL_TLSIE:
4148 case SYMBOL_SMALL_GOT_28K:
4149 case SYMBOL_SMALL_GOT_4G:
4150 case SYMBOL_TINY_GOT:
4151 case SYMBOL_TINY_TLSIE:
4152 if (const_offset != 0)
4153 {
4154 gcc_assert (can_create_pseudo_p ());
4155 base = aarch64_force_temporary (int_mode, dest, base);
4156 aarch64_add_offset (int_mode, dest, base, const_offset,
4157 NULL_RTX, NULL_RTX, false);
4158 return;
4159 }
4160 /* FALLTHRU */
4161
4162 case SYMBOL_SMALL_ABSOLUTE:
4163 case SYMBOL_TINY_ABSOLUTE:
4164 case SYMBOL_TLSLE12:
4165 case SYMBOL_TLSLE24:
4166 case SYMBOL_TLSLE32:
4167 case SYMBOL_TLSLE48:
4168 aarch64_load_symref_appropriately (dest, imm, sty);
4169 return;
4170
4171 default:
4172 gcc_unreachable ();
4173 }
4174 }
4175
4176 if (!CONST_INT_P (imm))
4177 {
4178 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
4179 {
4180 /* Only the low bit of each .H, .S and .D element is defined,
4181 so we can set the upper bits to whatever we like. If the
4182 predicate is all-true in MODE, prefer to set all the undefined
4183 bits as well, so that we can share a single .B predicate for
4184 all modes. */
4185 if (imm == CONSTM1_RTX (mode))
4186 imm = CONSTM1_RTX (VNx16BImode);
4187
4188 /* All methods for constructing predicate modes wider than VNx16BI
4189 will set the upper bits of each element to zero. Expose this
4190 by moving such constants as a VNx16BI, so that all bits are
4191 significant and so that constants for different modes can be
4192 shared. The wider constant will still be available as a
4193 REG_EQUAL note. */
4194 rtx_vector_builder builder;
4195 if (aarch64_get_sve_pred_bits (builder, imm))
4196 {
4197 rtx res = aarch64_expand_sve_const_pred (dest, builder);
4198 if (dest != res)
4199 emit_move_insn (dest, gen_lowpart (mode, res));
4200 return;
4201 }
4202 }
4203
4204 if (GET_CODE (imm) == HIGH
4205 || aarch64_simd_valid_immediate (imm, NULL))
4206 {
4207 emit_insn (gen_rtx_SET (dest, imm));
4208 return;
4209 }
4210
4211 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
4212 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
4213 {
4214 if (dest != res)
4215 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
4216 return;
4217 }
4218
4219 rtx mem = force_const_mem (mode, imm);
4220 gcc_assert (mem);
4221 emit_move_insn (dest, mem);
4222 return;
4223 }
4224
4225 aarch64_internal_mov_immediate (dest, imm, true,
4226 as_a <scalar_int_mode> (mode));
4227 }
4228
4229 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
4230 that is known to contain PTRUE. */
4231
4232 void
4233 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
4234 {
4235 expand_operand ops[3];
4236 machine_mode mode = GET_MODE (dest);
4237 create_output_operand (&ops[0], dest, mode);
4238 create_input_operand (&ops[1], pred, GET_MODE (pred));
4239 create_input_operand (&ops[2], src, mode);
4240 temporary_volatile_ok v (true);
4241 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
4242 }
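
/* A minimal usage sketch (hypothetical operands, for illustration):

     rtx ptrue = aarch64_ptrue_reg (VNx4BImode);
     aarch64_emit_sve_pred_move (dest, ptrue, src);

   where DEST is a VNx4SImode register and SRC a VNx4SImode memory
   operand; the expander then emits the predicated LD1W form rather
   than an LDR.  */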
4243
4244 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
4245 operand is in memory. In this case we need to use the predicated LD1
4246 and ST1 instead of LDR and STR, both for correctness on big-endian
4247 targets and because LD1 and ST1 support a wider range of addressing modes.
4248 PRED_MODE is the mode of the predicate.
4249
4250 See the comment at the head of aarch64-sve.md for details about the
4251 big-endian handling. */
4252
4253 void
4254 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
4255 {
4256 machine_mode mode = GET_MODE (dest);
4257 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4258 if (!register_operand (src, mode)
4259 && !register_operand (dest, mode))
4260 {
4261 rtx tmp = gen_reg_rtx (mode);
4262 if (MEM_P (src))
4263 aarch64_emit_sve_pred_move (tmp, ptrue, src);
4264 else
4265 emit_move_insn (tmp, src);
4266 src = tmp;
4267 }
4268 aarch64_emit_sve_pred_move (dest, ptrue, src);
4269 }
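
/* For example (illustrative), a memory-to-memory VNx4SI move expands
   to roughly:

	ptrue	p0.s
	ld1w	z0.s, p0/z, [x0]
	st1w	z0.s, p0, [x1]

   rather than LDR/STR, which keeps big-endian lane ordering correct
   and allows the full set of SVE addressing modes.  */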
4270
4271 /* Called only on big-endian targets. See whether an SVE vector move
4272 from SRC to DEST is effectively a REV[BHW] instruction, because at
4273 least one operand is a subreg of an SVE vector that has wider or
4274 narrower elements. Return true and emit the instruction if so.
4275
4276 For example:
4277
4278 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
4279
4280 represents a VIEW_CONVERT between the following vectors, viewed
4281 in memory order:
4282
4283 R2: { [0].high, [0].low, [1].high, [1].low, ... }
4284 R1: { [0], [1], [2], [3], ... }
4285
4286 The high part of lane X in R2 should therefore correspond to lane X*2
4287 of R1, but the register representations are:
4288
4289 msb lsb
4290 R2: ...... [1].high [1].low [0].high [0].low
4291 R1: ...... [3] [2] [1] [0]
4292
4293 where the low part of lane X in R2 corresponds to lane X*2 in R1.
4294 We therefore need a reverse operation to swap the high and low values
4295 around.
4296
4297 This is purely an optimization. Without it we would spill the
4298 subreg operand to the stack in one mode and reload it in the
4299 other mode, which has the same effect as the REV. */
4300
4301 bool
4302 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
4303 {
4304 gcc_assert (BYTES_BIG_ENDIAN);
4305 if (GET_CODE (dest) == SUBREG)
4306 dest = SUBREG_REG (dest);
4307 if (GET_CODE (src) == SUBREG)
4308 src = SUBREG_REG (src);
4309
4310 /* The optimization handles two single SVE REGs with different element
4311 sizes. */
4312 if (!REG_P (dest)
4313 || !REG_P (src)
4314 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
4315 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
4316 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
4317 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
4318 return false;
4319
4320 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
4321 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
4322 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
4323 UNSPEC_REV_SUBREG);
4324 emit_insn (gen_rtx_SET (dest, unspec));
4325 return true;
4326 }
4327
4328 /* Return a copy of X with mode MODE, without changing its other
4329 attributes. Unlike gen_lowpart, this doesn't care whether the
4330 mode change is valid. */
4331
4332 static rtx
4333 aarch64_replace_reg_mode (rtx x, machine_mode mode)
4334 {
4335 if (GET_MODE (x) == mode)
4336 return x;
4337
4338 x = shallow_copy_rtx (x);
4339 set_mode_and_regno (x, mode, REGNO (x));
4340 return x;
4341 }
4342
4343 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
4344 stored in wider integer containers. */
4345
4346 static unsigned int
4347 aarch64_sve_rev_unspec (machine_mode mode)
4348 {
4349 switch (GET_MODE_UNIT_SIZE (mode))
4350 {
4351 case 1: return UNSPEC_REVB;
4352 case 2: return UNSPEC_REVH;
4353 case 4: return UNSPEC_REVW;
4354 }
4355 gcc_unreachable ();
4356 }
4357
4358 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
4359 operands. */
4360
4361 void
4362 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
4363 {
4364 /* Decide which REV operation we need. The mode with wider elements
4365 determines the mode of the operands and the mode with the narrower
4366 elements determines the reverse width. */
4367 machine_mode mode_with_wider_elts = GET_MODE (dest);
4368 machine_mode mode_with_narrower_elts = GET_MODE (src);
4369 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
4370 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
4371 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
4372
4373 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
4374 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
4375 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
4376
4377 /* Get the operands in the appropriate modes and emit the instruction. */
4378 ptrue = gen_lowpart (pred_mode, ptrue);
4379 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
4380 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
4381 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
4382 dest, ptrue, src));
4383 }
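
/* For example (illustrative), a big-endian move between a VNx8HI
   register and a VNx16QI subreg splits to roughly:

	ptrue	p0.h
	revb	z0.h, p0/m, z1.h

   i.e. a byte reversal within each halfword, which is exactly the lane
   swap described above.  */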
4384
4385 static bool
4386 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
4387 tree exp ATTRIBUTE_UNUSED)
4388 {
4389 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
4390 return false;
4391
4392 return true;
4393 }
4394
4395 /* Implement TARGET_PASS_BY_REFERENCE. */
4396
4397 static bool
4398 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
4399 machine_mode mode,
4400 const_tree type,
4401 bool named ATTRIBUTE_UNUSED)
4402 {
4403 HOST_WIDE_INT size;
4404 machine_mode dummymode;
4405 int nregs;
4406
4407 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
4408 if (mode == BLKmode && type)
4409 size = int_size_in_bytes (type);
4410 else
4411 /* No frontends can create types with variable-sized modes, so we
4412 shouldn't be asked to pass or return them. */
4413 size = GET_MODE_SIZE (mode).to_constant ();
4414
4415 /* Aggregates are passed by reference based on their size. */
4416 if (type && AGGREGATE_TYPE_P (type))
4417 {
4418 size = int_size_in_bytes (type);
4419 }
4420
4421 /* Variable sized arguments are always returned by reference. */
4422 if (size < 0)
4423 return true;
4424
4425 /* Can this be a candidate to be passed in fp/simd register(s)? */
4426 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4427 &dummymode, &nregs,
4428 NULL))
4429 return false;
4430
4431 /* Arguments which are variable sized or larger than 2 registers are
4432 passed by reference unless they are a homogeneous floating-point
4433 aggregate. */
4434 return size > 2 * UNITS_PER_WORD;
4435 }
4436
4437 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4438 static bool
4439 aarch64_return_in_msb (const_tree valtype)
4440 {
4441 machine_mode dummy_mode;
4442 int dummy_int;
4443
4444 /* Never happens in little-endian mode. */
4445 if (!BYTES_BIG_ENDIAN)
4446 return false;
4447
4448 /* Only composite types smaller than or equal to 16 bytes can
4449 be potentially returned in registers. */
4450 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
4451 || int_size_in_bytes (valtype) <= 0
4452 || int_size_in_bytes (valtype) > 16)
4453 return false;
4454
4455 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4456 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4457 is always passed/returned in the least significant bits of fp/simd
4458 register(s). */
4459 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
4460 &dummy_mode, &dummy_int, NULL))
4461 return false;
4462
4463 return true;
4464 }
4465
4466 /* Implement TARGET_FUNCTION_VALUE.
4467 Define how to find the value returned by a function. */
4468
4469 static rtx
4470 aarch64_function_value (const_tree type, const_tree func,
4471 bool outgoing ATTRIBUTE_UNUSED)
4472 {
4473 machine_mode mode;
4474 int unsignedp;
4475 int count;
4476 machine_mode ag_mode;
4477
4478 mode = TYPE_MODE (type);
4479 if (INTEGRAL_TYPE_P (type))
4480 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
4481
4482 if (aarch64_return_in_msb (type))
4483 {
4484 HOST_WIDE_INT size = int_size_in_bytes (type);
4485
4486 if (size % UNITS_PER_WORD != 0)
4487 {
4488 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
4489 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
4490 }
4491 }
4492
4493 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4494 &ag_mode, &count, NULL))
4495 {
4496 if (!aarch64_composite_type_p (type, mode))
4497 {
4498 gcc_assert (count == 1 && mode == ag_mode);
4499 return gen_rtx_REG (mode, V0_REGNUM);
4500 }
4501 else
4502 {
4503 int i;
4504 rtx par;
4505
4506 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
4507 for (i = 0; i < count; i++)
4508 {
4509 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
4510 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
4511 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4512 XVECEXP (par, 0, i) = tmp;
4513 }
4514 return par;
4515 }
4516 }
4517 else
4518 return gen_rtx_REG (mode, R0_REGNUM);
4519 }
4520
4521 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
4522 Return true if REGNO is the number of a hard register in which the values
4523 of called function may come back. */
4524
4525 static bool
4526 aarch64_function_value_regno_p (const unsigned int regno)
4527 {
4528 /* Maximum of 16 bytes can be returned in the general registers. Examples
4529 of 16-byte return values are: 128-bit integers and 16-byte small
4530 structures (excluding homogeneous floating-point aggregates). */
4531 if (regno == R0_REGNUM || regno == R1_REGNUM)
4532 return true;
4533
4534 /* Up to four fp/simd registers can return a function value, e.g. a
4535 homogeneous floating-point aggregate having four members. */
4536 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
4537 return TARGET_FLOAT;
4538
4539 return false;
4540 }
4541
4542 /* Implement TARGET_RETURN_IN_MEMORY.
4543
4544 If the type T of the result of a function is such that
4545 void func (T arg)
4546 would require that arg be passed as a value in a register (or set of
4547 registers) according to the parameter passing rules, then the result
4548 is returned in the same registers as would be used for such an
4549 argument. */
4550
4551 static bool
4552 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
4553 {
4554 HOST_WIDE_INT size;
4555 machine_mode ag_mode;
4556 int count;
4557
4558 if (!AGGREGATE_TYPE_P (type)
4559 && TREE_CODE (type) != COMPLEX_TYPE
4560 && TREE_CODE (type) != VECTOR_TYPE)
4561 /* Simple scalar types are always returned in registers. */
4562 return false;
4563
4564 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
4565 type,
4566 &ag_mode,
4567 &count,
4568 NULL))
4569 return false;
4570
4571 /* Types larger than 2 registers are returned in memory. */
4572 size = int_size_in_bytes (type);
4573 return (size < 0 || size > 2 * UNITS_PER_WORD);
4574 }
4575
4576 static bool
4577 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
4578 const_tree type, int *nregs)
4579 {
4580 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4581 return aarch64_vfp_is_call_or_return_candidate (mode,
4582 type,
4583 &pcum->aapcs_vfp_rmode,
4584 nregs,
4585 NULL);
4586 }
4587
4588 /* Given MODE and TYPE of a function argument, return the alignment in
4589 bits. The idea is to suppress any stronger alignment requested by
4590 the user and opt for the natural alignment (specified in AAPCS64 \S
4591 4.1). ABI_BREAK is set to true if the alignment was incorrectly
4592 calculated in versions of GCC prior to GCC-9. This is a helper
4593 function for local use only. */
4594
4595 static unsigned int
4596 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
4597 bool *abi_break)
4598 {
4599 *abi_break = false;
4600 if (!type)
4601 return GET_MODE_ALIGNMENT (mode);
4602
4603 if (integer_zerop (TYPE_SIZE (type)))
4604 return 0;
4605
4606 gcc_assert (TYPE_MODE (type) == mode);
4607
4608 if (!AGGREGATE_TYPE_P (type))
4609 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
4610
4611 if (TREE_CODE (type) == ARRAY_TYPE)
4612 return TYPE_ALIGN (TREE_TYPE (type));
4613
4614 unsigned int alignment = 0;
4615 unsigned int bitfield_alignment = 0;
4616 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
4617 if (TREE_CODE (field) == FIELD_DECL)
4618 {
4619 alignment = std::max (alignment, DECL_ALIGN (field));
4620 if (DECL_BIT_FIELD_TYPE (field))
4621 bitfield_alignment
4622 = std::max (bitfield_alignment,
4623 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
4624 }
4625
4626 if (bitfield_alignment > alignment)
4627 {
4628 *abi_break = true;
4629 return bitfield_alignment;
4630 }
4631
4632 return alignment;
4633 }
4634
4635 /* Layout a function argument according to the AAPCS64 rules. The rule
4636 numbers refer to the rule numbers in the AAPCS64. */
4637
4638 static void
4639 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
4640 const_tree type,
4641 bool named ATTRIBUTE_UNUSED)
4642 {
4643 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4644 int ncrn, nvrn, nregs;
4645 bool allocate_ncrn, allocate_nvrn;
4646 HOST_WIDE_INT size;
4647 bool abi_break;
4648
4649 /* We need to do this once per argument. */
4650 if (pcum->aapcs_arg_processed)
4651 return;
4652
4653 pcum->aapcs_arg_processed = true;
4654
4655 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
4656 if (type)
4657 size = int_size_in_bytes (type);
4658 else
4659 /* No frontends can create types with variable-sized modes, so we
4660 shouldn't be asked to pass or return them. */
4661 size = GET_MODE_SIZE (mode).to_constant ();
4662 size = ROUND_UP (size, UNITS_PER_WORD);
4663
4664 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
4665 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
4666 mode,
4667 type,
4668 &nregs);
4669
4670 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
4671 The following code thus handles passing by SIMD/FP registers first. */
4672
4673 nvrn = pcum->aapcs_nvrn;
4674
4675 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
4676 and homogeneous short-vector aggregates (HVA). */
4677 if (allocate_nvrn)
4678 {
4679 if (!TARGET_FLOAT)
4680 aarch64_err_no_fpadvsimd (mode);
4681
4682 if (nvrn + nregs <= NUM_FP_ARG_REGS)
4683 {
4684 pcum->aapcs_nextnvrn = nvrn + nregs;
4685 if (!aarch64_composite_type_p (type, mode))
4686 {
4687 gcc_assert (nregs == 1);
4688 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
4689 }
4690 else
4691 {
4692 rtx par;
4693 int i;
4694 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4695 for (i = 0; i < nregs; i++)
4696 {
4697 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
4698 V0_REGNUM + nvrn + i);
4699 rtx offset = gen_int_mode
4700 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
4701 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4702 XVECEXP (par, 0, i) = tmp;
4703 }
4704 pcum->aapcs_reg = par;
4705 }
4706 return;
4707 }
4708 else
4709 {
4710 /* C.3 NSRN is set to 8. */
4711 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
4712 goto on_stack;
4713 }
4714 }
4715
4716 ncrn = pcum->aapcs_ncrn;
4717 nregs = size / UNITS_PER_WORD;
4718
4719 /* C6 - C9, though the sign and zero extension semantics are
4720 handled elsewhere. This is the case where the argument fits
4721 entirely in general registers. */
4722 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
4723 {
4724 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
4725
4726 /* C.8 if the argument has an alignment of 16 then the NGRN is
4727 rounded up to the next even number. */
4728 if (nregs == 2
4729 && ncrn % 2
4730 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
4731 comparison is there because for > 16 * BITS_PER_UNIT
4732 alignment nregs should be > 2 and therefore it should be
4733 passed by reference rather than by value. */
4734 && (aarch64_function_arg_alignment (mode, type, &abi_break)
4735 == 16 * BITS_PER_UNIT))
4736 {
4737 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4738 inform (input_location, "parameter passing for argument of type "
4739 "%qT changed in GCC 9.1", type);
4740 ++ncrn;
4741 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
4742 }
4743
4744 /* NREGS can be 0 when e.g. an empty structure is to be passed.
4745 A reg is still generated for it, but the caller should be smart
4746 enough not to use it. */
4747 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
4748 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
4749 else
4750 {
4751 rtx par;
4752 int i;
4753
4754 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4755 for (i = 0; i < nregs; i++)
4756 {
4757 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
4758 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
4759 GEN_INT (i * UNITS_PER_WORD));
4760 XVECEXP (par, 0, i) = tmp;
4761 }
4762 pcum->aapcs_reg = par;
4763 }
4764
4765 pcum->aapcs_nextncrn = ncrn + nregs;
4766 return;
4767 }
4768
4769 /* C.11 */
4770 pcum->aapcs_nextncrn = NUM_ARG_REGS;
4771
4772 /* The argument is passed on stack; record the needed number of words for
4773 this argument and align the total size if necessary. */
4774 on_stack:
4775 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
4776
4777 if (aarch64_function_arg_alignment (mode, type, &abi_break)
4778 == 16 * BITS_PER_UNIT)
4779 {
4780 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
4781 if (pcum->aapcs_stack_size != new_size)
4782 {
4783 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4784 inform (input_location, "parameter passing for argument of type "
4785 "%qT changed in GCC 9.1", type);
4786 pcum->aapcs_stack_size = new_size;
4787 }
4788 }
4789 return;
4790 }
4791
4792 /* Implement TARGET_FUNCTION_ARG. */
4793
4794 static rtx
4795 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
4796 const_tree type, bool named)
4797 {
4798 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4799 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
4800
4801 if (mode == VOIDmode)
4802 return NULL_RTX;
4803
4804 aarch64_layout_arg (pcum_v, mode, type, named);
4805 return pcum->aapcs_reg;
4806 }
4807
4808 void
4809 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
4810 const_tree fntype ATTRIBUTE_UNUSED,
4811 rtx libname ATTRIBUTE_UNUSED,
4812 const_tree fndecl ATTRIBUTE_UNUSED,
4813 unsigned n_named ATTRIBUTE_UNUSED)
4814 {
4815 pcum->aapcs_ncrn = 0;
4816 pcum->aapcs_nvrn = 0;
4817 pcum->aapcs_nextncrn = 0;
4818 pcum->aapcs_nextnvrn = 0;
4819 pcum->pcs_variant = ARM_PCS_AAPCS64;
4820 pcum->aapcs_reg = NULL_RTX;
4821 pcum->aapcs_arg_processed = false;
4822 pcum->aapcs_stack_words = 0;
4823 pcum->aapcs_stack_size = 0;
4824
4825 if (!TARGET_FLOAT
4826 && fndecl && TREE_PUBLIC (fndecl)
4827 && fntype && fntype != error_mark_node)
4828 {
4829 const_tree type = TREE_TYPE (fntype);
4830 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
4831 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
4832 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4833 &mode, &nregs, NULL))
4834 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4835 }
4836 return;
4837 }
4838
4839 static void
4840 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4841 machine_mode mode,
4842 const_tree type,
4843 bool named)
4844 {
4845 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4846 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
4847 {
4848 aarch64_layout_arg (pcum_v, mode, type, named);
4849 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4850 != (pcum->aapcs_stack_words != 0));
4851 pcum->aapcs_arg_processed = false;
4852 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4853 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4854 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4855 pcum->aapcs_stack_words = 0;
4856 pcum->aapcs_reg = NULL_RTX;
4857 }
4858 }
4859
4860 bool
4861 aarch64_function_arg_regno_p (unsigned regno)
4862 {
4863 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4864 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4865 }
4866
4867 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4868 PARM_BOUNDARY bits of alignment, but will be given anything up
4869 to STACK_BOUNDARY bits if the type requires it. This makes sure
4870 that both before and after the layout of each argument, the Next
4871 Stacked Argument Address (NSAA) will have a minimum alignment of
4872 8 bytes. */
4873
4874 static unsigned int
4875 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4876 {
4877 bool abi_break;
4878 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4879 &abi_break);
4880 if (abi_break && warn_psabi)
4881 inform (input_location, "parameter passing for argument of type "
4882 "%qT changed in GCC 9.1", type);
4883
4884 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4885 }
4886
4887 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4888
4889 static fixed_size_mode
4890 aarch64_get_reg_raw_mode (int regno)
4891 {
4892 if (TARGET_SVE && FP_REGNUM_P (regno))
4893 /* Don't use the SVE part of the register for __builtin_apply and
4894 __builtin_return. The SVE registers aren't used by the normal PCS,
4895 so using them there would be a waste of time. The PCS extensions
4896 for SVE types are fundamentally incompatible with the
4897 __builtin_return/__builtin_apply interface. */
4898 return as_a <fixed_size_mode> (V16QImode);
4899 return default_get_reg_raw_mode (regno);
4900 }
4901
4902 /* Implement TARGET_FUNCTION_ARG_PADDING.
4903
4904 Small aggregate types are placed in the lowest memory address.
4905
4906 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4907
4908 static pad_direction
4909 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4910 {
4911 /* On little-endian targets, the least significant byte of every stack
4912 argument is passed at the lowest byte address of the stack slot. */
4913 if (!BYTES_BIG_ENDIAN)
4914 return PAD_UPWARD;
4915
4916 /* Otherwise, integral, floating-point and pointer types are padded downward:
4917 the least significant byte of a stack argument is passed at the highest
4918 byte address of the stack slot. */
4919 if (type
4920 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4921 || POINTER_TYPE_P (type))
4922 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4923 return PAD_DOWNWARD;
4924
4925 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4926 return PAD_UPWARD;
4927 }
4928
4929 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4930
4931 It specifies padding for the last (may also be the only)
4932 element of a block move between registers and memory. If
4933 assuming the block is in the memory, padding upward means that
4934 the last element is padded after its highest significant byte,
4935 while in downward padding, the last element is padded at
4936 its least significant byte side.
4937
4938 Small aggregates and small complex types are always padded
4939 upwards.
4940
4941 We don't need to worry about homogeneous floating-point or
4942 short-vector aggregates; their move is not affected by the
4943 padding direction determined here. Regardless of endianness,
4944 each element of such an aggregate is put in the least
4945 significant bits of a fp/simd register.
4946
4947 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4948 register has useful data, and return the opposite if the most
4949 significant byte does. */
4950
4951 bool
4952 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4953 bool first ATTRIBUTE_UNUSED)
4954 {
4955
4956 /* Small composite types are always padded upward. */
4957 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4958 {
4959 HOST_WIDE_INT size;
4960 if (type)
4961 size = int_size_in_bytes (type);
4962 else
4963 /* No frontends can create types with variable-sized modes, so we
4964 shouldn't be asked to pass or return them. */
4965 size = GET_MODE_SIZE (mode).to_constant ();
4966 if (size < 2 * UNITS_PER_WORD)
4967 return true;
4968 }
4969
4970 /* Otherwise, use the default padding. */
4971 return !BYTES_BIG_ENDIAN;
4972 }
4973
4974 static scalar_int_mode
4975 aarch64_libgcc_cmp_return_mode (void)
4976 {
4977 return SImode;
4978 }
4979
4980 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4981
4982 /* We use the 12-bit shifted immediate arithmetic instructions so values
4983 must be multiple of (1 << 12), i.e. 4096. */
4984 #define ARITH_FACTOR 4096
4985
4986 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4987 #error Cannot use simple address calculation for stack probing
4988 #endif
4989
4990 /* The pair of scratch registers used for stack probing. */
4991 #define PROBE_STACK_FIRST_REG R9_REGNUM
4992 #define PROBE_STACK_SECOND_REG R10_REGNUM
4993
4994 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4995 inclusive. These are offsets from the current stack pointer. */
4996
4997 static void
4998 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
4999 {
5000 HOST_WIDE_INT size;
5001 if (!poly_size.is_constant (&size))
5002 {
5003 sorry ("stack probes for SVE frames");
5004 return;
5005 }
5006
5007 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
5008
5009 /* See the same assertion on PROBE_INTERVAL above. */
5010 gcc_assert ((first % ARITH_FACTOR) == 0);
5011
5012 /* See if we have a constant small number of probes to generate. If so,
5013 that's the easy case. */
5014 if (size <= PROBE_INTERVAL)
5015 {
5016 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
5017
5018 emit_set_insn (reg1,
5019 plus_constant (Pmode,
5020 stack_pointer_rtx, -(first + base)));
5021 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
5022 }
5023
5024 /* The run-time loop is made up of 8 insns in the generic case while the
5025 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
5026 else if (size <= 4 * PROBE_INTERVAL)
5027 {
5028 HOST_WIDE_INT i, rem;
5029
5030 emit_set_insn (reg1,
5031 plus_constant (Pmode,
5032 stack_pointer_rtx,
5033 -(first + PROBE_INTERVAL)));
5034 emit_stack_probe (reg1);
5035
5036 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
5037 it exceeds SIZE. If only two probes are needed, this will not
5038 generate any code. Then probe at FIRST + SIZE. */
5039 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
5040 {
5041 emit_set_insn (reg1,
5042 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
5043 emit_stack_probe (reg1);
5044 }
5045
5046 rem = size - (i - PROBE_INTERVAL);
5047 if (rem > 256)
5048 {
5049 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5050
5051 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
5052 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
5053 }
5054 else
5055 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
5056 }
5057
5058 /* Otherwise, do the same as above, but in a loop. Note that we must be
5059 extra careful with variables wrapping around because we might be at
5060 the very top (or the very bottom) of the address space and we have
5061 to be able to handle this case properly; in particular, we use an
5062 equality test for the loop condition. */
5063 else
5064 {
5065 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
5066
5067 /* Step 1: round SIZE to the previous multiple of the interval. */
5068
5069 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
5070
5071
5072 /* Step 2: compute initial and final value of the loop counter. */
5073
5074 /* TEST_ADDR = SP + FIRST. */
5075 emit_set_insn (reg1,
5076 plus_constant (Pmode, stack_pointer_rtx, -first));
5077
5078 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
5079 HOST_WIDE_INT adjustment = - (first + rounded_size);
5080 if (! aarch64_uimm12_shift (adjustment))
5081 {
5082 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
5083 true, Pmode);
5084 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
5085 }
5086 else
5087 emit_set_insn (reg2,
5088 plus_constant (Pmode, stack_pointer_rtx, adjustment));
5089
5090 /* Step 3: the loop
5091
5092 do
5093 {
5094 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
5095 probe at TEST_ADDR
5096 }
5097 while (TEST_ADDR != LAST_ADDR)
5098
5099 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
5100 until it is equal to ROUNDED_SIZE. */
5101
5102 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
5103
5104
5105 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
5106 that SIZE is equal to ROUNDED_SIZE. */
5107
5108 if (size != rounded_size)
5109 {
5110 HOST_WIDE_INT rem = size - rounded_size;
5111
5112 if (rem > 256)
5113 {
5114 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5115
5116 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
5117 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
5118 }
5119 else
5120 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
5121 }
5122 }
5123
5124 /* Make sure nothing is scheduled before we are done. */
5125 emit_insn (gen_blockage ());
5126 }
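
/* Worked example (assuming the default 4 KiB probe interval): with
   FIRST = 0 and SIZE = 10000 the second case above probes at SP - 4096
   and SP - 8192, leaving REM = 1808; since that exceeds 256 the final
   probe is emitted at SP - 10000 after a further 4096-byte adjustment,
   so every 4 KiB page in the range gets touched.  */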
5127
5128 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
5129 absolute addresses. */
5130
5131 const char *
5132 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
5133 {
5134 static int labelno = 0;
5135 char loop_lab[32];
5136 rtx xops[2];
5137
5138 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
5139
5140 /* Loop. */
5141 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
5142
5143 HOST_WIDE_INT stack_clash_probe_interval
5144 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5145
5146 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
5147 xops[0] = reg1;
5148 HOST_WIDE_INT interval;
5149 if (flag_stack_clash_protection)
5150 interval = stack_clash_probe_interval;
5151 else
5152 interval = PROBE_INTERVAL;
5153
5154 gcc_assert (aarch64_uimm12_shift (interval));
5155 xops[1] = GEN_INT (interval);
5156
5157 output_asm_insn ("sub\t%0, %0, %1", xops);
5158
5159 /* If doing stack clash protection then we probe up by the ABI-specified
5160 amount. We do this because we're dropping full pages at a time in the
5161 loop. But if we're doing non-stack-clash probing, probe at SP 0. */
5162 if (flag_stack_clash_protection)
5163 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
5164 else
5165 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
5166
5167 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
5168 by this amount for each iteration. */
5169 output_asm_insn ("str\txzr, [%0, %1]", xops);
5170
5171 /* Test if TEST_ADDR == LAST_ADDR. */
5172 xops[1] = reg2;
5173 output_asm_insn ("cmp\t%0, %1", xops);
5174
5175 /* Branch. */
5176 fputs ("\tb.ne\t", asm_out_file);
5177 assemble_name_raw (asm_out_file, loop_lab);
5178 fputc ('\n', asm_out_file);
5179
5180 return "";
5181 }
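
/* The emitted loop looks roughly like this (illustrative; shown for the
   non-stack-clash case with REG1 = x9 and REG2 = x10):

   .LPSRL0:
	sub	x9, x9, 4096
	str	xzr, [x9, 0]
	cmp	x9, x10
	b.ne	.LPSRL0
 */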
5182
5183 /* Emit the probe loop for doing stack clash probes and stack adjustments for
5184 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
5185 of GUARD_SIZE. When a probe is emitted it is done at most
5186 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
5187 at most MIN_PROBE_THRESHOLD. By the end of this function
5188 BASE = BASE - ADJUSTMENT. */
5189
5190 const char *
5191 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
5192 rtx min_probe_threshold, rtx guard_size)
5193 {
5194 /* This function is not allowed to use any instruction generation function
5195 like gen_ and friends. If you do you'll likely ICE during CFG validation,
5196 so instead emit the code you want using output_asm_insn. */
5197 gcc_assert (flag_stack_clash_protection);
5198 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
5199 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
5200
5201 /* The minimum required allocation before the residual requires probing. */
5202 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
5203
5204 /* Clamp the value down to the nearest value that can be used with a cmp. */
5205 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
5206 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
5207
5208 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
5209 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
5210
5211 static int labelno = 0;
5212 char loop_start_lab[32];
5213 char loop_end_lab[32];
5214 rtx xops[2];
5215
5216 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
5217 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
5218
5219 /* Emit loop start label. */
5220 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
5221
5222 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
5223 xops[0] = adjustment;
5224 xops[1] = probe_offset_value_rtx;
5225 output_asm_insn ("cmp\t%0, %1", xops);
5226
5227 /* Branch to end if not enough adjustment to probe. */
5228 fputs ("\tb.lt\t", asm_out_file);
5229 assemble_name_raw (asm_out_file, loop_end_lab);
5230 fputc ('\n', asm_out_file);
5231
5232 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
5233 xops[0] = base;
5234 xops[1] = probe_offset_value_rtx;
5235 output_asm_insn ("sub\t%0, %0, %1", xops);
5236
5237 /* Probe at BASE. */
5238 xops[1] = const0_rtx;
5239 output_asm_insn ("str\txzr, [%0, %1]", xops);
5240
5241 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
5242 xops[0] = adjustment;
5243 xops[1] = probe_offset_value_rtx;
5244 output_asm_insn ("sub\t%0, %0, %1", xops);
5245
5246 /* Branch to start if still more bytes to allocate. */
5247 fputs ("\tb\t", asm_out_file);
5248 assemble_name_raw (asm_out_file, loop_start_lab);
5249 fputc ('\n', asm_out_file);
5250
5251 /* No probe needed; leave the loop and apply the remaining adjustment. */
5252 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
5253
5254 /* BASE = BASE - ADJUSTMENT. */
5255 xops[0] = base;
5256 xops[1] = adjustment;
5257 output_asm_insn ("sub\t%0, %0, %1", xops);
5258 return "";
5259 }
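
/* Illustrative only (assuming the default 64KB guard, for which the
   clamped residual guard works out to 61440 bytes, and assuming the
   caller passed EP0/x12 as the adjustment register and sp as BASE):

	.SVLPSPL0:
	cmp	x12, #61440		// enough left to need a probe?
	b.lt	.SVLPEND0
	sub	sp, sp, #61440		// drop one guard-sized block
	str	xzr, [sp, 0]		// probe it
	sub	x12, x12, #61440
	b	.SVLPSPL0
	.SVLPEND0:
	sub	sp, sp, x12  */		// allocate whatever is left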
5260
5261 /* Determine whether a frame chain needs to be generated. */
5262 static bool
5263 aarch64_needs_frame_chain (void)
5264 {
5265 /* Force a frame chain for EH returns so the return address is at FP+8. */
5266 if (frame_pointer_needed || crtl->calls_eh_return)
5267 return true;
5268
5269 /* A leaf function cannot have calls or write LR. */
5270 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
5271
5272 /* Don't use a frame chain in leaf functions if leaf frame pointers
5273 are disabled. */
5274 if (flag_omit_leaf_frame_pointer && is_leaf)
5275 return false;
5276
5277 return aarch64_use_frame_pointer;
5278 }
5279
5280 /* Mark the registers that need to be saved by the callee and calculate
5281 the size of the callee-saved registers area and frame record (both FP
5282 and LR may be omitted). */
5283 static void
5284 aarch64_layout_frame (void)
5285 {
5286 HOST_WIDE_INT offset = 0;
5287 int regno, last_fp_reg = INVALID_REGNUM;
5288 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5289
5290 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
5291
5292 /* Adjust the outgoing arguments size if required. Keep it in sync with what
5293 the mid-end is doing. */
5294 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
5295
5296 #define SLOT_NOT_REQUIRED (-2)
5297 #define SLOT_REQUIRED (-1)
5298
5299 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
5300 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
5301
5302 /* If this is a non-leaf simd function with calls we assume that
5303 at least one of those calls is to a non-simd function and thus
5304 we must save V8 to V23 in the prologue. */
5305
5306 if (simd_function && !crtl->is_leaf)
5307 {
5308 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5309 if (FP_SIMD_SAVED_REGNUM_P (regno))
5310 df_set_regs_ever_live (regno, true);
5311 }
5312
5313 /* First mark all the registers that really need to be saved... */
5314 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5315 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5316
5317 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5318 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5319
5320 /* ... that includes the eh data registers (if needed)... */
5321 if (crtl->calls_eh_return)
5322 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
5323 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
5324 = SLOT_REQUIRED;
5325
5326 /* ... and any callee saved register that dataflow says is live. */
5327 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5328 if (df_regs_ever_live_p (regno)
5329 && (regno == R30_REGNUM
5330 || !call_used_regs[regno]))
5331 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5332
5333 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5334 if (df_regs_ever_live_p (regno)
5335 && (!call_used_regs[regno]
5336 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
5337 {
5338 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5339 last_fp_reg = regno;
5340 }
5341
5342 if (cfun->machine->frame.emit_frame_chain)
5343 {
5344 /* FP and LR are placed in the linkage record. */
5345 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
5346 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
5347 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
5348 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
5349 offset = 2 * UNITS_PER_WORD;
5350 }
5351
5352 /* With stack-clash, LR must be saved in non-leaf functions. */
5353 gcc_assert (crtl->is_leaf
5354 || (cfun->machine->frame.reg_offset[R30_REGNUM]
5355 != SLOT_NOT_REQUIRED));
5356
5357 /* Now assign stack slots for them. */
5358 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5359 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5360 {
5361 cfun->machine->frame.reg_offset[regno] = offset;
5362 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5363 cfun->machine->frame.wb_candidate1 = regno;
5364 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
5365 cfun->machine->frame.wb_candidate2 = regno;
5366 offset += UNITS_PER_WORD;
5367 }
5368
5369 HOST_WIDE_INT max_int_offset = offset;
5370 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5371 bool has_align_gap = offset != max_int_offset;
5372
5373 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5374 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5375 {
5376 /* If there is an alignment gap between integer and fp callee-saves,
5377 allocate the last fp register to it if possible. */
5378 if (regno == last_fp_reg
5379 && has_align_gap
5380 && !simd_function
5381 && (offset & 8) == 0)
5382 {
5383 cfun->machine->frame.reg_offset[regno] = max_int_offset;
5384 break;
5385 }
5386
5387 cfun->machine->frame.reg_offset[regno] = offset;
5388 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5389 cfun->machine->frame.wb_candidate1 = regno;
5390 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
5391 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
5392 cfun->machine->frame.wb_candidate2 = regno;
5393 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
5394 }
5395
5396 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5397
5398 cfun->machine->frame.saved_regs_size = offset;
5399
5400 HOST_WIDE_INT varargs_and_saved_regs_size
5401 = offset + cfun->machine->frame.saved_varargs_size;
5402
5403 cfun->machine->frame.hard_fp_offset
5404 = aligned_upper_bound (varargs_and_saved_regs_size
5405 + get_frame_size (),
5406 STACK_BOUNDARY / BITS_PER_UNIT);
5407
5408 /* Both these values are already aligned. */
5409 gcc_assert (multiple_p (crtl->outgoing_args_size,
5410 STACK_BOUNDARY / BITS_PER_UNIT));
5411 cfun->machine->frame.frame_size
5412 = (cfun->machine->frame.hard_fp_offset
5413 + crtl->outgoing_args_size);
5414
5415 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
5416
5417 cfun->machine->frame.initial_adjust = 0;
5418 cfun->machine->frame.final_adjust = 0;
5419 cfun->machine->frame.callee_adjust = 0;
5420 cfun->machine->frame.callee_offset = 0;
5421
5422 HOST_WIDE_INT max_push_offset = 0;
5423 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
5424 max_push_offset = 512;
5425 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
5426 max_push_offset = 256;
5427
5428 HOST_WIDE_INT const_size, const_fp_offset;
5429 if (cfun->machine->frame.frame_size.is_constant (&const_size)
5430 && const_size < max_push_offset
5431 && known_eq (crtl->outgoing_args_size, 0))
5432 {
5433 /* Simple, small frame with no outgoing arguments:
5434 stp reg1, reg2, [sp, -frame_size]!
5435 stp reg3, reg4, [sp, 16] */
5436 cfun->machine->frame.callee_adjust = const_size;
5437 }
5438 else if (known_lt (crtl->outgoing_args_size
5439 + cfun->machine->frame.saved_regs_size, 512)
5440 && !(cfun->calls_alloca
5441 && known_lt (cfun->machine->frame.hard_fp_offset,
5442 max_push_offset)))
5443 {
5444 /* Frame with small outgoing arguments:
5445 sub sp, sp, frame_size
5446 stp reg1, reg2, [sp, outgoing_args_size]
5447 stp reg3, reg4, [sp, outgoing_args_size + 16] */
5448 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
5449 cfun->machine->frame.callee_offset
5450 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
5451 }
5452 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
5453 && const_fp_offset < max_push_offset)
5454 {
5455 /* Frame with large outgoing arguments but a small local area:
5456 stp reg1, reg2, [sp, -hard_fp_offset]!
5457 stp reg3, reg4, [sp, 16]
5458 sub sp, sp, outgoing_args_size */
5459 cfun->machine->frame.callee_adjust = const_fp_offset;
5460 cfun->machine->frame.final_adjust
5461 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
5462 }
5463 else
5464 {
5465 /* Frame with large local area and outgoing arguments using frame pointer:
5466 sub sp, sp, hard_fp_offset
5467 stp x29, x30, [sp, 0]
5468 add x29, sp, 0
5469 stp reg3, reg4, [sp, 16]
5470 sub sp, sp, outgoing_args_size */
5471 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
5472 cfun->machine->frame.final_adjust
5473 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
5474 }
5475
5476 cfun->machine->frame.laid_out = true;
5477 }
5478
5479 /* Return true if the register REGNO is saved on entry to
5480 the current function. */
5481
5482 static bool
5483 aarch64_register_saved_on_entry (int regno)
5484 {
5485 return cfun->machine->frame.reg_offset[regno] >= 0;
5486 }
5487
5488 /* Return the next register up from REGNO up to LIMIT for the callee
5489 to save. */
5490
5491 static unsigned
5492 aarch64_next_callee_save (unsigned regno, unsigned limit)
5493 {
5494 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
5495 regno ++;
5496 return regno;
5497 }
5498
5499 /* Push the register number REGNO of mode MODE to the stack with write-back
5500 adjusting the stack by ADJUSTMENT. */
5501
5502 static void
5503 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
5504 HOST_WIDE_INT adjustment)
5505 {
5506 rtx base_rtx = stack_pointer_rtx;
5507 rtx insn, reg, mem;
5508
5509 reg = gen_rtx_REG (mode, regno);
5510 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
5511 plus_constant (Pmode, base_rtx, -adjustment));
5512 mem = gen_frame_mem (mode, mem);
5513
5514 insn = emit_move_insn (mem, reg);
5515 RTX_FRAME_RELATED_P (insn) = 1;
5516 }
5517
5518 /* Generate and return an instruction to store the pair of registers
5519 REG and REG2 of mode MODE to location BASE with write-back adjusting
5520 the stack location BASE by ADJUSTMENT. */
5521
5522 static rtx
5523 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5524 HOST_WIDE_INT adjustment)
5525 {
5526 switch (mode)
5527 {
5528 case E_DImode:
5529 return gen_storewb_pairdi_di (base, base, reg, reg2,
5530 GEN_INT (-adjustment),
5531 GEN_INT (UNITS_PER_WORD - adjustment));
5532 case E_DFmode:
5533 return gen_storewb_pairdf_di (base, base, reg, reg2,
5534 GEN_INT (-adjustment),
5535 GEN_INT (UNITS_PER_WORD - adjustment));
5536 case E_TFmode:
5537 return gen_storewb_pairtf_di (base, base, reg, reg2,
5538 GEN_INT (-adjustment),
5539 GEN_INT (UNITS_PER_VREG - adjustment));
5540 default:
5541 gcc_unreachable ();
5542 }
5543 }
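
/* For example (illustrative), with MODE == DImode, REG == x29, REG2 == x30,
   BASE == sp and ADJUSTMENT == 16, the pattern returned above typically
   assembles to the pre-indexed store pair

	stp	x29, x30, [sp, -16]!  */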
5544
5545 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
5546 stack pointer by ADJUSTMENT. */
5547
5548 static void
5549 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
5550 {
5551 rtx_insn *insn;
5552 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5553
5554 if (regno2 == INVALID_REGNUM)
5555 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
5556
5557 rtx reg1 = gen_rtx_REG (mode, regno1);
5558 rtx reg2 = gen_rtx_REG (mode, regno2);
5559
5560 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
5561 reg2, adjustment));
5562 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
5563 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5564 RTX_FRAME_RELATED_P (insn) = 1;
5565 }
5566
5567 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
5568 adjusting it by ADJUSTMENT afterwards. */
5569
5570 static rtx
5571 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5572 HOST_WIDE_INT adjustment)
5573 {
5574 switch (mode)
5575 {
5576 case E_DImode:
5577 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
5578 GEN_INT (UNITS_PER_WORD));
5579 case E_DFmode:
5580 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
5581 GEN_INT (UNITS_PER_WORD));
5582 case E_TFmode:
5583 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
5584 GEN_INT (UNITS_PER_VREG));
5585 default:
5586 gcc_unreachable ();
5587 }
5588 }
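
/* For example (illustrative), with MODE == DImode, BASE == sp and
   ADJUSTMENT == 16 the pattern returned above typically assembles to the
   post-indexed load pair

	ldp	x29, x30, [sp], 16

   i.e. the inverse of the store-with-writeback used in the prologue.  */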
5589
5590 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
5591 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
5592 into CFI_OPS. */
5593
5594 static void
5595 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
5596 rtx *cfi_ops)
5597 {
5598 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5599 rtx reg1 = gen_rtx_REG (mode, regno1);
5600
5601 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
5602
5603 if (regno2 == INVALID_REGNUM)
5604 {
5605 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
5606 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
5607 emit_move_insn (reg1, gen_frame_mem (mode, mem));
5608 }
5609 else
5610 {
5611 rtx reg2 = gen_rtx_REG (mode, regno2);
5612 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5613 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
5614 reg2, adjustment));
5615 }
5616 }
5617
5618 /* Generate and return a store pair instruction of mode MODE to store
5619 register REG1 to MEM1 and register REG2 to MEM2. */
5620
5621 static rtx
5622 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
5623 rtx reg2)
5624 {
5625 switch (mode)
5626 {
5627 case E_DImode:
5628 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
5629
5630 case E_DFmode:
5631 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
5632
5633 case E_TFmode:
5634 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
5635
5636 default:
5637 gcc_unreachable ();
5638 }
5639 }
5640
5641 /* Generate and return a load pair instruction of mode MODE to load register
5642 REG1 from MEM1 and register REG2 from MEM2. */
5643
5644 static rtx
5645 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
5646 rtx mem2)
5647 {
5648 switch (mode)
5649 {
5650 case E_DImode:
5651 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
5652
5653 case E_DFmode:
5654 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
5655
5656 case E_TFmode:
5657 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
5658
5659 default:
5660 gcc_unreachable ();
5661 }
5662 }
5663
5664 /* Return TRUE if return address signing should be enabled for the current
5665 function, otherwise return FALSE. */
5666
5667 bool
5668 aarch64_return_address_signing_enabled (void)
5669 {
5670 /* This function should only be called after the frame has been laid out. */
5671 gcc_assert (cfun->machine->frame.laid_out);
5672
5673 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
5674 if its LR is pushed onto stack. */
5675 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
5676 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
5677 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
5678 }
5679
5680 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
5681 bool
5682 aarch64_bti_enabled (void)
5683 {
5684 return (aarch64_enable_bti == 1);
5685 }
5686
5687 /* Emit code to save the callee-saved registers from register number START
5688 to LIMIT to the stack at the location starting at offset START_OFFSET,
5689 skipping any write-back candidates if SKIP_WB is true. */
5690
5691 static void
5692 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
5693 unsigned start, unsigned limit, bool skip_wb)
5694 {
5695 rtx_insn *insn;
5696 unsigned regno;
5697 unsigned regno2;
5698
5699 for (regno = aarch64_next_callee_save (start, limit);
5700 regno <= limit;
5701 regno = aarch64_next_callee_save (regno + 1, limit))
5702 {
5703 rtx reg, mem;
5704 poly_int64 offset;
5705 int offset_diff;
5706
5707 if (skip_wb
5708 && (regno == cfun->machine->frame.wb_candidate1
5709 || regno == cfun->machine->frame.wb_candidate2))
5710 continue;
5711
5712 if (cfun->machine->reg_is_wrapped_separately[regno])
5713 continue;
5714
5715 reg = gen_rtx_REG (mode, regno);
5716 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5717 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5718 offset));
5719
5720 regno2 = aarch64_next_callee_save (regno + 1, limit);
5721 offset_diff = cfun->machine->frame.reg_offset[regno2]
5722 - cfun->machine->frame.reg_offset[regno];
5723
5724 if (regno2 <= limit
5725 && !cfun->machine->reg_is_wrapped_separately[regno2]
5726 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5727 {
5728 rtx reg2 = gen_rtx_REG (mode, regno2);
5729 rtx mem2;
5730
5731 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5732 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5733 offset));
5734 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
5735 reg2));
5736
5737 /* The first part of a frame-related parallel insn is
5738 always assumed to be relevant to the frame
5739 calculations; subsequent parts are only
5740 frame-related if explicitly marked. */
5741 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5742 regno = regno2;
5743 }
5744 else
5745 insn = emit_move_insn (mem, reg);
5746
5747 RTX_FRAME_RELATED_P (insn) = 1;
5748 }
5749 }
5750
5751 /* Emit code to restore the callee registers of mode MODE from register
5752 number START up to and including LIMIT. Restore from the stack offset
5753 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
5754 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
5755
5756 static void
5757 aarch64_restore_callee_saves (machine_mode mode,
5758 poly_int64 start_offset, unsigned start,
5759 unsigned limit, bool skip_wb, rtx *cfi_ops)
5760 {
5761 rtx base_rtx = stack_pointer_rtx;
5762 unsigned regno;
5763 unsigned regno2;
5764 poly_int64 offset;
5765
5766 for (regno = aarch64_next_callee_save (start, limit);
5767 regno <= limit;
5768 regno = aarch64_next_callee_save (regno + 1, limit))
5769 {
5770 if (cfun->machine->reg_is_wrapped_separately[regno])
5771 continue;
5772
5773 rtx reg, mem;
5774 int offset_diff;
5775
5776 if (skip_wb
5777 && (regno == cfun->machine->frame.wb_candidate1
5778 || regno == cfun->machine->frame.wb_candidate2))
5779 continue;
5780
5781 reg = gen_rtx_REG (mode, regno);
5782 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5783 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5784
5785 regno2 = aarch64_next_callee_save (regno + 1, limit);
5786 offset_diff = cfun->machine->frame.reg_offset[regno2]
5787 - cfun->machine->frame.reg_offset[regno];
5788
5789 if (regno2 <= limit
5790 && !cfun->machine->reg_is_wrapped_separately[regno2]
5791 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5792 {
5793 rtx reg2 = gen_rtx_REG (mode, regno2);
5794 rtx mem2;
5795
5796 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5797 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5798 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5799
5800 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5801 regno = regno2;
5802 }
5803 else
5804 emit_move_insn (reg, mem);
5805 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
5806 }
5807 }
5808
5809 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5810 of MODE. */
5811
5812 static inline bool
5813 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5814 {
5815 HOST_WIDE_INT multiple;
5816 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5817 && IN_RANGE (multiple, -8, 7));
5818 }
5819
5820 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5821 of MODE. */
5822
5823 static inline bool
5824 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5825 {
5826 HOST_WIDE_INT multiple;
5827 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5828 && IN_RANGE (multiple, 0, 63));
5829 }
5830
5831 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5832 of MODE. */
5833
5834 bool
5835 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5836 {
5837 HOST_WIDE_INT multiple;
5838 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5839 && IN_RANGE (multiple, -64, 63));
5840 }
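
/* For example, with DImode this accepts byte offsets that are multiples
   of 8 in the range [-512, 504], which corresponds to the scaled
   immediate range of LDP/STP for X registers.  */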
5841
5842 /* Return true if OFFSET is a signed 9-bit value. */
5843
5844 bool
5845 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5846 poly_int64 offset)
5847 {
5848 HOST_WIDE_INT const_offset;
5849 return (offset.is_constant (&const_offset)
5850 && IN_RANGE (const_offset, -256, 255));
5851 }
5852
5853 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5854 of MODE. */
5855
5856 static inline bool
5857 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5858 {
5859 HOST_WIDE_INT multiple;
5860 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5861 && IN_RANGE (multiple, -256, 255));
5862 }
5863
5864 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5865 of MODE. */
5866
5867 static inline bool
5868 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5869 {
5870 HOST_WIDE_INT multiple;
5871 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5872 && IN_RANGE (multiple, 0, 4095));
5873 }
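
/* For example, with DImode this accepts byte offsets that are multiples
   of 8 in the range [0, 32760], i.e. the unsigned scaled immediate range
   of a single LDR/STR of an X register.  */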
5874
5875 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5876
5877 static sbitmap
5878 aarch64_get_separate_components (void)
5879 {
5880 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5881 bitmap_clear (components);
5882
5883 /* The registers we need saved to the frame. */
5884 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5885 if (aarch64_register_saved_on_entry (regno))
5886 {
5887 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5888 if (!frame_pointer_needed)
5889 offset += cfun->machine->frame.frame_size
5890 - cfun->machine->frame.hard_fp_offset;
5891 /* Check that we can access the stack slot of the register with one
5892 direct load with no adjustments needed. */
5893 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5894 bitmap_set_bit (components, regno);
5895 }
5896
5897 /* Don't mess with the hard frame pointer. */
5898 if (frame_pointer_needed)
5899 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5900
5901 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5902 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5903 /* If registers have been chosen to be stored/restored with
5904 writeback don't interfere with them to avoid having to output explicit
5905 stack adjustment instructions. */
5906 if (reg2 != INVALID_REGNUM)
5907 bitmap_clear_bit (components, reg2);
5908 if (reg1 != INVALID_REGNUM)
5909 bitmap_clear_bit (components, reg1);
5910
5911 bitmap_clear_bit (components, LR_REGNUM);
5912 bitmap_clear_bit (components, SP_REGNUM);
5913
5914 return components;
5915 }
5916
5917 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5918
5919 static sbitmap
5920 aarch64_components_for_bb (basic_block bb)
5921 {
5922 bitmap in = DF_LIVE_IN (bb);
5923 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5924 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5925 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5926
5927 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5928 bitmap_clear (components);
5929
5930 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5931 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5932 if ((!call_used_regs[regno]
5933 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5934 && (bitmap_bit_p (in, regno)
5935 || bitmap_bit_p (gen, regno)
5936 || bitmap_bit_p (kill, regno)))
5937 {
5938 unsigned regno2, offset, offset2;
5939 bitmap_set_bit (components, regno);
5940
5941 /* If there is a callee-save at an adjacent offset, add it too
5942 to increase the use of LDP/STP. */
5943 offset = cfun->machine->frame.reg_offset[regno];
5944 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5945
5946 if (regno2 <= LAST_SAVED_REGNUM)
5947 {
5948 offset2 = cfun->machine->frame.reg_offset[regno2];
5949 if ((offset & ~8) == (offset2 & ~8))
5950 bitmap_set_bit (components, regno2);
5951 }
5952 }
5953
5954 return components;
5955 }
5956
5957 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5958 Nothing to do for aarch64. */
5959
5960 static void
5961 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5962 {
5963 }
5964
5965 /* Return the next set bit in BMP from START onwards. Return the total number
5966 of bits in BMP if no set bit is found at or after START. */
5967
5968 static unsigned int
5969 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5970 {
5971 unsigned int nbits = SBITMAP_SIZE (bmp);
5972 if (start == nbits)
5973 return start;
5974
5975 gcc_assert (start < nbits);
5976 for (unsigned int i = start; i < nbits; i++)
5977 if (bitmap_bit_p (bmp, i))
5978 return i;
5979
5980 return nbits;
5981 }
5982
5983 /* Do the work for aarch64_emit_prologue_components and
5984 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5985 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5986 for these components or the epilogue sequence. That is, it determines
5987 whether we should emit stores or loads and what kind of CFA notes to attach
5988 to the insns. Otherwise the logic for the two sequences is very
5989 similar. */
5990
5991 static void
5992 aarch64_process_components (sbitmap components, bool prologue_p)
5993 {
5994 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
5995 ? HARD_FRAME_POINTER_REGNUM
5996 : STACK_POINTER_REGNUM);
5997
5998 unsigned last_regno = SBITMAP_SIZE (components);
5999 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
6000 rtx_insn *insn = NULL;
6001
6002 while (regno != last_regno)
6003 {
6004 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
6005 so DFmode for the vector registers is enough. For simd functions
6006 we want to save the low 128 bits. */
6007 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
6008
6009 rtx reg = gen_rtx_REG (mode, regno);
6010 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6011 if (!frame_pointer_needed)
6012 offset += cfun->machine->frame.frame_size
6013 - cfun->machine->frame.hard_fp_offset;
6014 rtx addr = plus_constant (Pmode, ptr_reg, offset);
6015 rtx mem = gen_frame_mem (mode, addr);
6016
6017 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
6018 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
6019 /* No more registers to handle after REGNO.
6020 Emit a single save/restore and exit. */
6021 if (regno2 == last_regno)
6022 {
6023 insn = emit_insn (set);
6024 RTX_FRAME_RELATED_P (insn) = 1;
6025 if (prologue_p)
6026 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6027 else
6028 add_reg_note (insn, REG_CFA_RESTORE, reg);
6029 break;
6030 }
6031
6032 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
6033 /* The next register is not of the same class or its offset is not
6034 mergeable with the current one into a pair. */
6035 if (!satisfies_constraint_Ump (mem)
6036 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
6037 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
6038 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
6039 GET_MODE_SIZE (mode)))
6040 {
6041 insn = emit_insn (set);
6042 RTX_FRAME_RELATED_P (insn) = 1;
6043 if (prologue_p)
6044 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6045 else
6046 add_reg_note (insn, REG_CFA_RESTORE, reg);
6047
6048 regno = regno2;
6049 continue;
6050 }
6051
6052 /* REGNO2 can be saved/restored in a pair with REGNO. */
6053 rtx reg2 = gen_rtx_REG (mode, regno2);
6054 if (!frame_pointer_needed)
6055 offset2 += cfun->machine->frame.frame_size
6056 - cfun->machine->frame.hard_fp_offset;
6057 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
6058 rtx mem2 = gen_frame_mem (mode, addr2);
6059 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
6060 : gen_rtx_SET (reg2, mem2);
6061
6062 if (prologue_p)
6063 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
6064 else
6065 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
6066
6067 RTX_FRAME_RELATED_P (insn) = 1;
6068 if (prologue_p)
6069 {
6070 add_reg_note (insn, REG_CFA_OFFSET, set);
6071 add_reg_note (insn, REG_CFA_OFFSET, set2);
6072 }
6073 else
6074 {
6075 add_reg_note (insn, REG_CFA_RESTORE, reg);
6076 add_reg_note (insn, REG_CFA_RESTORE, reg2);
6077 }
6078
6079 regno = aarch64_get_next_set_bit (components, regno2 + 1);
6080 }
6081 }
6082
6083 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
6084
6085 static void
6086 aarch64_emit_prologue_components (sbitmap components)
6087 {
6088 aarch64_process_components (components, true);
6089 }
6090
6091 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
6092
6093 static void
6094 aarch64_emit_epilogue_components (sbitmap components)
6095 {
6096 aarch64_process_components (components, false);
6097 }
6098
6099 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
6100
6101 static void
6102 aarch64_set_handled_components (sbitmap components)
6103 {
6104 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6105 if (bitmap_bit_p (components, regno))
6106 cfun->machine->reg_is_wrapped_separately[regno] = true;
6107 }
6108
6109 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
6110 determine the probe offset for alloca. */
6111
6112 static HOST_WIDE_INT
6113 aarch64_stack_clash_protection_alloca_probe_range (void)
6114 {
6115 return STACK_CLASH_CALLER_GUARD;
6116 }
6117
6118
6119 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
6120 registers. If POLY_SIZE is not large enough to require a probe this function
6121 will only adjust the stack. When allocating the stack space
6122 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
6123 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
6124 arguments. If we are then we ensure that any allocation larger than the ABI
6125 defined buffer needs a probe so that the invariant of having a 1KB buffer is
6126 maintained.
6127
6128 We emit barriers after each stack adjustment to prevent optimizations from
6129 breaking the invariant that we never drop the stack more than a page. This
6130 invariant is needed to make it easier to correctly handle asynchronous
6131 events: if we were to allow the stack to be dropped by more than a page
6132 and then emit multiple probes afterwards, a signal taken somewhere in
6133 between would leave the signal handler not knowing the state of the stack
6134 and unable to make any assumptions about which pages have been probed. */
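
/* As a concrete (and purely illustrative) sketch of the scheme above,
   assuming the default 64KB guard and 1KB caller guard, a constant
   initial allocation of 132KB would be emitted roughly as:

	sub	sp, sp, #65536
	str	xzr, [sp, 1024]		// probe the first page
	sub	sp, sp, #65536
	str	xzr, [sp, 1024]		// probe the second page
	sub	sp, sp, #4096		// residual; below the probing
					// threshold, so no probe needed

   with scheduling barriers between the adjustments.  Larger constant
   allocations fall back to the probe loop emitted by
   aarch64_output_probe_stack_range, and SVE frames use
   aarch64_output_probe_sve_stack_clash above.  */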
6135
6136 static void
6137 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
6138 poly_int64 poly_size,
6139 bool frame_related_p,
6140 bool final_adjustment_p)
6141 {
6142 HOST_WIDE_INT guard_size
6143 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6144 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6145 /* When doing the final adjustment for the outgoing argument size we can't
6146 assume that LR was saved at position 0. So subtract its offset from the
6147 ABI safe buffer so that we don't accidentally allow an adjustment that
6148 would result in an allocation larger than the ABI buffer without
6149 probing. */
6150 HOST_WIDE_INT min_probe_threshold
6151 = final_adjustment_p
6152 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
6153 : guard_size - guard_used_by_caller;
6154
6155 poly_int64 frame_size = cfun->machine->frame.frame_size;
6156
6157 /* We should always have a positive probe threshold. */
6158 gcc_assert (min_probe_threshold > 0);
6159
6160 if (flag_stack_clash_protection && !final_adjustment_p)
6161 {
6162 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6163 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6164
6165 if (known_eq (frame_size, 0))
6166 {
6167 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
6168 }
6169 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
6170 && known_lt (final_adjust, guard_used_by_caller))
6171 {
6172 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
6173 }
6174 }
6175
6176 /* If SIZE is not large enough to require probing, just adjust the stack and
6177 exit. */
6178 if (known_lt (poly_size, min_probe_threshold)
6179 || !flag_stack_clash_protection)
6180 {
6181 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
6182 return;
6183 }
6184
6185 HOST_WIDE_INT size;
6186 /* Handle the SVE non-constant case first. */
6187 if (!poly_size.is_constant (&size))
6188 {
6189 if (dump_file)
6190 {
6191 fprintf (dump_file, "Stack clash SVE prologue: ");
6192 print_dec (poly_size, dump_file);
6193 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
6194 }
6195
6196 /* First calculate the number of bytes we're actually spilling. */
6197 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
6198 poly_size, temp1, temp2, false, true);
6199
6200 rtx_insn *insn = get_last_insn ();
6201
6202 if (frame_related_p)
6203 {
6204 /* This is done to provide unwinding information for the stack
6205 adjustments we're about to do. However, to prevent the optimizers
6206 from removing the R11 move and leaving the CFA note (which would be
6207 very wrong), we tie the old and new stack pointer together.
6208 The tie will expand to nothing but the optimizers will not touch
6209 the instruction. */
6210 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6211 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
6212 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
6213
6214 /* We want the CFA independent of the stack pointer for the
6215 duration of the loop. */
6216 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
6217 RTX_FRAME_RELATED_P (insn) = 1;
6218 }
6219
6220 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
6221 rtx guard_const = gen_int_mode (guard_size, Pmode);
6222
6223 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
6224 stack_pointer_rtx, temp1,
6225 probe_const, guard_const));
6226
6227 /* Now reset the CFA register if needed. */
6228 if (frame_related_p)
6229 {
6230 add_reg_note (insn, REG_CFA_DEF_CFA,
6231 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
6232 gen_int_mode (poly_size, Pmode)));
6233 RTX_FRAME_RELATED_P (insn) = 1;
6234 }
6235
6236 return;
6237 }
6238
6239 if (dump_file)
6240 fprintf (dump_file,
6241 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
6242 " bytes, probing will be required.\n", size);
6243
6244 /* Round size to the nearest multiple of guard_size, and calculate the
6245 residual as the difference between the original size and the rounded
6246 size. */
6247 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
6248 HOST_WIDE_INT residual = size - rounded_size;
6249
6250 /* We can handle a small number of allocations/probes inline. Otherwise
6251 punt to a loop. */
6252 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
6253 {
6254 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
6255 {
6256 aarch64_sub_sp (NULL, temp2, guard_size, true);
6257 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6258 guard_used_by_caller));
6259 emit_insn (gen_blockage ());
6260 }
6261 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
6262 }
6263 else
6264 {
6265 /* Compute the ending address. */
6266 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
6267 temp1, NULL, false, true);
6268 rtx_insn *insn = get_last_insn ();
6269
6270 /* For the initial allocation, we don't have a frame pointer
6271 set up, so we always need CFI notes. If we're doing the
6272 final allocation, then we may have a frame pointer, in which
6273 case it is the CFA, otherwise we need CFI notes.
6274
6275 We can determine which allocation we are doing by looking at
6276 the value of FRAME_RELATED_P since the final allocations are not
6277 frame related. */
6278 if (frame_related_p)
6279 {
6280 /* We want the CFA independent of the stack pointer for the
6281 duration of the loop. */
6282 add_reg_note (insn, REG_CFA_DEF_CFA,
6283 plus_constant (Pmode, temp1, rounded_size));
6284 RTX_FRAME_RELATED_P (insn) = 1;
6285 }
6286
6287 /* This allocates and probes the stack. Note that this re-uses some of
6288 the existing Ada stack protection code. However we are guaranteed not
6289 to enter the non-loop or residual branches of that code.
6290
6291 The non-loop part won't be entered because if our allocation amount
6292 doesn't require a loop, the case above would handle it.
6293
6294 The residual amount won't be entered because TEMP1 is a multiple of
6295 the allocation size. The residual will always be 0. As such, the only
6296 part we are actually using from that code is the loop setup. The
6297 actual probing is done in aarch64_output_probe_stack_range. */
6298 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
6299 stack_pointer_rtx, temp1));
6300
6301 /* Now reset the CFA register if needed. */
6302 if (frame_related_p)
6303 {
6304 add_reg_note (insn, REG_CFA_DEF_CFA,
6305 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
6306 RTX_FRAME_RELATED_P (insn) = 1;
6307 }
6308
6309 emit_insn (gen_blockage ());
6310 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
6311 }
6312
6313 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
6314 be probed. This maintains the requirement that each page is probed at
6315 least once. For initial probing we probe only if the allocation is
6316 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
6317 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
6318 GUARD_SIZE. This ensures that for any allocation that is large enough to
6319 trigger a probe here, we'll have at least one, and if they're not large
6320 enough for this code to emit anything for them, the page would have been
6321 probed by the saving of FP/LR either by this function or any callees. If
6322 we don't have any callees then we won't have more stack adjustments and so
6323 are still safe. */
6324 if (residual)
6325 {
6326 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
6327 /* If we're doing final adjustments, and we've done any full page
6328 allocations then any residual needs to be probed. */
6329 if (final_adjustment_p && rounded_size != 0)
6330 min_probe_threshold = 0;
6331 /* If doing a small final adjustment, we always probe at offset 0.
6332 This is done to avoid issues when LR is not at position 0 or when
6333 the final adjustment is smaller than the probing offset. */
6334 else if (final_adjustment_p && rounded_size == 0)
6335 residual_probe_offset = 0;
6336
6337 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
6338 if (residual >= min_probe_threshold)
6339 {
6340 if (dump_file)
6341 fprintf (dump_file,
6342 "Stack clash AArch64 prologue residuals: "
6343 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
6344 "\n", residual);
6345
6346 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6347 residual_probe_offset));
6348 emit_insn (gen_blockage ());
6349 }
6350 }
6351 }
6352
6353 /* Return 1 if the register is used by the epilogue. We need to say the
6354 return register is used, but only after epilogue generation is complete.
6355 Note that in the case of sibcalls, the values "used by the epilogue" are
6356 considered live at the start of the called function.
6357
6358 For SIMD functions we need to return 1 for FP registers that are saved and
6359 restored by a function but are not zero in call_used_regs. If we do not do
6360 this, optimizations may remove the restore of the register.  */
6361
6362 int
6363 aarch64_epilogue_uses (int regno)
6364 {
6365 if (epilogue_completed)
6366 {
6367 if (regno == LR_REGNUM)
6368 return 1;
6369 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
6370 return 1;
6371 }
6372 return 0;
6373 }
6374
6375 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6376 is saved at BASE + OFFSET. */
6377
6378 static void
6379 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
6380 rtx base, poly_int64 offset)
6381 {
6382 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
6383 add_reg_note (insn, REG_CFA_EXPRESSION,
6384 gen_rtx_SET (mem, regno_reg_rtx[reg]));
6385 }
6386
6387 /* AArch64 stack frames generated by this compiler look like:
6388
6389 +-------------------------------+
6390 | |
6391 | incoming stack arguments |
6392 | |
6393 +-------------------------------+
6394 | | <-- incoming stack pointer (aligned)
6395 | callee-allocated save area |
6396 | for register varargs |
6397 | |
6398 +-------------------------------+
6399 | local variables | <-- frame_pointer_rtx
6400 | |
6401 +-------------------------------+
6402 | padding | \
6403 +-------------------------------+ |
6404 | callee-saved registers | | frame.saved_regs_size
6405 +-------------------------------+ |
6406 | LR' | |
6407 +-------------------------------+ |
6408 | FP' | / <- hard_frame_pointer_rtx (aligned)
6409 +-------------------------------+
6410 | dynamic allocation |
6411 +-------------------------------+
6412 | padding |
6413 +-------------------------------+
6414 | outgoing stack arguments | <-- arg_pointer
6415 | |
6416 +-------------------------------+
6417 | | <-- stack_pointer_rtx (aligned)
6418
6419 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
6420 but leave frame_pointer_rtx and hard_frame_pointer_rtx
6421 unchanged.
6422
6423 By default for stack-clash we assume the guard is at least 64KB, but this
6424 value is configurable to either 4KB or 64KB. We also force the guard size to
6425 be the same as the probing interval and both values are kept in sync.
6426
6427 With those assumptions the callee can allocate up to 63KB (or 3KB depending
6428 on the guard size) of stack space without probing.
6429
6430 When probing is needed, we emit a probe at the start of the prologue
6431 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
6432
6433 We have to track how much space has been allocated and the only stores
6434 to the stack we track as implicit probes are the FP/LR stores.
6435
6436 For outgoing arguments we probe if the size is larger than 1KB, such that
6437 the ABI specified buffer is maintained for the next callee.
6438
6439 The following registers are reserved during frame layout and should not be
6440 used for any other purpose:
6441
6442 - r11: Used by stack clash protection when SVE is enabled.
6443 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
6444 - r14 and r15: Used for speculation tracking.
6445 - r16(IP0), r17(IP1): Used by indirect tailcalls.
6446 - r30(LR), r29(FP): Used by standard frame layout.
6447
6448 These registers must be avoided in frame layout related code unless the
6449 explicit intention is to interact with one of the features listed above. */
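
/* For instance (illustrative, not authoritative), a function that needs a
   frame chain, saves x19 and x20, and has a small constant frame with no
   outgoing arguments typically gets a prologue along the lines of:

	stp	x29, x30, [sp, -48]!	// push the frame record, allocate
	mov	x29, sp			// establish the frame chain
	stp	x19, x20, [sp, 16]	// remaining callee-saves

   with the matching epilogue restoring x19/x20 and popping the frame
   record with a post-indexed ldp before returning.  */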
6450
6451 /* Generate the prologue instructions for entry into a function.
6452 Establish the stack frame by decreasing the stack pointer with a
6453 properly calculated size and, if necessary, create a frame record
6454 filled with the values of LR and previous frame pointer. The
6455 current FP is also set up if it is in use. */
6456
6457 void
6458 aarch64_expand_prologue (void)
6459 {
6460 poly_int64 frame_size = cfun->machine->frame.frame_size;
6461 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6462 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6463 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6464 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6465 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6466 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6467 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
6468 rtx_insn *insn;
6469
6470 /* Sign return address for functions. */
6471 if (aarch64_return_address_signing_enabled ())
6472 {
6473 switch (aarch64_ra_sign_key)
6474 {
6475 case AARCH64_KEY_A:
6476 insn = emit_insn (gen_paciasp ());
6477 break;
6478 case AARCH64_KEY_B:
6479 insn = emit_insn (gen_pacibsp ());
6480 break;
6481 default:
6482 gcc_unreachable ();
6483 }
6484 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6485 RTX_FRAME_RELATED_P (insn) = 1;
6486 }
6487
6488 if (flag_stack_usage_info)
6489 current_function_static_stack_size = constant_lower_bound (frame_size);
6490
6491 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6492 {
6493 if (crtl->is_leaf && !cfun->calls_alloca)
6494 {
6495 if (maybe_gt (frame_size, PROBE_INTERVAL)
6496 && maybe_gt (frame_size, get_stack_check_protect ()))
6497 aarch64_emit_probe_stack_range (get_stack_check_protect (),
6498 (frame_size
6499 - get_stack_check_protect ()));
6500 }
6501 else if (maybe_gt (frame_size, 0))
6502 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
6503 }
6504
6505 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6506 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6507
6508 /* In theory we should never have both an initial adjustment
6509 and a callee save adjustment. Verify that is the case since the
6510 code below does not handle it for -fstack-clash-protection. */
6511 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
6512
6513 /* Will only probe if the initial adjustment is larger than the guard
6514 less the amount of the guard reserved for use by the caller's
6515 outgoing args. */
6516 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
6517 true, false);
6518
6519 if (callee_adjust != 0)
6520 aarch64_push_regs (reg1, reg2, callee_adjust);
6521
6522 if (emit_frame_chain)
6523 {
6524 poly_int64 reg_offset = callee_adjust;
6525 if (callee_adjust == 0)
6526 {
6527 reg1 = R29_REGNUM;
6528 reg2 = R30_REGNUM;
6529 reg_offset = callee_offset;
6530 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
6531 }
6532 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
6533 stack_pointer_rtx, callee_offset,
6534 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
6535 if (frame_pointer_needed && !frame_size.is_constant ())
6536 {
6537 /* Variable-sized frames need to describe the save slot
6538 address using DW_CFA_expression rather than DW_CFA_offset.
6539 This means that, without taking further action, the
6540 locations of the registers that we've already saved would
6541 remain based on the stack pointer even after we redefine
6542 the CFA based on the frame pointer. We therefore need new
6543 DW_CFA_expressions to re-express the save slots with addresses
6544 based on the frame pointer. */
6545 rtx_insn *insn = get_last_insn ();
6546 gcc_assert (RTX_FRAME_RELATED_P (insn));
6547
6548 /* Add an explicit CFA definition if this was previously
6549 implicit. */
6550 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
6551 {
6552 rtx src = plus_constant (Pmode, stack_pointer_rtx,
6553 callee_offset);
6554 add_reg_note (insn, REG_CFA_ADJUST_CFA,
6555 gen_rtx_SET (hard_frame_pointer_rtx, src));
6556 }
6557
6558 /* Change the save slot expressions for the registers that
6559 we've already saved. */
6560 reg_offset -= callee_offset;
6561 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
6562 reg_offset + UNITS_PER_WORD);
6563 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
6564 reg_offset);
6565 }
6566 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
6567 }
6568
6569 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6570 callee_adjust != 0 || emit_frame_chain);
6571 if (aarch64_simd_decl_p (cfun->decl))
6572 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6573 callee_adjust != 0 || emit_frame_chain);
6574 else
6575 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6576 callee_adjust != 0 || emit_frame_chain);
6577
6578 /* We may need to probe the final adjustment if it is larger than the guard
6579 that is assumed by the callee. */
6580 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
6581 !frame_pointer_needed, true);
6582 }
6583
6584 /* Return TRUE if we can use a simple_return insn.
6585
6586 This function checks whether the callee saved stack is empty, which
6587 means no restore actions are needed. The pro_and_epilogue pass will use
6588 this to check whether the shrink-wrapping optimization is feasible. */
6589
6590 bool
6591 aarch64_use_return_insn_p (void)
6592 {
6593 if (!reload_completed)
6594 return false;
6595
6596 if (crtl->profile)
6597 return false;
6598
6599 return known_eq (cfun->machine->frame.frame_size, 0);
6600 }
6601
6602 /* Return false for non-leaf SIMD functions in order to avoid
6603 shrink-wrapping them, since shrink-wrapping would lose the necessary
6604 save/restore of FP registers. */
6605
6606 bool
6607 aarch64_use_simple_return_insn_p (void)
6608 {
6609 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
6610 return false;
6611
6612 return true;
6613 }
6614
6615 /* Generate the epilogue instructions for returning from a function.
6616 This is almost exactly the reverse of the prolog sequence, except
6617 that we need to insert barriers to avoid scheduling loads that read
6618 from a deallocated stack, and we optimize the unwind records by
6619 emitting them all together if possible. */
6620 void
6621 aarch64_expand_epilogue (bool for_sibcall)
6622 {
6623 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6624 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6625 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6626 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6627 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6628 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6629 rtx cfi_ops = NULL;
6630 rtx_insn *insn;
6631 /* A stack clash protection prologue may not have left EP0_REGNUM or
6632 EP1_REGNUM in a usable state. The same is true for allocations
6633 with an SVE component, since we then need both temporary registers
6634 for each allocation. For stack clash we are in a usable state if
6635 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
6636 HOST_WIDE_INT guard_size
6637 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6638 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6639
6640 /* We can re-use the registers when the allocation amount is smaller than
6641 guard_size - guard_used_by_caller because we won't be doing any probes
6642 then. In such situations the register should remain live with the correct
6643 value. */
6644 bool can_inherit_p = (initial_adjust.is_constant ()
6645 && final_adjust.is_constant ())
6646 && (!flag_stack_clash_protection
6647 || known_lt (initial_adjust,
6648 guard_size - guard_used_by_caller));
6649
6650 /* We need to add memory barrier to prevent read from deallocated stack. */
6651 bool need_barrier_p
6652 = maybe_ne (get_frame_size ()
6653 + cfun->machine->frame.saved_varargs_size, 0);
6654
6655 /* Emit a barrier to prevent loads from a deallocated stack. */
6656 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
6657 || cfun->calls_alloca
6658 || crtl->calls_eh_return)
6659 {
6660 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6661 need_barrier_p = false;
6662 }
6663
6664 /* Restore the stack pointer from the frame pointer if it may not
6665 be the same as the stack pointer. */
6666 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6667 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6668 if (frame_pointer_needed
6669 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
6670 /* If writeback is used when restoring callee-saves, the CFA
6671 is restored on the instruction doing the writeback. */
6672 aarch64_add_offset (Pmode, stack_pointer_rtx,
6673 hard_frame_pointer_rtx, -callee_offset,
6674 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
6675 else
6676 /* The case where we need to re-use the register here is very rare, so
6677 avoid the complicated condition and just always emit a move if the
6678 immediate doesn't fit. */
6679 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
6680
6681 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6682 callee_adjust != 0, &cfi_ops);
6683 if (aarch64_simd_decl_p (cfun->decl))
6684 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6685 callee_adjust != 0, &cfi_ops);
6686 else
6687 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6688 callee_adjust != 0, &cfi_ops);
6689
6690 if (need_barrier_p)
6691 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6692
6693 if (callee_adjust != 0)
6694 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
6695
6696 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
6697 {
6698 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
6699 insn = get_last_insn ();
6700 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
6701 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
6702 RTX_FRAME_RELATED_P (insn) = 1;
6703 cfi_ops = NULL;
6704 }
6705
6706 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
6707 restrict the emit_move optimization to leaf functions. */
6708 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
6709 (!can_inherit_p || !crtl->is_leaf
6710 || df_regs_ever_live_p (EP0_REGNUM)));
6711
6712 if (cfi_ops)
6713 {
6714 /* Emit delayed restores and reset the CFA to be SP. */
6715 insn = get_last_insn ();
6716 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
6717 REG_NOTES (insn) = cfi_ops;
6718 RTX_FRAME_RELATED_P (insn) = 1;
6719 }
6720
6721 /* We prefer to emit the combined return/authenticate instruction RETAA,
6722 however there are three cases in which we must instead emit an explicit
6723 authentication instruction.
6724
6725 1) Sibcalls don't return in a normal way, so if we're about to call one
6726 we must authenticate.
6727
6728 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
6729 generating code for !TARGET_ARMV8_3 we can't use it and must
6730 explicitly authenticate.
6731
6732 3) On an eh_return path we make extra stack adjustments to update the
6733 canonical frame address to be the exception handler's CFA. We want
6734 to authenticate using the CFA of the function which calls eh_return.
6735 */
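
/* In other words (illustrative), in those cases the epilogue ends with

	autiasp			// or autibsp when signing with the B key
	ret

   instead of folding both operations into a single retaa/retab.  */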
6736 if (aarch64_return_address_signing_enabled ()
6737 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
6738 {
6739 switch (aarch64_ra_sign_key)
6740 {
6741 case AARCH64_KEY_A:
6742 insn = emit_insn (gen_autiasp ());
6743 break;
6744 case AARCH64_KEY_B:
6745 insn = emit_insn (gen_autibsp ());
6746 break;
6747 default:
6748 gcc_unreachable ();
6749 }
6750 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6751 RTX_FRAME_RELATED_P (insn) = 1;
6752 }
6753
6754 /* Stack adjustment for exception handler. */
6755 if (crtl->calls_eh_return && !for_sibcall)
6756 {
6757 /* We need to unwind the stack by the offset computed by
6758 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
6759 to be SP; letting the CFA move during this adjustment
6760 is just as correct as retaining the CFA from the body
6761 of the function. Therefore, do nothing special. */
6762 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
6763 }
6764
6765 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
6766 if (!for_sibcall)
6767 emit_jump_insn (ret_rtx);
6768 }
6769
6770 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
6771 normally or return to a previous frame after unwinding.
6772
6773 An EH return uses a single shared return sequence. The epilogue is
6774 exactly like a normal epilogue except that it has an extra input
6775 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
6776 that must be applied after the frame has been destroyed. An extra label
6777 is inserted before the epilogue which initializes this register to zero,
6778 and this is the entry point for a normal return.
6779
6780 An actual EH return updates the return address, initializes the stack
6781 adjustment and jumps directly into the epilogue (bypassing the zeroing
6782 of the adjustment). Since the return address is typically saved on the
6783 stack when a function makes a call, the saved LR must be updated outside
6784 the epilogue.
6785
6786 This poses problems as the store is generated well before the epilogue,
6787 so the offset of LR is not known yet. Also, optimizations will remove the
6788 store as it appears dead, even after the epilogue is generated (as the
6789 base or offset for loading LR is different in many cases).
6790
6791 To avoid these problems this implementation forces the frame pointer
6792 in eh_return functions so that the location of LR is fixed and known early.
6793 It also marks the store volatile, so no optimization is permitted to
6794 remove the store. */
6795 rtx
6796 aarch64_eh_return_handler_rtx (void)
6797 {
6798 rtx tmp = gen_frame_mem (Pmode,
6799 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
6800
6801 /* Mark the store volatile, so no optimization is permitted to remove it. */
6802 MEM_VOLATILE_P (tmp) = true;
6803 return tmp;
6804 }
6805
6806 /* Output code to add DELTA to the first argument, and then jump
6807 to FUNCTION. Used for C++ multiple inheritance. */
6808 static void
6809 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
6810 HOST_WIDE_INT delta,
6811 HOST_WIDE_INT vcall_offset,
6812 tree function)
6813 {
6814 /* The this pointer is always in x0. Note that this differs from
6815 Arm where the this pointer may be bumped to r1 if r0 is required
6816 to return a pointer to an aggregate. On AArch64 a result value
6817 pointer will be in x8. */
6818 int this_regno = R0_REGNUM;
6819 rtx this_rtx, temp0, temp1, addr, funexp;
6820 rtx_insn *insn;
6821 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
6822
6823 if (aarch64_bti_enabled ())
6824 emit_insn (gen_bti_c());
6825
6826 reload_completed = 1;
6827 emit_note (NOTE_INSN_PROLOGUE_END);
6828
6829 this_rtx = gen_rtx_REG (Pmode, this_regno);
6830 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
6831 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
6832
6833 if (vcall_offset == 0)
6834 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
6835 else
6836 {
6837 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
6838
6839 addr = this_rtx;
6840 if (delta != 0)
6841 {
6842 if (delta >= -256 && delta < 256)
6843 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6844 plus_constant (Pmode, this_rtx, delta));
6845 else
6846 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6847 temp1, temp0, false);
6848 }
6849
6850 if (Pmode == ptr_mode)
6851 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6852 else
6853 aarch64_emit_move (temp0,
6854 gen_rtx_ZERO_EXTEND (Pmode,
6855 gen_rtx_MEM (ptr_mode, addr)));
6856
6857 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6858 addr = plus_constant (Pmode, temp0, vcall_offset);
6859 else
6860 {
6861 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6862 Pmode);
6863 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6864 }
6865
6866 if (Pmode == ptr_mode)
6867 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
6868 else
6869 aarch64_emit_move (temp1,
6870 gen_rtx_SIGN_EXTEND (Pmode,
6871 gen_rtx_MEM (ptr_mode, addr)));
6872
6873 emit_insn (gen_add2_insn (this_rtx, temp1));
6874 }
6875
6876 /* Generate a tail call to the target function. */
6877 if (!TREE_USED (function))
6878 {
6879 assemble_external (function);
6880 TREE_USED (function) = 1;
6881 }
6882 funexp = XEXP (DECL_RTL (function), 0);
6883 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6884 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6885 SIBLING_CALL_P (insn) = 1;
6886
6887 insn = get_insns ();
6888 shorten_branches (insn);
6889
6890 assemble_start_function (thunk, fnname);
6891 final_start_function (insn, file, 1);
6892 final (insn, file, 1);
6893 final_end_function ();
6894 assemble_end_function (thunk, fnname);
6895
6896 /* Stop pretending to be a post-reload pass. */
6897 reload_completed = 0;
6898 }
6899
6900 static bool
6901 aarch64_tls_referenced_p (rtx x)
6902 {
6903 if (!TARGET_HAVE_TLS)
6904 return false;
6905 subrtx_iterator::array_type array;
6906 FOR_EACH_SUBRTX (iter, array, x, ALL)
6907 {
6908 const_rtx x = *iter;
6909 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6910 return true;
6911 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6912 TLS offsets, not real symbol references. */
6913 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6914 iter.skip_subrtxes ();
6915 }
6916 return false;
6917 }
6918
6919
6920 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6921 a left shift of 0 or 12 bits. */
6922 bool
6923 aarch64_uimm12_shift (HOST_WIDE_INT val)
6924 {
6925 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6926 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6927 );
6928 }
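
/* Worked example (illustrative values): 0x7a8 is accepted with a shift of 0
   and 0x7a8000 with a shift of 12, whereas 0x12345 is rejected because its
   set bits straddle both 12-bit fields. */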
6929
6930 /* Return the nearest value no greater than VAL that can be encoded as a
6931 12-bit unsigned immediate with a left shift of 0 or 12 bits. */
6932 static HOST_WIDE_INT
6933 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6934 {
6935 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6936 handle correctly. */
6937 gcc_assert ((val & 0xffffff) == val);
6938
6939 if (((val & 0xfff) << 0) == val)
6940 return val;
6941
6942 return val & (0xfff << 12);
6943 }
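
/* Worked example (illustrative value): for VAL = 0x12345 the shift-0 form
   does not match (0x12345 & 0xfff == 0x345), so the function returns
   0x12345 & 0xfff000 == 0x12000, the nearest encodable value not above VAL. */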
6944
6945 /* Return true if val is an immediate that can be loaded into a
6946 register by a MOVZ instruction. */
6947 static bool
6948 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6949 {
6950 if (GET_MODE_SIZE (mode) > 4)
6951 {
6952 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6953 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6954 return 1;
6955 }
6956 else
6957 {
6958 /* Ignore sign extension. */
6959 val &= (HOST_WIDE_INT) 0xffffffff;
6960 }
6961 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6962 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6963 }
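
/* Worked examples (illustrative values): for DImode, 0xabcd00000000 is
   accepted because it is 0xabcd << 32; for SImode, 0x12340000 is accepted
   (0x1234 << 16), while 0x123456 is rejected because its set bits do not
   fit within a single 16-bit field at shift 0, 16, 32 or 48. */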
6964
6965 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6966 64-bit (DImode) integer. */
6967
6968 static unsigned HOST_WIDE_INT
6969 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6970 {
6971 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6972 while (size < 64)
6973 {
6974 val &= (HOST_WIDE_INT_1U << size) - 1;
6975 val |= val << size;
6976 size *= 2;
6977 }
6978 return val;
6979 }
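
/* Worked examples (illustrative values): with 8-bit elements, 0xc3 becomes
   0xc3c3c3c3c3c3c3c3; with 32-bit elements, 0x0000ffff becomes
   0x0000ffff0000ffff. */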
6980
6981 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6982
6983 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6984 {
6985 0x0000000100000001ull,
6986 0x0001000100010001ull,
6987 0x0101010101010101ull,
6988 0x1111111111111111ull,
6989 0x5555555555555555ull,
6990 };
6991
6992
6993 /* Return true if val is a valid bitmask immediate. */
6994
6995 bool
6996 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
6997 {
6998 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
6999 int bits;
7000
7001 /* Check for a single sequence of one bits and return quickly if so.
7002 The special cases of all ones and all zeroes return false. */
7003 val = aarch64_replicate_bitmask_imm (val_in, mode);
7004 tmp = val + (val & -val);
7005
7006 if (tmp == (tmp & -tmp))
7007 return (val + 1) > 1;
7008
7009 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
7010 if (mode == SImode)
7011 val = (val << 32) | (val & 0xffffffff);
7012
7013 /* Invert if the immediate doesn't start with a zero bit - this means we
7014 only need to search for sequences of one bits. */
7015 if (val & 1)
7016 val = ~val;
7017
7018 /* Find the first set bit and set tmp to val with the first sequence of one
7019 bits removed. Return success if there is a single sequence of ones. */
7020 first_one = val & -val;
7021 tmp = val & (val + first_one);
7022
7023 if (tmp == 0)
7024 return true;
7025
7026 /* Find the next set bit and compute the difference in bit position. */
7027 next_one = tmp & -tmp;
7028 bits = clz_hwi (first_one) - clz_hwi (next_one);
7029 mask = val ^ tmp;
7030
7031 /* Check the bit position difference is a power of 2, and that the first
7032 sequence of one bits fits within 'bits' bits. */
7033 if ((mask >> bits) != 0 || bits != (bits & -bits))
7034 return false;
7035
7036 /* Check the sequence of one bits is repeated 64/bits times. */
7037 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
7038 }
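
/* Worked example (illustrative value): 0x0ff00ff00ff00ff0 in DImode is the
   16-bit pattern 0x0ff0 (a contiguous run of eight ones) repeated four
   times. The function above finds first_one == 1 << 4 and next_one == 1 << 20,
   giving bits == 16 and mask == 0xff0, and the final check confirms
   0xff0 * bitmask_imm_mul[1] == 0x0ff00ff00ff00ff0, so the value is a
   valid bitmask immediate. */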
7039
7040 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
7041 Assumed precondition: VAL_IN is not zero. */
7042
7043 unsigned HOST_WIDE_INT
7044 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
7045 {
7046 int lowest_bit_set = ctz_hwi (val_in);
7047 int highest_bit_set = floor_log2 (val_in);
7048 gcc_assert (val_in != 0);
7049
7050 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
7051 (HOST_WIDE_INT_1U << lowest_bit_set));
7052 }
7053
7054 /* Create a constant in which all bits outside the range from the lowest
7055 set bit to the highest set bit of VAL_IN are set to 1. */
7056
7057 unsigned HOST_WIDE_INT
7058 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
7059 {
7060 return val_in | ~aarch64_and_split_imm1 (val_in);
7061 }
7062
7063 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
7064
7065 bool
7066 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
7067 {
7068 scalar_int_mode int_mode;
7069 if (!is_a <scalar_int_mode> (mode, &int_mode))
7070 return false;
7071
7072 if (aarch64_bitmask_imm (val_in, int_mode))
7073 return false;
7074
7075 if (aarch64_move_imm (val_in, int_mode))
7076 return false;
7077
7078 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
7079
7080 return aarch64_bitmask_imm (imm2, int_mode);
7081 }
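
/* Worked example (illustrative value): 0x0ff000f0 in SImode is neither a
   valid bitmask immediate nor a MOV immediate, but it can be split into the
   two SImode bitmask immediates 0x0ffffff0 (from aarch64_and_split_imm1)
   and 0xfff000ff (the low 32 bits of aarch64_and_split_imm2), whose
   intersection is the original value, so the AND can be done with two
   AND-immediate instructions. */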
7082
7083 /* Return true if val is an immediate that can be loaded into a
7084 register in a single instruction. */
7085 bool
7086 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
7087 {
7088 scalar_int_mode int_mode;
7089 if (!is_a <scalar_int_mode> (mode, &int_mode))
7090 return false;
7091
7092 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
7093 return 1;
7094 return aarch64_bitmask_imm (val, int_mode);
7095 }
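
/* Worked example (illustrative value): 0xffff0000ffffffff is accepted for
   DImode because its bitwise NOT, 0x0000ffff00000000, equals 0xffff << 32
   and is therefore MOVN-encodable. */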
7096
7097 static bool
7098 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
7099 {
7100 rtx base, offset;
7101
7102 if (GET_CODE (x) == HIGH)
7103 return true;
7104
7105 /* There's no way to calculate VL-based values using relocations. */
7106 subrtx_iterator::array_type array;
7107 FOR_EACH_SUBRTX (iter, array, x, ALL)
7108 if (GET_CODE (*iter) == CONST_POLY_INT)
7109 return true;
7110
7111 split_const (x, &base, &offset);
7112 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
7113 {
7114 if (aarch64_classify_symbol (base, INTVAL (offset))
7115 != SYMBOL_FORCE_TO_MEM)
7116 return true;
7117 else
7118 /* Avoid generating a 64-bit relocation in ILP32; leave
7119 to aarch64_expand_mov_immediate to handle it properly. */
7120 return mode != ptr_mode;
7121 }
7122
7123 return aarch64_tls_referenced_p (x);
7124 }
7125
7126 /* Implement TARGET_CASE_VALUES_THRESHOLD.
7127 The expansion of a table switch is quite expensive due to the number
7128 of instructions, the table lookup and the hard-to-predict indirect jump.
7129 When optimizing for speed at -O3 and above, use the per-core tuning if
7130 it is set; otherwise use tables for more than 16 cases as a tradeoff
7131 between size and performance. When optimizing for size, use the default setting. */
7132
7133 static unsigned int
7134 aarch64_case_values_threshold (void)
7135 {
7136 /* Use the specified limit for the number of cases before using jump
7137 tables at higher optimization levels. */
7138 if (optimize > 2
7139 && selected_cpu->tune->max_case_values != 0)
7140 return selected_cpu->tune->max_case_values;
7141 else
7142 return optimize_size ? default_case_values_threshold () : 17;
7143 }
7144
7145 /* Return true if register REGNO is a valid index register.
7146 STRICT_P is true if REG_OK_STRICT is in effect. */
7147
7148 bool
7149 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
7150 {
7151 if (!HARD_REGISTER_NUM_P (regno))
7152 {
7153 if (!strict_p)
7154 return true;
7155
7156 if (!reg_renumber)
7157 return false;
7158
7159 regno = reg_renumber[regno];
7160 }
7161 return GP_REGNUM_P (regno);
7162 }
7163
7164 /* Return true if register REGNO is a valid base register.
7165 STRICT_P is true if REG_OK_STRICT is in effect. */
7166
7167 bool
7168 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
7169 {
7170 if (!HARD_REGISTER_NUM_P (regno))
7171 {
7172 if (!strict_p)
7173 return true;
7174
7175 if (!reg_renumber)
7176 return false;
7177
7178 regno = reg_renumber[regno];
7179 }
7180
7181 /* The fake registers will be eliminated to either the stack or
7182 hard frame pointer, both of which are usually valid base registers.
7183 Reload deals with the cases where the eliminated form isn't valid. */
7184 return (GP_REGNUM_P (regno)
7185 || regno == SP_REGNUM
7186 || regno == FRAME_POINTER_REGNUM
7187 || regno == ARG_POINTER_REGNUM);
7188 }
7189
7190 /* Return true if X is a valid base register.
7191 STRICT_P is true if REG_OK_STRICT is in effect. */
7192
7193 static bool
7194 aarch64_base_register_rtx_p (rtx x, bool strict_p)
7195 {
7196 if (!strict_p
7197 && GET_CODE (x) == SUBREG
7198 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
7199 x = SUBREG_REG (x);
7200
7201 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
7202 }
7203
7204 /* Return true if address offset is a valid index. If it is, fill in INFO
7205 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
7206
7207 static bool
7208 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
7209 machine_mode mode, bool strict_p)
7210 {
7211 enum aarch64_address_type type;
7212 rtx index;
7213 int shift;
7214
7215 /* (reg:P) */
7216 if ((REG_P (x) || GET_CODE (x) == SUBREG)
7217 && GET_MODE (x) == Pmode)
7218 {
7219 type = ADDRESS_REG_REG;
7220 index = x;
7221 shift = 0;
7222 }
7223 /* (sign_extend:DI (reg:SI)) */
7224 else if ((GET_CODE (x) == SIGN_EXTEND
7225 || GET_CODE (x) == ZERO_EXTEND)
7226 && GET_MODE (x) == DImode
7227 && GET_MODE (XEXP (x, 0)) == SImode)
7228 {
7229 type = (GET_CODE (x) == SIGN_EXTEND)
7230 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7231 index = XEXP (x, 0);
7232 shift = 0;
7233 }
7234 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
7235 else if (GET_CODE (x) == MULT
7236 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7237 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7238 && GET_MODE (XEXP (x, 0)) == DImode
7239 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7240 && CONST_INT_P (XEXP (x, 1)))
7241 {
7242 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7243 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7244 index = XEXP (XEXP (x, 0), 0);
7245 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7246 }
7247 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
7248 else if (GET_CODE (x) == ASHIFT
7249 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7250 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7251 && GET_MODE (XEXP (x, 0)) == DImode
7252 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7253 && CONST_INT_P (XEXP (x, 1)))
7254 {
7255 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7256 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7257 index = XEXP (XEXP (x, 0), 0);
7258 shift = INTVAL (XEXP (x, 1));
7259 }
7260 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
7261 else if ((GET_CODE (x) == SIGN_EXTRACT
7262 || GET_CODE (x) == ZERO_EXTRACT)
7263 && GET_MODE (x) == DImode
7264 && GET_CODE (XEXP (x, 0)) == MULT
7265 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7266 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7267 {
7268 type = (GET_CODE (x) == SIGN_EXTRACT)
7269 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7270 index = XEXP (XEXP (x, 0), 0);
7271 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7272 if (INTVAL (XEXP (x, 1)) != 32 + shift
7273 || INTVAL (XEXP (x, 2)) != 0)
7274 shift = -1;
7275 }
7276 /* (and:DI (mult:DI (reg:DI) (const_int scale))
7277 (const_int 0xffffffff<<shift)) */
7278 else if (GET_CODE (x) == AND
7279 && GET_MODE (x) == DImode
7280 && GET_CODE (XEXP (x, 0)) == MULT
7281 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7282 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7283 && CONST_INT_P (XEXP (x, 1)))
7284 {
7285 type = ADDRESS_REG_UXTW;
7286 index = XEXP (XEXP (x, 0), 0);
7287 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7288 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7289 shift = -1;
7290 }
7291 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
7292 else if ((GET_CODE (x) == SIGN_EXTRACT
7293 || GET_CODE (x) == ZERO_EXTRACT)
7294 && GET_MODE (x) == DImode
7295 && GET_CODE (XEXP (x, 0)) == ASHIFT
7296 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7297 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7298 {
7299 type = (GET_CODE (x) == SIGN_EXTRACT)
7300 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7301 index = XEXP (XEXP (x, 0), 0);
7302 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7303 if (INTVAL (XEXP (x, 1)) != 32 + shift
7304 || INTVAL (XEXP (x, 2)) != 0)
7305 shift = -1;
7306 }
7307 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
7308 (const_int 0xffffffff<<shift)) */
7309 else if (GET_CODE (x) == AND
7310 && GET_MODE (x) == DImode
7311 && GET_CODE (XEXP (x, 0)) == ASHIFT
7312 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7313 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7314 && CONST_INT_P (XEXP (x, 1)))
7315 {
7316 type = ADDRESS_REG_UXTW;
7317 index = XEXP (XEXP (x, 0), 0);
7318 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7319 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7320 shift = -1;
7321 }
7322 /* (mult:P (reg:P) (const_int scale)) */
7323 else if (GET_CODE (x) == MULT
7324 && GET_MODE (x) == Pmode
7325 && GET_MODE (XEXP (x, 0)) == Pmode
7326 && CONST_INT_P (XEXP (x, 1)))
7327 {
7328 type = ADDRESS_REG_REG;
7329 index = XEXP (x, 0);
7330 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7331 }
7332 /* (ashift:P (reg:P) (const_int shift)) */
7333 else if (GET_CODE (x) == ASHIFT
7334 && GET_MODE (x) == Pmode
7335 && GET_MODE (XEXP (x, 0)) == Pmode
7336 && CONST_INT_P (XEXP (x, 1)))
7337 {
7338 type = ADDRESS_REG_REG;
7339 index = XEXP (x, 0);
7340 shift = INTVAL (XEXP (x, 1));
7341 }
7342 else
7343 return false;
7344
7345 if (!strict_p
7346 && GET_CODE (index) == SUBREG
7347 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
7348 index = SUBREG_REG (index);
7349
7350 if (aarch64_sve_data_mode_p (mode))
7351 {
7352 if (type != ADDRESS_REG_REG
7353 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
7354 return false;
7355 }
7356 else
7357 {
7358 if (shift != 0
7359 && !(IN_RANGE (shift, 1, 3)
7360 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
7361 return false;
7362 }
7363
7364 if (REG_P (index)
7365 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
7366 {
7367 info->type = type;
7368 info->offset = index;
7369 info->shift = shift;
7370 return true;
7371 }
7372
7373 return false;
7374 }
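
/* Worked example (illustrative RTL): for a DImode access, the index
   (mult:DI (sign_extend:DI (reg:SI w1)) (const_int 8)) is classified as
   ADDRESS_REG_SXTW with shift 3, corresponding to an address operand of
   the form [xN, w1, sxtw #3] once a base register is added. */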
7375
7376 /* Return true if MODE is one of the modes for which we
7377 support LDP/STP operations. */
7378
7379 static bool
7380 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
7381 {
7382 return mode == SImode || mode == DImode
7383 || mode == SFmode || mode == DFmode
7384 || (aarch64_vector_mode_supported_p (mode)
7385 && (known_eq (GET_MODE_SIZE (mode), 8)
7386 || (known_eq (GET_MODE_SIZE (mode), 16)
7387 && (aarch64_tune_params.extra_tuning_flags
7388 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
7389 }
7390
7391 /* Return true if REGNO is a virtual pointer register, or an eliminable
7392 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
7393 include stack_pointer or hard_frame_pointer. */
7394 static bool
7395 virt_or_elim_regno_p (unsigned regno)
7396 {
7397 return ((regno >= FIRST_VIRTUAL_REGISTER
7398 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
7399 || regno == FRAME_POINTER_REGNUM
7400 || regno == ARG_POINTER_REGNUM);
7401 }
7402
7403 /* Return true if X is a valid address of type TYPE for machine mode MODE.
7404 If it is, fill in INFO appropriately. STRICT_P is true if
7405 REG_OK_STRICT is in effect. */
7406
7407 bool
7408 aarch64_classify_address (struct aarch64_address_info *info,
7409 rtx x, machine_mode mode, bool strict_p,
7410 aarch64_addr_query_type type)
7411 {
7412 enum rtx_code code = GET_CODE (x);
7413 rtx op0, op1;
7414 poly_int64 offset;
7415
7416 HOST_WIDE_INT const_size;
7417
7418 /* On BE, we use load/store pair for all large int mode load/stores.
7419 TI/TFmode may also use a load/store pair. */
7420 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7421 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
7422 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
7423 || type == ADDR_QUERY_LDP_STP_N
7424 || mode == TImode
7425 || mode == TFmode
7426 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
7427
7428 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode corresponds
7429 to the actual size of the memory being loaded/stored, while the mode used
7430 for the address calculation is half of that size. */
7431 if (type == ADDR_QUERY_LDP_STP_N
7432 && known_eq (GET_MODE_SIZE (mode), 16))
7433 mode = DFmode;
7434
7435 bool allow_reg_index_p = (!load_store_pair_p
7436 && (known_lt (GET_MODE_SIZE (mode), 16)
7437 || vec_flags == VEC_ADVSIMD
7438 || vec_flags & VEC_SVE_DATA));
7439
7440 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
7441 [Rn, #offset, MUL VL]. */
7442 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
7443 && (code != REG && code != PLUS))
7444 return false;
7445
7446 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
7447 REG addressing. */
7448 if (advsimd_struct_p
7449 && !BYTES_BIG_ENDIAN
7450 && (code != POST_INC && code != REG))
7451 return false;
7452
7453 gcc_checking_assert (GET_MODE (x) == VOIDmode
7454 || SCALAR_INT_MODE_P (GET_MODE (x)));
7455
7456 switch (code)
7457 {
7458 case REG:
7459 case SUBREG:
7460 info->type = ADDRESS_REG_IMM;
7461 info->base = x;
7462 info->offset = const0_rtx;
7463 info->const_offset = 0;
7464 return aarch64_base_register_rtx_p (x, strict_p);
7465
7466 case PLUS:
7467 op0 = XEXP (x, 0);
7468 op1 = XEXP (x, 1);
7469
7470 if (! strict_p
7471 && REG_P (op0)
7472 && virt_or_elim_regno_p (REGNO (op0))
7473 && poly_int_rtx_p (op1, &offset))
7474 {
7475 info->type = ADDRESS_REG_IMM;
7476 info->base = op0;
7477 info->offset = op1;
7478 info->const_offset = offset;
7479
7480 return true;
7481 }
7482
7483 if (maybe_ne (GET_MODE_SIZE (mode), 0)
7484 && aarch64_base_register_rtx_p (op0, strict_p)
7485 && poly_int_rtx_p (op1, &offset))
7486 {
7487 info->type = ADDRESS_REG_IMM;
7488 info->base = op0;
7489 info->offset = op1;
7490 info->const_offset = offset;
7491
7492 /* TImode and TFmode values are allowed in both pairs of X
7493 registers and individual Q registers. The available
7494 address modes are:
7495 X,X: 7-bit signed scaled offset
7496 Q: 9-bit signed offset
7497 We conservatively require an offset representable in either mode.
7498 When performing the check for pairs of X registers i.e. LDP/STP
7499 pass down DImode since that is the natural size of the LDP/STP
7500 instruction memory accesses. */
7501 if (mode == TImode || mode == TFmode)
7502 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
7503 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7504 || offset_12bit_unsigned_scaled_p (mode, offset)));
7505
7506 /* A 7-bit offset check because OImode will emit an ldp/stp
7507 instruction (only big endian will get here).
7508 For ldp/stp instructions, the offset is scaled for the size of a
7509 single element of the pair. */
7510 if (mode == OImode)
7511 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
7512
7513 /* Three 9/12-bit offset checks because CImode will emit three
7514 ldr/str instructions (only big endian will get here). */
7515 if (mode == CImode)
7516 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7517 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
7518 offset + 32)
7519 || offset_12bit_unsigned_scaled_p (V16QImode,
7520 offset + 32)));
7521
7522 /* Two 7-bit offset checks because XImode will emit two ldp/stp
7523 instructions (only big endian will get here). */
7524 if (mode == XImode)
7525 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7526 && aarch64_offset_7bit_signed_scaled_p (TImode,
7527 offset + 32));
7528
7529 /* Make "m" use the LD1 offset range for SVE data modes, so
7530 that pre-RTL optimizers like ivopts will work to that
7531 instead of the wider LDR/STR range. */
7532 if (vec_flags == VEC_SVE_DATA)
7533 return (type == ADDR_QUERY_M
7534 ? offset_4bit_signed_scaled_p (mode, offset)
7535 : offset_9bit_signed_scaled_p (mode, offset));
7536
7537 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
7538 {
7539 poly_int64 end_offset = (offset
7540 + GET_MODE_SIZE (mode)
7541 - BYTES_PER_SVE_VECTOR);
7542 return (type == ADDR_QUERY_M
7543 ? offset_4bit_signed_scaled_p (mode, offset)
7544 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
7545 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
7546 end_offset)));
7547 }
7548
7549 if (vec_flags == VEC_SVE_PRED)
7550 return offset_9bit_signed_scaled_p (mode, offset);
7551
7552 if (load_store_pair_p)
7553 return ((known_eq (GET_MODE_SIZE (mode), 4)
7554 || known_eq (GET_MODE_SIZE (mode), 8)
7555 || known_eq (GET_MODE_SIZE (mode), 16))
7556 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7557 else
7558 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7559 || offset_12bit_unsigned_scaled_p (mode, offset));
7560 }
7561
7562 if (allow_reg_index_p)
7563 {
7564 /* Look for base + (scaled/extended) index register. */
7565 if (aarch64_base_register_rtx_p (op0, strict_p)
7566 && aarch64_classify_index (info, op1, mode, strict_p))
7567 {
7568 info->base = op0;
7569 return true;
7570 }
7571 if (aarch64_base_register_rtx_p (op1, strict_p)
7572 && aarch64_classify_index (info, op0, mode, strict_p))
7573 {
7574 info->base = op1;
7575 return true;
7576 }
7577 }
7578
7579 return false;
7580
7581 case POST_INC:
7582 case POST_DEC:
7583 case PRE_INC:
7584 case PRE_DEC:
7585 info->type = ADDRESS_REG_WB;
7586 info->base = XEXP (x, 0);
7587 info->offset = NULL_RTX;
7588 return aarch64_base_register_rtx_p (info->base, strict_p);
7589
7590 case POST_MODIFY:
7591 case PRE_MODIFY:
7592 info->type = ADDRESS_REG_WB;
7593 info->base = XEXP (x, 0);
7594 if (GET_CODE (XEXP (x, 1)) == PLUS
7595 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
7596 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
7597 && aarch64_base_register_rtx_p (info->base, strict_p))
7598 {
7599 info->offset = XEXP (XEXP (x, 1), 1);
7600 info->const_offset = offset;
7601
7602 /* TImode and TFmode values are allowed in both pairs of X
7603 registers and individual Q registers. The available
7604 address modes are:
7605 X,X: 7-bit signed scaled offset
7606 Q: 9-bit signed offset
7607 We conservatively require an offset representable in either mode.
7608 */
7609 if (mode == TImode || mode == TFmode)
7610 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
7611 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
7612
7613 if (load_store_pair_p)
7614 return ((known_eq (GET_MODE_SIZE (mode), 4)
7615 || known_eq (GET_MODE_SIZE (mode), 8)
7616 || known_eq (GET_MODE_SIZE (mode), 16))
7617 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7618 else
7619 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
7620 }
7621 return false;
7622
7623 case CONST:
7624 case SYMBOL_REF:
7625 case LABEL_REF:
7626 /* load literal: pc-relative constant pool entry. Only supported
7627 for SI mode or larger. */
7628 info->type = ADDRESS_SYMBOLIC;
7629
7630 if (!load_store_pair_p
7631 && GET_MODE_SIZE (mode).is_constant (&const_size)
7632 && const_size >= 4)
7633 {
7634 rtx sym, addend;
7635
7636 split_const (x, &sym, &addend);
7637 return ((GET_CODE (sym) == LABEL_REF
7638 || (GET_CODE (sym) == SYMBOL_REF
7639 && CONSTANT_POOL_ADDRESS_P (sym)
7640 && aarch64_pcrelative_literal_loads)));
7641 }
7642 return false;
7643
7644 case LO_SUM:
7645 info->type = ADDRESS_LO_SUM;
7646 info->base = XEXP (x, 0);
7647 info->offset = XEXP (x, 1);
7648 if (allow_reg_index_p
7649 && aarch64_base_register_rtx_p (info->base, strict_p))
7650 {
7651 rtx sym, offs;
7652 split_const (info->offset, &sym, &offs);
7653 if (GET_CODE (sym) == SYMBOL_REF
7654 && (aarch64_classify_symbol (sym, INTVAL (offs))
7655 == SYMBOL_SMALL_ABSOLUTE))
7656 {
7657 /* The symbol and offset must be aligned to the access size. */
7658 unsigned int align;
7659
7660 if (CONSTANT_POOL_ADDRESS_P (sym))
7661 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
7662 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
7663 {
7664 tree exp = SYMBOL_REF_DECL (sym);
7665 align = TYPE_ALIGN (TREE_TYPE (exp));
7666 align = aarch64_constant_alignment (exp, align);
7667 }
7668 else if (SYMBOL_REF_DECL (sym))
7669 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
7670 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
7671 && SYMBOL_REF_BLOCK (sym) != NULL)
7672 align = SYMBOL_REF_BLOCK (sym)->alignment;
7673 else
7674 align = BITS_PER_UNIT;
7675
7676 poly_int64 ref_size = GET_MODE_SIZE (mode);
7677 if (known_eq (ref_size, 0))
7678 ref_size = GET_MODE_SIZE (DImode);
7679
7680 return (multiple_p (INTVAL (offs), ref_size)
7681 && multiple_p (align / BITS_PER_UNIT, ref_size));
7682 }
7683 }
7684 return false;
7685
7686 default:
7687 return false;
7688 }
7689 }
7690
7691 /* Return true if the address X is valid for a PRFM instruction.
7692 STRICT_P is true if we should do strict checking with
7693 aarch64_classify_address. */
7694
7695 bool
7696 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
7697 {
7698 struct aarch64_address_info addr;
7699
7700 /* PRFM accepts the same addresses as DImode... */
7701 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
7702 if (!res)
7703 return false;
7704
7705 /* ... except writeback forms. */
7706 return addr.type != ADDRESS_REG_WB;
7707 }
7708
7709 bool
7710 aarch64_symbolic_address_p (rtx x)
7711 {
7712 rtx offset;
7713
7714 split_const (x, &x, &offset);
7715 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
7716 }
7717
7718 /* Classify the base of symbolic expression X. */
7719
7720 enum aarch64_symbol_type
7721 aarch64_classify_symbolic_expression (rtx x)
7722 {
7723 rtx offset;
7724
7725 split_const (x, &x, &offset);
7726 return aarch64_classify_symbol (x, INTVAL (offset));
7727 }
7728
7729
7730 /* Return TRUE if X is a legitimate address for accessing memory in
7731 mode MODE. */
7732 static bool
7733 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
7734 {
7735 struct aarch64_address_info addr;
7736
7737 return aarch64_classify_address (&addr, x, mode, strict_p);
7738 }
7739
7740 /* Return TRUE if X is a legitimate address of type TYPE for accessing
7741 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
7742 bool
7743 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
7744 aarch64_addr_query_type type)
7745 {
7746 struct aarch64_address_info addr;
7747
7748 return aarch64_classify_address (&addr, x, mode, strict_p, type);
7749 }
7750
7751 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
7752
7753 static bool
7754 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
7755 poly_int64 orig_offset,
7756 machine_mode mode)
7757 {
7758 HOST_WIDE_INT size;
7759 if (GET_MODE_SIZE (mode).is_constant (&size))
7760 {
7761 HOST_WIDE_INT const_offset, second_offset;
7762
7763 /* A general SVE offset is A * VQ + B. Remove the A component from
7764 coefficient 0 in order to get the constant B. */
7765 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
7766
7767 /* Split an out-of-range address displacement into a base and
7768 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
7769 range otherwise to increase opportunities for sharing the base
7770 address of different sizes. Unaligned accesses use the signed
7771 9-bit range, TImode/TFmode use the intersection of signed
7772 scaled 7-bit and signed 9-bit offset. */
7773 if (mode == TImode || mode == TFmode)
7774 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
7775 else if ((const_offset & (size - 1)) != 0)
7776 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
7777 else
7778 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
7779
7780 if (second_offset == 0 || known_eq (orig_offset, second_offset))
7781 return false;
7782
7783 /* Split the offset into second_offset and the rest. */
7784 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7785 *offset2 = gen_int_mode (second_offset, Pmode);
7786 return true;
7787 }
7788 else
7789 {
7790 /* Get the mode we should use as the basis of the range. For structure
7791 modes this is the mode of one vector. */
7792 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7793 machine_mode step_mode
7794 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
7795
7796 /* Get the "mul vl" multiplier we'd like to use. */
7797 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
7798 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
7799 if (vec_flags & VEC_SVE_DATA)
7800 /* LDR supports a 9-bit range, but the move patterns for
7801 structure modes require all vectors to be in range of the
7802 same base. The simplest way of accommodating that while still
7803 promoting reuse of anchor points between different modes is
7804 to use an 8-bit range unconditionally. */
7805 vnum = ((vnum + 128) & 255) - 128;
7806 else
7807 /* Predicates are only handled singly, so we might as well use
7808 the full range. */
7809 vnum = ((vnum + 256) & 511) - 256;
7810 if (vnum == 0)
7811 return false;
7812
7813 /* Convert the "mul vl" multiplier into a byte offset. */
7814 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
7815 if (known_eq (second_offset, orig_offset))
7816 return false;
7817
7818 /* Split the offset into second_offset and the rest. */
7819 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7820 *offset2 = gen_int_mode (second_offset, Pmode);
7821 return true;
7822 }
7823 }
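
/* Worked example (illustrative values): for a DImode access at constant
   offset 0x10008, the offset is aligned and the mode size is at least 4,
   so second_offset == 0x10008 & 0x3ffc == 0x8. The displacement is split
   into an anchor of 0x10000 plus an in-range offset of 0x8. */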
7824
7825 /* Return the binary representation of floating-point constant VALUE in INTVAL.
7826 If the value cannot be converted, return false without setting INTVAL.
7827 The conversion is done in the mode of VALUE. */
7828 bool
7829 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
7830 {
7831
7832 /* We make a general exception for 0. */
7833 if (aarch64_float_const_zero_rtx_p (value))
7834 {
7835 *intval = 0;
7836 return true;
7837 }
7838
7839 scalar_float_mode mode;
7840 if (GET_CODE (value) != CONST_DOUBLE
7841 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
7842 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7843 /* Only support up to DF mode. */
7844 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7845 return false;
7846
7847 unsigned HOST_WIDE_INT ival = 0;
7848
7849 long res[2];
7850 real_to_target (res,
7851 CONST_DOUBLE_REAL_VALUE (value),
7852 REAL_MODE_FORMAT (mode));
7853
7854 if (mode == DFmode)
7855 {
7856 int order = BYTES_BIG_ENDIAN ? 1 : 0;
7857 ival = zext_hwi (res[order], 32);
7858 ival |= (zext_hwi (res[1 - order], 32) << 32);
7859 }
7860 else
7861 ival = zext_hwi (res[0], 32);
7862
7863 *intval = ival;
7864 return true;
7865 }
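
/* Worked examples (illustrative values): DFmode 1.0 is returned as
   0x3ff0000000000000 and SFmode -2.0 as 0xc0000000. */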
7866
7867 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7868 single MOV(+MOVK) followed by an FMOV. */
7869 bool
7870 aarch64_float_const_rtx_p (rtx x)
7871 {
7872 machine_mode mode = GET_MODE (x);
7873 if (mode == VOIDmode)
7874 return false;
7875
7876 /* Determine whether it's cheaper to write float constants as
7877 mov/movk pairs over ldr/adrp pairs. */
7878 unsigned HOST_WIDE_INT ival;
7879
7880 if (GET_CODE (x) == CONST_DOUBLE
7881 && SCALAR_FLOAT_MODE_P (mode)
7882 && aarch64_reinterpret_float_as_int (x, &ival))
7883 {
7884 scalar_int_mode imode = (mode == HFmode
7885 ? SImode
7886 : int_mode_for_mode (mode).require ());
7887 int num_instr = aarch64_internal_mov_immediate
7888 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7889 return num_instr < 3;
7890 }
7891
7892 return false;
7893 }
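
/* Worked example (illustrative value): DFmode 1.0 has the bit pattern
   0x3ff0000000000000 == 0x3ff0 << 48, which a single MOVZ can build, so
   mov+fmov is preferred over an adrp+ldr literal load and the function
   returns true. */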
7894
7895 /* Return TRUE if rtx X is the immediate constant 0.0. */
7896 bool
7897 aarch64_float_const_zero_rtx_p (rtx x)
7898 {
7899 if (GET_MODE (x) == VOIDmode)
7900 return false;
7901
7902 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7903 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7904 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7905 }
7906
7907 /* Return TRUE if rtx X is an immediate constant that fits in a single
7908 MOVI operation. */
7909 bool
7910 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7911 {
7912 if (!TARGET_SIMD)
7913 return false;
7914
7915 machine_mode vmode;
7916 scalar_int_mode imode;
7917 unsigned HOST_WIDE_INT ival;
7918
7919 if (GET_CODE (x) == CONST_DOUBLE
7920 && SCALAR_FLOAT_MODE_P (mode))
7921 {
7922 if (!aarch64_reinterpret_float_as_int (x, &ival))
7923 return false;
7924
7925 /* We make a general exception for 0. */
7926 if (aarch64_float_const_zero_rtx_p (x))
7927 return true;
7928
7929 imode = int_mode_for_mode (mode).require ();
7930 }
7931 else if (GET_CODE (x) == CONST_INT
7932 && is_a <scalar_int_mode> (mode, &imode))
7933 ival = INTVAL (x);
7934 else
7935 return false;
7936
7937 /* Use a 64-bit vector mode for everything except DImode/DFmode, where we
7938 use a 128-bit vector mode. */
7939 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7940
7941 vmode = aarch64_simd_container_mode (imode, width);
7942 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7943
7944 return aarch64_simd_valid_immediate (v_op, NULL);
7945 }
7946
7947
7948 /* Return the fixed registers used for condition codes. */
7949
7950 static bool
7951 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7952 {
7953 *p1 = CC_REGNUM;
7954 *p2 = INVALID_REGNUM;
7955 return true;
7956 }
7957
7958 /* This function is used by the call expanders of the machine description.
7959 RESULT is the register in which the result is returned. It's NULL for
7960 "call" and "sibcall".
7961 MEM is the location of the function call.
7962 SIBCALL indicates whether this function call is a normal call or a sibling
7963 call; a different pattern is generated accordingly. */
7964
7965 void
7966 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7967 {
7968 rtx call, callee, tmp;
7969 rtvec vec;
7970 machine_mode mode;
7971
7972 gcc_assert (MEM_P (mem));
7973 callee = XEXP (mem, 0);
7974 mode = GET_MODE (callee);
7975 gcc_assert (mode == Pmode);
7976
7977 /* Decide if we should generate indirect calls by loading the
7978 address of the callee into a register before performing
7979 the branch-and-link. */
7980 if (SYMBOL_REF_P (callee)
7981 ? (aarch64_is_long_call_p (callee)
7982 || aarch64_is_noplt_call_p (callee))
7983 : !REG_P (callee))
7984 XEXP (mem, 0) = force_reg (mode, callee);
7985
7986 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7987
7988 if (result != NULL_RTX)
7989 call = gen_rtx_SET (result, call);
7990
7991 if (sibcall)
7992 tmp = ret_rtx;
7993 else
7994 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
7995
7996 vec = gen_rtvec (2, call, tmp);
7997 call = gen_rtx_PARALLEL (VOIDmode, vec);
7998
7999 aarch64_emit_call_insn (call);
8000 }
8001
8002 /* Emit call insn with PAT and do aarch64-specific handling. */
8003
8004 void
8005 aarch64_emit_call_insn (rtx pat)
8006 {
8007 rtx insn = emit_call_insn (pat);
8008
8009 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
8010 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
8011 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
8012 }
8013
8014 machine_mode
8015 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
8016 {
8017 machine_mode mode_x = GET_MODE (x);
8018 rtx_code code_x = GET_CODE (x);
8019
8020 /* All floating point compares return CCFP if it is an equality
8021 comparison, and CCFPE otherwise. */
8022 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
8023 {
8024 switch (code)
8025 {
8026 case EQ:
8027 case NE:
8028 case UNORDERED:
8029 case ORDERED:
8030 case UNLT:
8031 case UNLE:
8032 case UNGT:
8033 case UNGE:
8034 case UNEQ:
8035 return CCFPmode;
8036
8037 case LT:
8038 case LE:
8039 case GT:
8040 case GE:
8041 case LTGT:
8042 return CCFPEmode;
8043
8044 default:
8045 gcc_unreachable ();
8046 }
8047 }
8048
8049 /* Equality comparisons of short modes against zero can be performed
8050 using the TST instruction with the appropriate bitmask. */
8051 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
8052 && (code == EQ || code == NE)
8053 && (mode_x == HImode || mode_x == QImode))
8054 return CC_NZmode;
8055
8056 /* Similarly, comparisons of zero_extends from shorter modes can
8057 be performed using an ANDS with an immediate mask. */
8058 if (y == const0_rtx && code_x == ZERO_EXTEND
8059 && (mode_x == SImode || mode_x == DImode)
8060 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
8061 && (code == EQ || code == NE))
8062 return CC_NZmode;
8063
8064 if ((mode_x == SImode || mode_x == DImode)
8065 && y == const0_rtx
8066 && (code == EQ || code == NE || code == LT || code == GE)
8067 && (code_x == PLUS || code_x == MINUS || code_x == AND
8068 || code_x == NEG
8069 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
8070 && CONST_INT_P (XEXP (x, 2)))))
8071 return CC_NZmode;
8072
8073 /* A compare with a shifted operand. Because of canonicalization,
8074 the comparison will have to be swapped when we emit the assembly
8075 code. */
8076 if ((mode_x == SImode || mode_x == DImode)
8077 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
8078 && (code_x == ASHIFT || code_x == ASHIFTRT
8079 || code_x == LSHIFTRT
8080 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
8081 return CC_SWPmode;
8082
8083 /* Similarly for a negated operand, but we can only do this for
8084 equalities. */
8085 if ((mode_x == SImode || mode_x == DImode)
8086 && (REG_P (y) || GET_CODE (y) == SUBREG)
8087 && (code == EQ || code == NE)
8088 && code_x == NEG)
8089 return CC_Zmode;
8090
8091 /* A test for unsigned overflow from an addition. */
8092 if ((mode_x == DImode || mode_x == TImode)
8093 && (code == LTU || code == GEU)
8094 && code_x == PLUS
8095 && rtx_equal_p (XEXP (x, 0), y))
8096 return CC_Cmode;
8097
8098 /* A test for unsigned overflow from an add with carry. */
8099 if ((mode_x == DImode || mode_x == TImode)
8100 && (code == LTU || code == GEU)
8101 && code_x == PLUS
8102 && CONST_SCALAR_INT_P (y)
8103 && (rtx_mode_t (y, mode_x)
8104 == (wi::shwi (1, mode_x)
8105 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
8106 return CC_ADCmode;
8107
8108 /* A test for signed overflow. */
8109 if ((mode_x == DImode || mode_x == TImode)
8110 && code == NE
8111 && code_x == PLUS
8112 && GET_CODE (y) == SIGN_EXTEND)
8113 return CC_Vmode;
8114
8115 /* For everything else, return CCmode. */
8116 return CCmode;
8117 }
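
/* Worked example (illustrative RTL): a comparison of (ashift:DI (reg x1)
   (const_int 3)) against (reg x2) selects CC_SWPmode; the instruction is
   emitted with the operands swapped, as cmp x2, x1, lsl #3, and the
   condition code mapping in aarch64_get_condition_code_1 is swapped to
   match. */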
8118
8119 static int
8120 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
8121
8122 int
8123 aarch64_get_condition_code (rtx x)
8124 {
8125 machine_mode mode = GET_MODE (XEXP (x, 0));
8126 enum rtx_code comp_code = GET_CODE (x);
8127
8128 if (GET_MODE_CLASS (mode) != MODE_CC)
8129 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
8130 return aarch64_get_condition_code_1 (mode, comp_code);
8131 }
8132
8133 static int
8134 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
8135 {
8136 switch (mode)
8137 {
8138 case E_CCFPmode:
8139 case E_CCFPEmode:
8140 switch (comp_code)
8141 {
8142 case GE: return AARCH64_GE;
8143 case GT: return AARCH64_GT;
8144 case LE: return AARCH64_LS;
8145 case LT: return AARCH64_MI;
8146 case NE: return AARCH64_NE;
8147 case EQ: return AARCH64_EQ;
8148 case ORDERED: return AARCH64_VC;
8149 case UNORDERED: return AARCH64_VS;
8150 case UNLT: return AARCH64_LT;
8151 case UNLE: return AARCH64_LE;
8152 case UNGT: return AARCH64_HI;
8153 case UNGE: return AARCH64_PL;
8154 default: return -1;
8155 }
8156 break;
8157
8158 case E_CCmode:
8159 switch (comp_code)
8160 {
8161 case NE: return AARCH64_NE;
8162 case EQ: return AARCH64_EQ;
8163 case GE: return AARCH64_GE;
8164 case GT: return AARCH64_GT;
8165 case LE: return AARCH64_LE;
8166 case LT: return AARCH64_LT;
8167 case GEU: return AARCH64_CS;
8168 case GTU: return AARCH64_HI;
8169 case LEU: return AARCH64_LS;
8170 case LTU: return AARCH64_CC;
8171 default: return -1;
8172 }
8173 break;
8174
8175 case E_CC_SWPmode:
8176 switch (comp_code)
8177 {
8178 case NE: return AARCH64_NE;
8179 case EQ: return AARCH64_EQ;
8180 case GE: return AARCH64_LE;
8181 case GT: return AARCH64_LT;
8182 case LE: return AARCH64_GE;
8183 case LT: return AARCH64_GT;
8184 case GEU: return AARCH64_LS;
8185 case GTU: return AARCH64_CC;
8186 case LEU: return AARCH64_CS;
8187 case LTU: return AARCH64_HI;
8188 default: return -1;
8189 }
8190 break;
8191
8192 case E_CC_NZCmode:
8193 switch (comp_code)
8194 {
8195 case NE: return AARCH64_NE; /* = any */
8196 case EQ: return AARCH64_EQ; /* = none */
8197 case GE: return AARCH64_PL; /* = nfrst */
8198 case LT: return AARCH64_MI; /* = first */
8199 case GEU: return AARCH64_CS; /* = nlast */
8200 case GTU: return AARCH64_HI; /* = pmore */
8201 case LEU: return AARCH64_LS; /* = plast */
8202 case LTU: return AARCH64_CC; /* = last */
8203 default: return -1;
8204 }
8205 break;
8206
8207 case E_CC_NZmode:
8208 switch (comp_code)
8209 {
8210 case NE: return AARCH64_NE;
8211 case EQ: return AARCH64_EQ;
8212 case GE: return AARCH64_PL;
8213 case LT: return AARCH64_MI;
8214 default: return -1;
8215 }
8216 break;
8217
8218 case E_CC_Zmode:
8219 switch (comp_code)
8220 {
8221 case NE: return AARCH64_NE;
8222 case EQ: return AARCH64_EQ;
8223 default: return -1;
8224 }
8225 break;
8226
8227 case E_CC_Cmode:
8228 switch (comp_code)
8229 {
8230 case LTU: return AARCH64_CS;
8231 case GEU: return AARCH64_CC;
8232 default: return -1;
8233 }
8234 break;
8235
8236 case E_CC_ADCmode:
8237 switch (comp_code)
8238 {
8239 case GEU: return AARCH64_CS;
8240 case LTU: return AARCH64_CC;
8241 default: return -1;
8242 }
8243 break;
8244
8245 case E_CC_Vmode:
8246 switch (comp_code)
8247 {
8248 case NE: return AARCH64_VS;
8249 case EQ: return AARCH64_VC;
8250 default: return -1;
8251 }
8252 break;
8253
8254 default:
8255 return -1;
8256 }
8257
8258 return -1;
8259 }
8260
8261 bool
8262 aarch64_const_vec_all_same_in_range_p (rtx x,
8263 HOST_WIDE_INT minval,
8264 HOST_WIDE_INT maxval)
8265 {
8266 rtx elt;
8267 return (const_vec_duplicate_p (x, &elt)
8268 && CONST_INT_P (elt)
8269 && IN_RANGE (INTVAL (elt), minval, maxval));
8270 }
8271
8272 bool
8273 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
8274 {
8275 return aarch64_const_vec_all_same_in_range_p (x, val, val);
8276 }
8277
8278 /* Return true if VEC is a constant in which every element is in the range
8279 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
8280
8281 static bool
8282 aarch64_const_vec_all_in_range_p (rtx vec,
8283 HOST_WIDE_INT minval,
8284 HOST_WIDE_INT maxval)
8285 {
8286 if (GET_CODE (vec) != CONST_VECTOR
8287 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
8288 return false;
8289
8290 int nunits;
8291 if (!CONST_VECTOR_STEPPED_P (vec))
8292 nunits = const_vector_encoded_nelts (vec);
8293 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
8294 return false;
8295
8296 for (int i = 0; i < nunits; i++)
8297 {
8298 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
8299 if (!CONST_INT_P (vec_elem)
8300 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
8301 return false;
8302 }
8303 return true;
8304 }
8305
8306 /* N Z C V. */
8307 #define AARCH64_CC_V 1
8308 #define AARCH64_CC_C (1 << 1)
8309 #define AARCH64_CC_Z (1 << 2)
8310 #define AARCH64_CC_N (1 << 3)
8311
8312 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
8313 static const int aarch64_nzcv_codes[] =
8314 {
8315 0, /* EQ, Z == 1. */
8316 AARCH64_CC_Z, /* NE, Z == 0. */
8317 0, /* CS, C == 1. */
8318 AARCH64_CC_C, /* CC, C == 0. */
8319 0, /* MI, N == 1. */
8320 AARCH64_CC_N, /* PL, N == 0. */
8321 0, /* VS, V == 1. */
8322 AARCH64_CC_V, /* VC, V == 0. */
8323 0, /* HI, C == 1 && Z == 0. */
8324 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
8325 AARCH64_CC_V, /* GE, N == V. */
8326 0, /* LT, N != V. */
8327 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
8328 0, /* LE, !(Z == 0 && N == V). */
8329 0, /* AL, Any. */
8330 0 /* NV, Any. */
8331 };
8332
8333 /* Print floating-point vector immediate operand X to F, negating it
8334 first if NEGATE is true. Return true on success, false if it isn't
8335 a constant we can handle. */
8336
8337 static bool
8338 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
8339 {
8340 rtx elt;
8341
8342 if (!const_vec_duplicate_p (x, &elt))
8343 return false;
8344
8345 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
8346 if (negate)
8347 r = real_value_negate (&r);
8348
8349 /* Handle the SVE single-bit immediates specially, since they have a
8350 fixed form in the assembly syntax. */
8351 if (real_equal (&r, &dconst0))
8352 asm_fprintf (f, "0.0");
8353 else if (real_equal (&r, &dconst2))
8354 asm_fprintf (f, "2.0");
8355 else if (real_equal (&r, &dconst1))
8356 asm_fprintf (f, "1.0");
8357 else if (real_equal (&r, &dconsthalf))
8358 asm_fprintf (f, "0.5");
8359 else
8360 {
8361 const int buf_size = 20;
8362 char float_buf[buf_size] = {'\0'};
8363 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
8364 1, GET_MODE (elt));
8365 asm_fprintf (f, "%s", float_buf);
8366 }
8367
8368 return true;
8369 }
8370
8371 /* Return the equivalent letter for size. */
8372 static char
8373 sizetochar (int size)
8374 {
8375 switch (size)
8376 {
8377 case 64: return 'd';
8378 case 32: return 's';
8379 case 16: return 'h';
8380 case 8 : return 'b';
8381 default: gcc_unreachable ();
8382 }
8383 }
8384
8385 /* Print operand X to file F in a target specific manner according to CODE.
8386 The acceptable formatting commands given by CODE are:
8387 'c': An integer or symbol address without a preceding #
8388 sign.
8389 'C': Take the duplicated element in a vector constant
8390 and print it in hex.
8391 'D': Take the duplicated element in a vector constant
8392 and print it as an unsigned integer, in decimal.
8393 'e': Print the sign/zero-extend size as a character 8->b,
8394 16->h, 32->w. Can also be used for masks:
8395 0xff->b, 0xffff->h, 0xffffffff->w.
8396 'I': If the operand is a duplicated vector constant,
8397 replace it with the duplicated scalar. If the
8398 operand is then a floating-point constant, replace
8399 it with the integer bit representation. Print the
8400 transformed constant as a signed decimal number.
8401 'p': Prints N such that 2^N == X (X must be power of 2 and
8402 const int).
8403 'P': Print the number of non-zero bits in X (a const_int).
8404 'H': Print the higher numbered register of a pair (TImode)
8405 of regs.
8406 'm': Print a condition (eq, ne, etc).
8407 'M': Same as 'm', but invert condition.
8408 'N': Take the duplicated element in a vector constant
8409 and print the negative of it in decimal.
8410 'b/h/s/d/q': Print a scalar FP/SIMD register name.
8411 'S/T/U/V': Print a FP/SIMD register name for a register list.
8412 The register printed is the FP/SIMD register name
8413 of X + 0/1/2/3 for S/T/U/V.
8414 'R': Print a scalar FP/SIMD register name + 1.
8415 'X': Print bottom 16 bits of integer constant in hex.
8416 'w/x': Print a general register name or the zero register
8417 (32-bit or 64-bit).
8418 '0': Print a normal operand; if it's a general register,
8419 then we assume DImode.
8420 'k': Print NZCV for conditional compare instructions.
8421 'A': Output address constant representing the first
8422 argument of X, specifying a relocation offset
8423 if appropriate.
8424 'L': Output constant address specified by X
8425 with a relocation offset if appropriate.
8426 'G': Prints address of X, specifying a PC relative
8427 relocation mode if appropriate.
8428 'y': Output address of LDP or STP - this is used for
8429 some LDP/STPs which don't use a PARALLEL in their
8430 pattern (so the mode needs to be adjusted).
8431 'z': Output address of a typical LDP or STP. */
8432
8433 static void
8434 aarch64_print_operand (FILE *f, rtx x, int code)
8435 {
8436 rtx elt;
8437 switch (code)
8438 {
8439 case 'c':
8440 switch (GET_CODE (x))
8441 {
8442 case CONST_INT:
8443 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
8444 break;
8445
8446 case SYMBOL_REF:
8447 output_addr_const (f, x);
8448 break;
8449
8450 case CONST:
8451 if (GET_CODE (XEXP (x, 0)) == PLUS
8452 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
8453 {
8454 output_addr_const (f, x);
8455 break;
8456 }
8457 /* Fall through. */
8458
8459 default:
8460 output_operand_lossage ("unsupported operand for code '%c'", code);
8461 }
8462 break;
8463
8464 case 'e':
8465 {
8466 x = unwrap_const_vec_duplicate (x);
8467 if (!CONST_INT_P (x))
8468 {
8469 output_operand_lossage ("invalid operand for '%%%c'", code);
8470 return;
8471 }
8472
8473 HOST_WIDE_INT val = INTVAL (x);
8474 if ((val & ~7) == 8 || val == 0xff)
8475 fputc ('b', f);
8476 else if ((val & ~7) == 16 || val == 0xffff)
8477 fputc ('h', f);
8478 else if ((val & ~7) == 32 || val == 0xffffffff)
8479 fputc ('w', f);
8480 else
8481 {
8482 output_operand_lossage ("invalid operand for '%%%c'", code);
8483 return;
8484 }
8485 }
8486 break;
8487
8488 case 'p':
8489 {
8490 int n;
8491
8492 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
8493 {
8494 output_operand_lossage ("invalid operand for '%%%c'", code);
8495 return;
8496 }
8497
8498 asm_fprintf (f, "%d", n);
8499 }
8500 break;
8501
8502 case 'P':
8503 if (!CONST_INT_P (x))
8504 {
8505 output_operand_lossage ("invalid operand for '%%%c'", code);
8506 return;
8507 }
8508
8509 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
8510 break;
8511
8512 case 'H':
8513 if (x == const0_rtx)
8514 {
8515 asm_fprintf (f, "xzr");
8516 break;
8517 }
8518
8519 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
8520 {
8521 output_operand_lossage ("invalid operand for '%%%c'", code);
8522 return;
8523 }
8524
8525 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
8526 break;
8527
8528 case 'I':
8529 {
8530 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
8531 if (CONST_INT_P (x))
8532 asm_fprintf (f, "%wd", INTVAL (x));
8533 else
8534 {
8535 output_operand_lossage ("invalid operand for '%%%c'", code);
8536 return;
8537 }
8538 break;
8539 }
8540
8541 case 'M':
8542 case 'm':
8543 {
8544 int cond_code;
8545 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
8546 if (x == const_true_rtx)
8547 {
8548 if (code == 'M')
8549 fputs ("nv", f);
8550 return;
8551 }
8552
8553 if (!COMPARISON_P (x))
8554 {
8555 output_operand_lossage ("invalid operand for '%%%c'", code);
8556 return;
8557 }
8558
8559 cond_code = aarch64_get_condition_code (x);
8560 gcc_assert (cond_code >= 0);
8561 if (code == 'M')
8562 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
8563 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
8564 fputs (aarch64_sve_condition_codes[cond_code], f);
8565 else
8566 fputs (aarch64_condition_codes[cond_code], f);
8567 }
8568 break;
8569
8570 case 'N':
8571 if (!const_vec_duplicate_p (x, &elt))
8572 {
8573 output_operand_lossage ("invalid vector constant");
8574 return;
8575 }
8576
8577 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8578 asm_fprintf (f, "%wd", -INTVAL (elt));
8579 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8580 && aarch64_print_vector_float_operand (f, x, true))
8581 ;
8582 else
8583 {
8584 output_operand_lossage ("invalid vector constant");
8585 return;
8586 }
8587 break;
8588
8589 case 'b':
8590 case 'h':
8591 case 's':
8592 case 'd':
8593 case 'q':
8594 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8595 {
8596 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8597 return;
8598 }
8599 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
8600 break;
8601
8602 case 'S':
8603 case 'T':
8604 case 'U':
8605 case 'V':
8606 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8607 {
8608 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8609 return;
8610 }
8611 asm_fprintf (f, "%c%d",
8612 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
8613 REGNO (x) - V0_REGNUM + (code - 'S'));
8614 break;
8615
8616 case 'R':
8617 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8618 {
8619 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8620 return;
8621 }
8622 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
8623 break;
8624
8625 case 'X':
8626 if (!CONST_INT_P (x))
8627 {
8628 output_operand_lossage ("invalid operand for '%%%c'", code);
8629 return;
8630 }
8631 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
8632 break;
8633
8634 case 'C':
8635 {
8636 /* Print a replicated constant in hex. */
8637 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8638 {
8639 output_operand_lossage ("invalid operand for '%%%c'", code);
8640 return;
8641 }
8642 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8643 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8644 }
8645 break;
8646
8647 case 'D':
8648 {
8649 /* Print a replicated constant in decimal, treating it as
8650 unsigned. */
8651 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8652 {
8653 output_operand_lossage ("invalid operand for '%%%c'", code);
8654 return;
8655 }
8656 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8657 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8658 }
8659 break;
8660
8661 case 'w':
8662 case 'x':
8663 if (x == const0_rtx
8664 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
8665 {
8666 asm_fprintf (f, "%czr", code);
8667 break;
8668 }
8669
8670 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
8671 {
8672 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
8673 break;
8674 }
8675
8676 if (REG_P (x) && REGNO (x) == SP_REGNUM)
8677 {
8678 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
8679 break;
8680 }
8681
8682 /* Fall through */
8683
8684 case 0:
8685 if (x == NULL)
8686 {
8687 output_operand_lossage ("missing operand");
8688 return;
8689 }
8690
8691 switch (GET_CODE (x))
8692 {
8693 case REG:
8694 if (aarch64_sve_data_mode_p (GET_MODE (x)))
8695 {
8696 if (REG_NREGS (x) == 1)
8697 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
8698 else
8699 {
8700 char suffix
8701 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
8702 asm_fprintf (f, "{z%d.%c - z%d.%c}",
8703 REGNO (x) - V0_REGNUM, suffix,
8704 END_REGNO (x) - V0_REGNUM - 1, suffix);
8705 }
8706 }
8707 else
8708 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
8709 break;
8710
8711 case MEM:
8712 output_address (GET_MODE (x), XEXP (x, 0));
8713 break;
8714
8715 case LABEL_REF:
8716 case SYMBOL_REF:
8717 output_addr_const (asm_out_file, x);
8718 break;
8719
8720 case CONST_INT:
8721 asm_fprintf (f, "%wd", INTVAL (x));
8722 break;
8723
8724 case CONST:
8725 if (!VECTOR_MODE_P (GET_MODE (x)))
8726 {
8727 output_addr_const (asm_out_file, x);
8728 break;
8729 }
8730 /* fall through */
8731
8732 case CONST_VECTOR:
8733 if (!const_vec_duplicate_p (x, &elt))
8734 {
8735 output_operand_lossage ("invalid vector constant");
8736 return;
8737 }
8738
8739 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8740 asm_fprintf (f, "%wd", INTVAL (elt));
8741 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8742 && aarch64_print_vector_float_operand (f, x, false))
8743 ;
8744 else
8745 {
8746 output_operand_lossage ("invalid vector constant");
8747 return;
8748 }
8749 break;
8750
8751 case CONST_DOUBLE:
8752 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
8753 be getting CONST_DOUBLEs holding integers. */
8754 gcc_assert (GET_MODE (x) != VOIDmode);
8755 if (aarch64_float_const_zero_rtx_p (x))
8756 {
8757 fputc ('0', f);
8758 break;
8759 }
8760 else if (aarch64_float_const_representable_p (x))
8761 {
8762 #define buf_size 20
8763 char float_buf[buf_size] = {'\0'};
8764 real_to_decimal_for_mode (float_buf,
8765 CONST_DOUBLE_REAL_VALUE (x),
8766 buf_size, buf_size,
8767 1, GET_MODE (x));
8768 asm_fprintf (asm_out_file, "%s", float_buf);
8769 break;
8770 #undef buf_size
8771 }
8772 output_operand_lossage ("invalid constant");
8773 return;
8774 default:
8775 output_operand_lossage ("invalid operand");
8776 return;
8777 }
8778 break;
8779
8780 case 'A':
8781 if (GET_CODE (x) == HIGH)
8782 x = XEXP (x, 0);
8783
8784 switch (aarch64_classify_symbolic_expression (x))
8785 {
8786 case SYMBOL_SMALL_GOT_4G:
8787 asm_fprintf (asm_out_file, ":got:");
8788 break;
8789
8790 case SYMBOL_SMALL_TLSGD:
8791 asm_fprintf (asm_out_file, ":tlsgd:");
8792 break;
8793
8794 case SYMBOL_SMALL_TLSDESC:
8795 asm_fprintf (asm_out_file, ":tlsdesc:");
8796 break;
8797
8798 case SYMBOL_SMALL_TLSIE:
8799 asm_fprintf (asm_out_file, ":gottprel:");
8800 break;
8801
8802 case SYMBOL_TLSLE24:
8803 asm_fprintf (asm_out_file, ":tprel:");
8804 break;
8805
8806 case SYMBOL_TINY_GOT:
8807 gcc_unreachable ();
8808 break;
8809
8810 default:
8811 break;
8812 }
8813 output_addr_const (asm_out_file, x);
8814 break;
8815
8816 case 'L':
8817 switch (aarch64_classify_symbolic_expression (x))
8818 {
8819 case SYMBOL_SMALL_GOT_4G:
8820 asm_fprintf (asm_out_file, ":lo12:");
8821 break;
8822
8823 case SYMBOL_SMALL_TLSGD:
8824 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
8825 break;
8826
8827 case SYMBOL_SMALL_TLSDESC:
8828 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
8829 break;
8830
8831 case SYMBOL_SMALL_TLSIE:
8832 asm_fprintf (asm_out_file, ":gottprel_lo12:");
8833 break;
8834
8835 case SYMBOL_TLSLE12:
8836 asm_fprintf (asm_out_file, ":tprel_lo12:");
8837 break;
8838
8839 case SYMBOL_TLSLE24:
8840 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
8841 break;
8842
8843 case SYMBOL_TINY_GOT:
8844 asm_fprintf (asm_out_file, ":got:");
8845 break;
8846
8847 case SYMBOL_TINY_TLSIE:
8848 asm_fprintf (asm_out_file, ":gottprel:");
8849 break;
8850
8851 default:
8852 break;
8853 }
8854 output_addr_const (asm_out_file, x);
8855 break;
8856
8857 case 'G':
8858 switch (aarch64_classify_symbolic_expression (x))
8859 {
8860 case SYMBOL_TLSLE24:
8861 asm_fprintf (asm_out_file, ":tprel_hi12:");
8862 break;
8863 default:
8864 break;
8865 }
8866 output_addr_const (asm_out_file, x);
8867 break;
8868
8869 case 'k':
8870 {
8871 HOST_WIDE_INT cond_code;
8872
8873 if (!CONST_INT_P (x))
8874 {
8875 output_operand_lossage ("invalid operand for '%%%c'", code);
8876 return;
8877 }
8878
8879 cond_code = INTVAL (x);
8880 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
8881 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
8882 }
8883 break;
8884
8885 case 'y':
8886 case 'z':
8887 {
8888 machine_mode mode = GET_MODE (x);
8889
8890 if (GET_CODE (x) != MEM
8891 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8892 {
8893 output_operand_lossage ("invalid operand for '%%%c'", code);
8894 return;
8895 }
8896
8897 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8898 code == 'y'
8899 ? ADDR_QUERY_LDP_STP_N
8900 : ADDR_QUERY_LDP_STP))
8901 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8902 }
8903 break;
8904
8905 default:
8906 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8907 return;
8908 }
8909 }
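
/* Editorial sketch (not part of aarch64.c): the asserts below restate, on
   plain C types, the value transformations performed by the '%X' and '%p'
   modifiers handled above; the sketch_* names are made up for illustration.  */
#include <assert.h>

/* '%X' prints only the low 16 bits of the constant operand, in hex.  */
static unsigned long long
sketch_modifier_X (unsigned long long val)
{
  return val & 0xffff;
}

/* '%p' prints log2 of an exact power of two and is rejected otherwise
   (modeled here by returning -1 where the real code calls
   output_operand_lossage).  */
static int
sketch_modifier_p (unsigned long long val)
{
  if (val == 0 || (val & (val - 1)) != 0)
    return -1;
  int n = 0;
  while ((val >>= 1) != 0)
    n++;
  return n;
}

static void
sketch_modifier_selfcheck (void)
{
  assert (sketch_modifier_X (0x12345) == 0x2345);  /* printed as 0x2345  */
  assert (sketch_modifier_p (64) == 6);            /* printed as 6       */
  assert (sketch_modifier_p (0x1234) == -1);       /* not a power of two */
}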
8910
8911 /* Print address 'x' of a memory access with mode 'mode'.
8912    'type' is the aarch64_addr_query_type context required by
8913    aarch64_classify_address (e.g. ADDR_QUERY_LDP_STP for an LDP/STP operand).  */
8914 static bool
8915 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8916 aarch64_addr_query_type type)
8917 {
8918 struct aarch64_address_info addr;
8919 unsigned int size;
8920
8921 /* Check all addresses are Pmode - including ILP32. */
8922 if (GET_MODE (x) != Pmode
8923 && (!CONST_INT_P (x)
8924 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8925 {
8926 output_operand_lossage ("invalid address mode");
8927 return false;
8928 }
8929
8930 if (aarch64_classify_address (&addr, x, mode, true, type))
8931 switch (addr.type)
8932 {
8933 case ADDRESS_REG_IMM:
8934 if (known_eq (addr.const_offset, 0))
8935 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8936 else if (aarch64_sve_data_mode_p (mode))
8937 {
8938 HOST_WIDE_INT vnum
8939 = exact_div (addr.const_offset,
8940 BYTES_PER_SVE_VECTOR).to_constant ();
8941 asm_fprintf (f, "[%s, #%wd, mul vl]",
8942 reg_names[REGNO (addr.base)], vnum);
8943 }
8944 else if (aarch64_sve_pred_mode_p (mode))
8945 {
8946 HOST_WIDE_INT vnum
8947 = exact_div (addr.const_offset,
8948 BYTES_PER_SVE_PRED).to_constant ();
8949 asm_fprintf (f, "[%s, #%wd, mul vl]",
8950 reg_names[REGNO (addr.base)], vnum);
8951 }
8952 else
8953 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8954 INTVAL (addr.offset));
8955 return true;
8956
8957 case ADDRESS_REG_REG:
8958 if (addr.shift == 0)
8959 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8960 reg_names [REGNO (addr.offset)]);
8961 else
8962 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8963 reg_names [REGNO (addr.offset)], addr.shift);
8964 return true;
8965
8966 case ADDRESS_REG_UXTW:
8967 if (addr.shift == 0)
8968 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
8969 REGNO (addr.offset) - R0_REGNUM);
8970 else
8971 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
8972 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8973 return true;
8974
8975 case ADDRESS_REG_SXTW:
8976 if (addr.shift == 0)
8977 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
8978 REGNO (addr.offset) - R0_REGNUM);
8979 else
8980 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
8981 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8982 return true;
8983
8984 case ADDRESS_REG_WB:
8985 /* Writeback is only supported for fixed-width modes. */
8986 size = GET_MODE_SIZE (mode).to_constant ();
8987 switch (GET_CODE (x))
8988 {
8989 case PRE_INC:
8990 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
8991 return true;
8992 case POST_INC:
8993 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
8994 return true;
8995 case PRE_DEC:
8996 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
8997 return true;
8998 case POST_DEC:
8999 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
9000 return true;
9001 case PRE_MODIFY:
9002 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
9003 INTVAL (addr.offset));
9004 return true;
9005 case POST_MODIFY:
9006 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
9007 INTVAL (addr.offset));
9008 return true;
9009 default:
9010 break;
9011 }
9012 break;
9013
9014 case ADDRESS_LO_SUM:
9015 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
9016 output_addr_const (f, addr.offset);
9017 asm_fprintf (f, "]");
9018 return true;
9019
9020 case ADDRESS_SYMBOLIC:
9021 output_addr_const (f, x);
9022 return true;
9023 }
9024
9025 return false;
9026 }
9027
9028 /* Print address 'x' of a memory access with mode 'mode'. */
9029 static void
9030 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
9031 {
9032 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
9033 output_addr_const (f, x);
9034 }
9035
9036 bool
9037 aarch64_label_mentioned_p (rtx x)
9038 {
9039 const char *fmt;
9040 int i;
9041
9042 if (GET_CODE (x) == LABEL_REF)
9043 return true;
9044
9045 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
9046 referencing instruction, but they are constant offsets, not
9047 symbols. */
9048 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
9049 return false;
9050
9051 fmt = GET_RTX_FORMAT (GET_CODE (x));
9052 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
9053 {
9054 if (fmt[i] == 'E')
9055 {
9056 int j;
9057
9058 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
9059 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
9060 return 1;
9061 }
9062 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
9063 return 1;
9064 }
9065
9066 return 0;
9067 }
9068
9069 /* Implement REGNO_REG_CLASS. */
9070
9071 enum reg_class
9072 aarch64_regno_regclass (unsigned regno)
9073 {
9074 if (GP_REGNUM_P (regno))
9075 return GENERAL_REGS;
9076
9077 if (regno == SP_REGNUM)
9078 return STACK_REG;
9079
9080 if (regno == FRAME_POINTER_REGNUM
9081 || regno == ARG_POINTER_REGNUM)
9082 return POINTER_REGS;
9083
9084 if (FP_REGNUM_P (regno))
9085 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
9086 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
9087
9088 if (PR_REGNUM_P (regno))
9089 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
9090
9091 return NO_REGS;
9092 }
9093
9094 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
9095 If OFFSET is out of range, return an offset of an anchor point
9096 that is in range. Return 0 otherwise. */
9097
9098 static HOST_WIDE_INT
9099 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
9100 machine_mode mode)
9101 {
9102 /* Does it look like we'll need a 16-byte load/store-pair operation? */
9103 if (size > 16)
9104 return (offset + 0x400) & ~0x7f0;
9105
9106 /* For offsets that aren't a multiple of the access size, the limit is
9107 -256...255. */
9108 if (offset & (size - 1))
9109 {
9110 /* BLKmode typically uses LDP of X-registers. */
9111 if (mode == BLKmode)
9112 return (offset + 512) & ~0x3ff;
9113 return (offset + 0x100) & ~0x1ff;
9114 }
9115
9116 /* Small negative offsets are supported. */
9117 if (IN_RANGE (offset, -256, 0))
9118 return 0;
9119
9120 if (mode == TImode || mode == TFmode)
9121 return (offset + 0x100) & ~0x1ff;
9122
9123   /* Use a 12-bit offset, scaled by the access size.  */
9124 return offset & (~0xfff * size);
9125 }
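
/* Editorial sketch (not part of aarch64.c): the rounding above, restated on
   plain C types for the common case of a scalar access whose offset is a
   multiple of its size (the BLKmode and TImode/TFmode special cases are left
   out).  The sketch_* names are made up for illustration.  */
#include <assert.h>

static long long
sketch_anchor_offset (long long offset, long long size)
{
  if (size > 16)			/* Likely an LDP/STP of X-registers.  */
    return (offset + 0x400) & ~0x7f0;
  if (offset & (size - 1))		/* Unscaled form, range -256..255.  */
    return (offset + 0x100) & ~0x1ff;
  if (offset >= -256 && offset <= 0)	/* Reachable directly, no anchor.  */
    return 0;
  return offset & (~0xfffLL * size);	/* 12-bit offset scaled by size.  */
}

static void
sketch_anchor_offset_selfcheck (void)
{
  /* A 4-byte access at base + 0x12344 is re-anchored at base + 0x10000,
     leaving a residual of 0x2344, which fits the scaled 12-bit immediate
     of a plain LDR/STR (at most 0xfff * 4 = 0x3ffc).  */
  assert (sketch_anchor_offset (0x12344, 4) == 0x10000);
  assert (sketch_anchor_offset (-32, 8) == 0);
}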
9126
9127 static rtx
9128 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
9129 {
9130 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
9131 where mask is selected by alignment and size of the offset.
9132 We try to pick as large a range for the offset as possible to
9133 maximize the chance of a CSE. However, for aligned addresses
9134 we limit the range to 4k so that structures with different sized
9135 elements are likely to use the same base. We need to be careful
9136 not to split a CONST for some forms of address expression, otherwise
9137 it will generate sub-optimal code. */
9138
9139 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
9140 {
9141 rtx base = XEXP (x, 0);
9142 rtx offset_rtx = XEXP (x, 1);
9143 HOST_WIDE_INT offset = INTVAL (offset_rtx);
9144
9145 if (GET_CODE (base) == PLUS)
9146 {
9147 rtx op0 = XEXP (base, 0);
9148 rtx op1 = XEXP (base, 1);
9149
9150 /* Force any scaling into a temp for CSE. */
9151 op0 = force_reg (Pmode, op0);
9152 op1 = force_reg (Pmode, op1);
9153
9154 /* Let the pointer register be in op0. */
9155 if (REG_POINTER (op1))
9156 std::swap (op0, op1);
9157
9158 /* If the pointer is virtual or frame related, then we know that
9159 virtual register instantiation or register elimination is going
9160 to apply a second constant. We want the two constants folded
9161 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
9162 if (virt_or_elim_regno_p (REGNO (op0)))
9163 {
9164 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
9165 NULL_RTX, true, OPTAB_DIRECT);
9166 return gen_rtx_PLUS (Pmode, base, op1);
9167 }
9168
9169 /* Otherwise, in order to encourage CSE (and thence loop strength
9170 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
9171 base = expand_binop (Pmode, add_optab, op0, op1,
9172 NULL_RTX, true, OPTAB_DIRECT);
9173 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
9174 }
9175
9176 HOST_WIDE_INT size;
9177 if (GET_MODE_SIZE (mode).is_constant (&size))
9178 {
9179 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
9180 mode);
9181 if (base_offset != 0)
9182 {
9183 base = plus_constant (Pmode, base, base_offset);
9184 base = force_operand (base, NULL_RTX);
9185 return plus_constant (Pmode, base, offset - base_offset);
9186 }
9187 }
9188 }
9189
9190 return x;
9191 }
9192
9193 static reg_class_t
9194 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
9195 reg_class_t rclass,
9196 machine_mode mode,
9197 secondary_reload_info *sri)
9198 {
9199 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
9200 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
9201 comment at the head of aarch64-sve.md for more details about the
9202 big-endian handling. */
9203 if (BYTES_BIG_ENDIAN
9204 && reg_class_subset_p (rclass, FP_REGS)
9205 && !((REG_P (x) && HARD_REGISTER_P (x))
9206 || aarch64_simd_valid_immediate (x, NULL))
9207 && aarch64_sve_data_mode_p (mode))
9208 {
9209 sri->icode = CODE_FOR_aarch64_sve_reload_be;
9210 return NO_REGS;
9211 }
9212
9213 /* If we have to disable direct literal pool loads and stores because the
9214 function is too big, then we need a scratch register. */
9215 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
9216 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
9217 || targetm.vector_mode_supported_p (GET_MODE (x)))
9218 && !aarch64_pcrelative_literal_loads)
9219 {
9220 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
9221 return NO_REGS;
9222 }
9223
9224 /* Without the TARGET_SIMD instructions we cannot move a Q register
9225 to a Q register directly. We need a scratch. */
9226 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
9227 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
9228 && reg_class_subset_p (rclass, FP_REGS))
9229 {
9230 sri->icode = code_for_aarch64_reload_mov (mode);
9231 return NO_REGS;
9232 }
9233
9234   /* A TFmode or TImode memory access should be handled via FP_REGS
9235      because AArch64 has richer addressing modes for LDR/STR instructions
9236      than for LDP/STP instructions.  */
9237 if (TARGET_FLOAT && rclass == GENERAL_REGS
9238 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
9239 return FP_REGS;
9240
9241   if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P (x))
9242 return GENERAL_REGS;
9243
9244 return NO_REGS;
9245 }
9246
9247 static bool
9248 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
9249 {
9250 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
9251
9252 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
9253 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
9254 if (frame_pointer_needed)
9255 return to == HARD_FRAME_POINTER_REGNUM;
9256 return true;
9257 }
9258
9259 poly_int64
9260 aarch64_initial_elimination_offset (unsigned from, unsigned to)
9261 {
9262 if (to == HARD_FRAME_POINTER_REGNUM)
9263 {
9264 if (from == ARG_POINTER_REGNUM)
9265 return cfun->machine->frame.hard_fp_offset;
9266
9267 if (from == FRAME_POINTER_REGNUM)
9268 return cfun->machine->frame.hard_fp_offset
9269 - cfun->machine->frame.locals_offset;
9270 }
9271
9272 if (to == STACK_POINTER_REGNUM)
9273 {
9274 if (from == FRAME_POINTER_REGNUM)
9275 return cfun->machine->frame.frame_size
9276 - cfun->machine->frame.locals_offset;
9277 }
9278
9279 return cfun->machine->frame.frame_size;
9280 }
9281
9282 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
9283 previous frame. */
9284
9285 rtx
9286 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
9287 {
9288 if (count != 0)
9289 return const0_rtx;
9290 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
9291 }
9292
9293
9294 static void
9295 aarch64_asm_trampoline_template (FILE *f)
9296 {
9297 int offset1 = 16;
9298 int offset2 = 20;
9299
9300 if (aarch64_bti_enabled ())
9301 {
9302 asm_fprintf (f, "\thint\t34 // bti c\n");
9303 offset1 -= 4;
9304 offset2 -= 4;
9305 }
9306
9307 if (TARGET_ILP32)
9308 {
9309 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
9310 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
9311 offset1);
9312 }
9313 else
9314 {
9315 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
9316 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
9317 offset2);
9318 }
9319 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
9320
9321   /* The trampoline needs an extra padding instruction.  If BTI is
9322      enabled, the padding instruction is replaced by the BTI instruction
9323      at the beginning.  */
9324 if (!aarch64_bti_enabled ())
9325 assemble_aligned_integer (4, const0_rtx);
9326
9327 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9328 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9329 }
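
/* Editorial sketch (not part of aarch64.c): the LP64 trampoline produced by
   the template above and filled in by aarch64_trampoline_init below, written
   out as a C layout.  The register names assume IP1 is x17 and the static
   chain register is x18, as in the usual AArch64 configuration; the struct
   and field names are made up for illustration.  */
struct sketch_aarch64_trampoline_lp64
{
  /* 16 bytes of code:
	ldr	x17, .+16	// load FUNC
	ldr	x18, .+20	// load CHAIN
	br	x17
	.word	0		// padding
     With BTI the "bti c" hint comes first, the padding word is dropped and
     the two PC-relative offsets shrink by 4.  */
  unsigned int code[4];
  unsigned long long func;	/* Target address, written by trampoline_init.  */
  unsigned long long chain;	/* Static chain value.  */
};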
9330
9331 static void
9332 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
9333 {
9334 rtx fnaddr, mem, a_tramp;
9335 const int tramp_code_sz = 16;
9336
9337   /* Don't need to copy the trailing D-words; we fill those in below.  */
9338 emit_block_move (m_tramp, assemble_trampoline_template (),
9339 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
9340 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
9341 fnaddr = XEXP (DECL_RTL (fndecl), 0);
9342 if (GET_MODE (fnaddr) != ptr_mode)
9343 fnaddr = convert_memory_address (ptr_mode, fnaddr);
9344 emit_move_insn (mem, fnaddr);
9345
9346 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
9347 emit_move_insn (mem, chain_value);
9348
9349 /* XXX We should really define a "clear_cache" pattern and use
9350 gen_clear_cache(). */
9351 a_tramp = XEXP (m_tramp, 0);
9352 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
9353 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
9354 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
9355 ptr_mode);
9356 }
9357
9358 static unsigned char
9359 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
9360 {
9361 /* ??? Logically we should only need to provide a value when
9362 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
9363 can hold MODE, but at the moment we need to handle all modes.
9364 Just ignore any runtime parts for registers that can't store them. */
9365 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
9366 unsigned int nregs;
9367 switch (regclass)
9368 {
9369 case TAILCALL_ADDR_REGS:
9370 case POINTER_REGS:
9371 case GENERAL_REGS:
9372 case ALL_REGS:
9373 case POINTER_AND_FP_REGS:
9374 case FP_REGS:
9375 case FP_LO_REGS:
9376 case FP_LO8_REGS:
9377 if (aarch64_sve_data_mode_p (mode)
9378 && constant_multiple_p (GET_MODE_SIZE (mode),
9379 BYTES_PER_SVE_VECTOR, &nregs))
9380 return nregs;
9381 return (aarch64_vector_data_mode_p (mode)
9382 ? CEIL (lowest_size, UNITS_PER_VREG)
9383 : CEIL (lowest_size, UNITS_PER_WORD));
9384 case STACK_REG:
9385 case PR_REGS:
9386 case PR_LO_REGS:
9387 case PR_HI_REGS:
9388 return 1;
9389
9390 case NO_REGS:
9391 return 0;
9392
9393 default:
9394 break;
9395 }
9396 gcc_unreachable ();
9397 }
9398
9399 static reg_class_t
9400 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
9401 {
9402 if (regclass == POINTER_REGS)
9403 return GENERAL_REGS;
9404
9405 if (regclass == STACK_REG)
9406 {
9407       if (REG_P (x)
9408 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
9409 return regclass;
9410
9411 return NO_REGS;
9412 }
9413
9414   /* Register elimination can result in a request for
9415      SP+constant->FP_REGS.  We cannot support such operations, which
9416      use SP as source and an FP_REG as destination, so reject them
9417      outright.  */
9418 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
9419 {
9420 rtx lhs = XEXP (x, 0);
9421
9422 /* Look through a possible SUBREG introduced by ILP32. */
9423 if (GET_CODE (lhs) == SUBREG)
9424 lhs = SUBREG_REG (lhs);
9425
9426 gcc_assert (REG_P (lhs));
9427 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
9428 POINTER_REGS));
9429 return NO_REGS;
9430 }
9431
9432 return regclass;
9433 }
9434
9435 void
9436 aarch64_asm_output_labelref (FILE* f, const char *name)
9437 {
9438 asm_fprintf (f, "%U%s", name);
9439 }
9440
9441 static void
9442 aarch64_elf_asm_constructor (rtx symbol, int priority)
9443 {
9444 if (priority == DEFAULT_INIT_PRIORITY)
9445 default_ctor_section_asm_out_constructor (symbol, priority);
9446 else
9447 {
9448 section *s;
9449       /* Priority is known to be in the range [0, 65535], so 18 bytes
9450          would be enough, but the compiler might not know that.  To avoid
9451          a -Wformat-truncation false positive, use a larger size.  */
9452 char buf[23];
9453 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
9454 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9455 switch_to_section (s);
9456 assemble_align (POINTER_SIZE);
9457 assemble_aligned_integer (POINTER_BYTES, symbol);
9458 }
9459 }
9460
9461 static void
9462 aarch64_elf_asm_destructor (rtx symbol, int priority)
9463 {
9464 if (priority == DEFAULT_INIT_PRIORITY)
9465 default_dtor_section_asm_out_destructor (symbol, priority);
9466 else
9467 {
9468 section *s;
9469       /* Priority is known to be in the range [0, 65535], so 18 bytes
9470          would be enough, but the compiler might not know that.  To avoid
9471          a -Wformat-truncation false positive, use a larger size.  */
9472 char buf[23];
9473 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
9474 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9475 switch_to_section (s);
9476 assemble_align (POINTER_SIZE);
9477 assemble_aligned_integer (POINTER_BYTES, symbol);
9478 }
9479 }
9480
9481 const char*
9482 aarch64_output_casesi (rtx *operands)
9483 {
9484 char buf[100];
9485 char label[100];
9486 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
9487 int index;
9488 static const char *const patterns[4][2] =
9489 {
9490 {
9491 "ldrb\t%w3, [%0,%w1,uxtw]",
9492 "add\t%3, %4, %w3, sxtb #2"
9493 },
9494 {
9495 "ldrh\t%w3, [%0,%w1,uxtw #1]",
9496 "add\t%3, %4, %w3, sxth #2"
9497 },
9498 {
9499 "ldr\t%w3, [%0,%w1,uxtw #2]",
9500 "add\t%3, %4, %w3, sxtw #2"
9501 },
9502 /* We assume that DImode is only generated when not optimizing and
9503 that we don't really need 64-bit address offsets. That would
9504 imply an object file with 8GB of code in a single function! */
9505 {
9506 "ldr\t%w3, [%0,%w1,uxtw #2]",
9507 "add\t%3, %4, %w3, sxtw #2"
9508 }
9509 };
9510
9511 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
9512
9513 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
9514 index = exact_log2 (GET_MODE_SIZE (mode));
9515
9516 gcc_assert (index >= 0 && index <= 3);
9517
9518   /* Need to implement table size reduction, by changing the code below.  */
9519 output_asm_insn (patterns[index][0], operands);
9520 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
9521 snprintf (buf, sizeof (buf),
9522 "adr\t%%4, %s", targetm.strip_name_encoding (label));
9523 output_asm_insn (buf, operands);
9524 output_asm_insn (patterns[index][1], operands);
9525 output_asm_insn ("br\t%3", operands);
9526 assemble_label (asm_out_file, label);
9527 return "";
9528 }
9529
9530
9531 /* Return size in bits of an arithmetic operand which is shifted/scaled and
9532 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
9533 operator. */
9534
9535 int
9536 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
9537 {
9538 if (shift >= 0 && shift <= 3)
9539 {
9540 int size;
9541 for (size = 8; size <= 32; size *= 2)
9542 {
9543 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
9544 if (mask == bits << shift)
9545 return size;
9546 }
9547 }
9548 return 0;
9549 }
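
/* Editorial sketch (not part of aarch64.c): the same test on plain integers,
   with worked examples.  A mask qualifies when it is an 8-, 16- or 32-bit
   all-ones pattern shifted left by the 0..3 scaling amount, i.e. exactly what
   UXTB/UXTH/UXTW combined with an optional LSL can describe.  The sketch_*
   names are made up for illustration.  */
#include <assert.h>

static int
sketch_uxt_size (int shift, long long mask)
{
  if (shift >= 0 && shift <= 3)
    for (int size = 8; size <= 32; size *= 2)
      if (mask == (((1LL << size) - 1) << shift))
	return size;
  return 0;
}

static void
sketch_uxt_size_selfcheck (void)
{
  assert (sketch_uxt_size (1, 0x1fe) == 8);	/* UXTB, scaled by 2.  */
  assert (sketch_uxt_size (2, 0x3fffc) == 16);	/* UXTH, scaled by 4.  */
  assert (sketch_uxt_size (0, 0xff00) == 0);	/* Not a low mask: rejected.  */
}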
9550
9551 /* Constant pools are per-function only when PC-relative literal
9552    loads are enabled or we are using the large memory
9553    model.  */
9554
9555 static inline bool
9556 aarch64_can_use_per_function_literal_pools_p (void)
9557 {
9558 return (aarch64_pcrelative_literal_loads
9559 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
9560 }
9561
9562 static bool
9563 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
9564 {
9565 /* We can't use blocks for constants when we're using a per-function
9566 constant pool. */
9567 return !aarch64_can_use_per_function_literal_pools_p ();
9568 }
9569
9570 /* Select appropriate section for constants depending
9571 on where we place literal pools. */
9572
9573 static section *
9574 aarch64_select_rtx_section (machine_mode mode,
9575 rtx x,
9576 unsigned HOST_WIDE_INT align)
9577 {
9578 if (aarch64_can_use_per_function_literal_pools_p ())
9579 return function_section (current_function_decl);
9580
9581 return default_elf_select_rtx_section (mode, x, align);
9582 }
9583
9584 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
9585 void
9586 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
9587 HOST_WIDE_INT offset)
9588 {
9589 /* When using per-function literal pools, we must ensure that any code
9590 section is aligned to the minimal instruction length, lest we get
9591 errors from the assembler re "unaligned instructions". */
9592 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
9593 ASM_OUTPUT_ALIGN (f, 2);
9594 }
9595
9596 /* Costs. */
9597
9598 /* Helper function for rtx cost calculation. Strip a shift expression
9599 from X. Returns the inner operand if successful, or the original
9600 expression on failure. */
9601 static rtx
9602 aarch64_strip_shift (rtx x)
9603 {
9604 rtx op = x;
9605
9606 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
9607 we can convert both to ROR during final output. */
9608 if ((GET_CODE (op) == ASHIFT
9609 || GET_CODE (op) == ASHIFTRT
9610 || GET_CODE (op) == LSHIFTRT
9611 || GET_CODE (op) == ROTATERT
9612 || GET_CODE (op) == ROTATE)
9613 && CONST_INT_P (XEXP (op, 1)))
9614 return XEXP (op, 0);
9615
9616 if (GET_CODE (op) == MULT
9617 && CONST_INT_P (XEXP (op, 1))
9618 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
9619 return XEXP (op, 0);
9620
9621 return x;
9622 }
9623
9624 /* Helper function for rtx cost calculation. Strip an extend
9625 expression from X. Returns the inner operand if successful, or the
9626 original expression on failure. We deal with a number of possible
9627 canonicalization variations here. If STRIP_SHIFT is true, then
9628 we can strip off a shift also. */
9629 static rtx
9630 aarch64_strip_extend (rtx x, bool strip_shift)
9631 {
9632 scalar_int_mode mode;
9633 rtx op = x;
9634
9635 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
9636 return op;
9637
9638 /* Zero and sign extraction of a widened value. */
9639 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
9640 && XEXP (op, 2) == const0_rtx
9641 && GET_CODE (XEXP (op, 0)) == MULT
9642 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
9643 XEXP (op, 1)))
9644 return XEXP (XEXP (op, 0), 0);
9645
9646 /* It can also be represented (for zero-extend) as an AND with an
9647 immediate. */
9648 if (GET_CODE (op) == AND
9649 && GET_CODE (XEXP (op, 0)) == MULT
9650 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
9651 && CONST_INT_P (XEXP (op, 1))
9652 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
9653 INTVAL (XEXP (op, 1))) != 0)
9654 return XEXP (XEXP (op, 0), 0);
9655
9656 /* Now handle extended register, as this may also have an optional
9657 left shift by 1..4. */
9658 if (strip_shift
9659 && GET_CODE (op) == ASHIFT
9660 && CONST_INT_P (XEXP (op, 1))
9661 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
9662 op = XEXP (op, 0);
9663
9664 if (GET_CODE (op) == ZERO_EXTEND
9665 || GET_CODE (op) == SIGN_EXTEND)
9666 op = XEXP (op, 0);
9667
9668 if (op != x)
9669 return op;
9670
9671 return x;
9672 }
9673
9674 /* Return true iff CODE is a shift supported in combination
9675 with arithmetic instructions. */
9676
9677 static bool
9678 aarch64_shift_p (enum rtx_code code)
9679 {
9680 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
9681 }
9682
9683
9684 /* Return true iff X is a cheap shift without a sign extend. */
9685
9686 static bool
9687 aarch64_cheap_mult_shift_p (rtx x)
9688 {
9689 rtx op0, op1;
9690
9691 op0 = XEXP (x, 0);
9692 op1 = XEXP (x, 1);
9693
9694 if (!(aarch64_tune_params.extra_tuning_flags
9695 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
9696 return false;
9697
9698 if (GET_CODE (op0) == SIGN_EXTEND)
9699 return false;
9700
9701 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
9702 && UINTVAL (op1) <= 4)
9703 return true;
9704
9705 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
9706 return false;
9707
9708 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
9709
9710 if (l2 > 0 && l2 <= 4)
9711 return true;
9712
9713 return false;
9714 }
9715
9716 /* Helper function for rtx cost calculation. Calculate the cost of
9717 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
9718 Return the calculated cost of the expression, recursing manually in to
9719 operands where needed. */
9720
9721 static int
9722 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
9723 {
9724 rtx op0, op1;
9725 const struct cpu_cost_table *extra_cost
9726 = aarch64_tune_params.insn_extra_cost;
9727 int cost = 0;
9728 bool compound_p = (outer == PLUS || outer == MINUS);
9729 machine_mode mode = GET_MODE (x);
9730
9731 gcc_checking_assert (code == MULT);
9732
9733 op0 = XEXP (x, 0);
9734 op1 = XEXP (x, 1);
9735
9736 if (VECTOR_MODE_P (mode))
9737 mode = GET_MODE_INNER (mode);
9738
9739 /* Integer multiply/fma. */
9740 if (GET_MODE_CLASS (mode) == MODE_INT)
9741 {
9742 /* The multiply will be canonicalized as a shift, cost it as such. */
9743 if (aarch64_shift_p (GET_CODE (x))
9744 || (CONST_INT_P (op1)
9745 && exact_log2 (INTVAL (op1)) > 0))
9746 {
9747 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
9748 || GET_CODE (op0) == SIGN_EXTEND;
9749 if (speed)
9750 {
9751 if (compound_p)
9752 {
9753 /* If the shift is considered cheap,
9754 then don't add any cost. */
9755 if (aarch64_cheap_mult_shift_p (x))
9756 ;
9757 else if (REG_P (op1))
9758 /* ARITH + shift-by-register. */
9759 cost += extra_cost->alu.arith_shift_reg;
9760 else if (is_extend)
9761 /* ARITH + extended register. We don't have a cost field
9762 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
9763 cost += extra_cost->alu.extend_arith;
9764 else
9765 /* ARITH + shift-by-immediate. */
9766 cost += extra_cost->alu.arith_shift;
9767 }
9768 else
9769 /* LSL (immediate). */
9770 cost += extra_cost->alu.shift;
9771
9772 }
9773 /* Strip extends as we will have costed them in the case above. */
9774 if (is_extend)
9775 op0 = aarch64_strip_extend (op0, true);
9776
9777 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
9778
9779 return cost;
9780 }
9781
9782 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
9783 compound and let the below cases handle it. After all, MNEG is a
9784 special-case alias of MSUB. */
9785 if (GET_CODE (op0) == NEG)
9786 {
9787 op0 = XEXP (op0, 0);
9788 compound_p = true;
9789 }
9790
9791 /* Integer multiplies or FMAs have zero/sign extending variants. */
9792 if ((GET_CODE (op0) == ZERO_EXTEND
9793 && GET_CODE (op1) == ZERO_EXTEND)
9794 || (GET_CODE (op0) == SIGN_EXTEND
9795 && GET_CODE (op1) == SIGN_EXTEND))
9796 {
9797 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
9798 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
9799
9800 if (speed)
9801 {
9802 if (compound_p)
9803 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
9804 cost += extra_cost->mult[0].extend_add;
9805 else
9806 /* MUL/SMULL/UMULL. */
9807 cost += extra_cost->mult[0].extend;
9808 }
9809
9810 return cost;
9811 }
9812
9813 /* This is either an integer multiply or a MADD. In both cases
9814 we want to recurse and cost the operands. */
9815 cost += rtx_cost (op0, mode, MULT, 0, speed);
9816 cost += rtx_cost (op1, mode, MULT, 1, speed);
9817
9818 if (speed)
9819 {
9820 if (compound_p)
9821 /* MADD/MSUB. */
9822 cost += extra_cost->mult[mode == DImode].add;
9823 else
9824 /* MUL. */
9825 cost += extra_cost->mult[mode == DImode].simple;
9826 }
9827
9828 return cost;
9829 }
9830 else
9831 {
9832 if (speed)
9833 {
9834 /* Floating-point FMA/FMUL can also support negations of the
9835 operands, unless the rounding mode is upward or downward in
9836 	     which case FNMUL is different from FMUL with operand negation.  */
9837 bool neg0 = GET_CODE (op0) == NEG;
9838 bool neg1 = GET_CODE (op1) == NEG;
9839 if (compound_p || !flag_rounding_math || (neg0 && neg1))
9840 {
9841 if (neg0)
9842 op0 = XEXP (op0, 0);
9843 if (neg1)
9844 op1 = XEXP (op1, 0);
9845 }
9846
9847 if (compound_p)
9848 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9849 cost += extra_cost->fp[mode == DFmode].fma;
9850 else
9851 /* FMUL/FNMUL. */
9852 cost += extra_cost->fp[mode == DFmode].mult;
9853 }
9854
9855 cost += rtx_cost (op0, mode, MULT, 0, speed);
9856 cost += rtx_cost (op1, mode, MULT, 1, speed);
9857 return cost;
9858 }
9859 }
9860
9861 static int
9862 aarch64_address_cost (rtx x,
9863 machine_mode mode,
9864 addr_space_t as ATTRIBUTE_UNUSED,
9865 bool speed)
9866 {
9867 enum rtx_code c = GET_CODE (x);
9868 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
9869 struct aarch64_address_info info;
9870 int cost = 0;
9871 info.shift = 0;
9872
9873 if (!aarch64_classify_address (&info, x, mode, false))
9874 {
9875 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
9876 {
9877 /* This is a CONST or SYMBOL ref which will be split
9878 in a different way depending on the code model in use.
9879 Cost it through the generic infrastructure. */
9880 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
9881 /* Divide through by the cost of one instruction to
9882 bring it to the same units as the address costs. */
9883 cost_symbol_ref /= COSTS_N_INSNS (1);
9884 /* The cost is then the cost of preparing the address,
9885 followed by an immediate (possibly 0) offset. */
9886 return cost_symbol_ref + addr_cost->imm_offset;
9887 }
9888 else
9889 {
9890 /* This is most likely a jump table from a case
9891 statement. */
9892 return addr_cost->register_offset;
9893 }
9894 }
9895
9896 switch (info.type)
9897 {
9898 case ADDRESS_LO_SUM:
9899 case ADDRESS_SYMBOLIC:
9900 case ADDRESS_REG_IMM:
9901 cost += addr_cost->imm_offset;
9902 break;
9903
9904 case ADDRESS_REG_WB:
9905 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9906 cost += addr_cost->pre_modify;
9907 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9908 cost += addr_cost->post_modify;
9909 else
9910 gcc_unreachable ();
9911
9912 break;
9913
9914 case ADDRESS_REG_REG:
9915 cost += addr_cost->register_offset;
9916 break;
9917
9918 case ADDRESS_REG_SXTW:
9919 cost += addr_cost->register_sextend;
9920 break;
9921
9922 case ADDRESS_REG_UXTW:
9923 cost += addr_cost->register_zextend;
9924 break;
9925
9926 default:
9927 gcc_unreachable ();
9928 }
9929
9930
9931 if (info.shift > 0)
9932 {
9933 /* For the sake of calculating the cost of the shifted register
9934 component, we can treat same sized modes in the same way. */
9935 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9936 cost += addr_cost->addr_scale_costs.hi;
9937 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9938 cost += addr_cost->addr_scale_costs.si;
9939 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9940 cost += addr_cost->addr_scale_costs.di;
9941 else
9942 /* We can't tell, or this is a 128-bit vector. */
9943 cost += addr_cost->addr_scale_costs.ti;
9944 }
9945
9946 return cost;
9947 }
9948
9949 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9950 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9951 to be taken. */
9952
9953 int
9954 aarch64_branch_cost (bool speed_p, bool predictable_p)
9955 {
9956 /* When optimizing for speed, use the cost of unpredictable branches. */
9957 const struct cpu_branch_cost *branch_costs =
9958 aarch64_tune_params.branch_costs;
9959
9960 if (!speed_p || predictable_p)
9961 return branch_costs->predictable;
9962 else
9963 return branch_costs->unpredictable;
9964 }
9965
9966 /* Return true if the RTX X in mode MODE is a zero or sign extract
9967 usable in an ADD or SUB (extended register) instruction. */
9968 static bool
9969 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
9970 {
9971 /* Catch add with a sign extract.
9972 This is add_<optab><mode>_multp2. */
9973 if (GET_CODE (x) == SIGN_EXTRACT
9974 || GET_CODE (x) == ZERO_EXTRACT)
9975 {
9976 rtx op0 = XEXP (x, 0);
9977 rtx op1 = XEXP (x, 1);
9978 rtx op2 = XEXP (x, 2);
9979
9980 if (GET_CODE (op0) == MULT
9981 && CONST_INT_P (op1)
9982 && op2 == const0_rtx
9983 && CONST_INT_P (XEXP (op0, 1))
9984 && aarch64_is_extend_from_extract (mode,
9985 XEXP (op0, 1),
9986 op1))
9987 {
9988 return true;
9989 }
9990 }
9991 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
9992 No shift. */
9993 else if (GET_CODE (x) == SIGN_EXTEND
9994 || GET_CODE (x) == ZERO_EXTEND)
9995 return REG_P (XEXP (x, 0));
9996
9997 return false;
9998 }
9999
10000 static bool
10001 aarch64_frint_unspec_p (unsigned int u)
10002 {
10003 switch (u)
10004 {
10005 case UNSPEC_FRINTZ:
10006 case UNSPEC_FRINTP:
10007 case UNSPEC_FRINTM:
10008 case UNSPEC_FRINTA:
10009 case UNSPEC_FRINTN:
10010 case UNSPEC_FRINTX:
10011 case UNSPEC_FRINTI:
10012 return true;
10013
10014 default:
10015 return false;
10016 }
10017 }
10018
10019 /* Return true iff X is an rtx that will match an extr instruction
10020 i.e. as described in the *extr<mode>5_insn family of patterns.
10021 OP0 and OP1 will be set to the operands of the shifts involved
10022 on success and will be NULL_RTX otherwise. */
10023
10024 static bool
10025 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
10026 {
10027 rtx op0, op1;
10028 scalar_int_mode mode;
10029 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
10030 return false;
10031
10032 *res_op0 = NULL_RTX;
10033 *res_op1 = NULL_RTX;
10034
10035 if (GET_CODE (x) != IOR)
10036 return false;
10037
10038 op0 = XEXP (x, 0);
10039 op1 = XEXP (x, 1);
10040
10041 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
10042 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
10043 {
10044 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
10045 if (GET_CODE (op1) == ASHIFT)
10046 std::swap (op0, op1);
10047
10048 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
10049 return false;
10050
10051 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
10052 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
10053
10054 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
10055 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
10056 {
10057 *res_op0 = XEXP (op0, 0);
10058 *res_op1 = XEXP (op1, 0);
10059 return true;
10060 }
10061 }
10062
10063 return false;
10064 }
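
/* Editorial sketch (not part of aarch64.c): the identity that makes the
   IOR-of-opposite-shifts form above match EXTR, checked numerically on
   32-bit values.  The sketch_* names are made up for illustration.  */
#include <assert.h>
#include <stdint.h>

/* EXTR on W registers: shift the 64-bit concatenation HI:LO right by LSB
   bits and keep the low 32.  */
static uint32_t
sketch_extr32 (uint32_t hi, uint32_t lo, unsigned int lsb)
{
  return (uint32_t) ((((uint64_t) hi << 32) | lo) >> lsb);
}

static void
sketch_extr_selfcheck (void)
{
  uint32_t x = 0x12345678, y = 0x9abcdef0;
  unsigned int a = 12;
  /* (x << a) | (y >> (32 - a)) is EXTR with lsb = 32 - a ...  */
  assert ((uint32_t) ((x << a) | (y >> (32 - a))) == sketch_extr32 (x, y, 32 - a));
  /* ... and with both inputs equal it degenerates into ROR #(32 - a).  */
  assert ((uint32_t) ((x << a) | (x >> (32 - a))) == sketch_extr32 (x, x, 32 - a));
}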
10065
10066 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
10067 storing it in *COST. Result is true if the total cost of the operation
10068 has now been calculated. */
10069 static bool
10070 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
10071 {
10072 rtx inner;
10073 rtx comparator;
10074 enum rtx_code cmpcode;
10075
10076 if (COMPARISON_P (op0))
10077 {
10078 inner = XEXP (op0, 0);
10079 comparator = XEXP (op0, 1);
10080 cmpcode = GET_CODE (op0);
10081 }
10082 else
10083 {
10084 inner = op0;
10085 comparator = const0_rtx;
10086 cmpcode = NE;
10087 }
10088
10089 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
10090 {
10091 /* Conditional branch. */
10092 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10093 return true;
10094 else
10095 {
10096 if (cmpcode == NE || cmpcode == EQ)
10097 {
10098 if (comparator == const0_rtx)
10099 {
10100 /* TBZ/TBNZ/CBZ/CBNZ. */
10101 if (GET_CODE (inner) == ZERO_EXTRACT)
10102 /* TBZ/TBNZ. */
10103 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
10104 ZERO_EXTRACT, 0, speed);
10105 else
10106 /* CBZ/CBNZ. */
10107 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
10108
10109 return true;
10110 }
10111 }
10112 else if (cmpcode == LT || cmpcode == GE)
10113 {
10114 /* TBZ/TBNZ. */
10115 if (comparator == const0_rtx)
10116 return true;
10117 }
10118 }
10119 }
10120 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10121 {
10122 /* CCMP. */
10123 if (GET_CODE (op1) == COMPARE)
10124 {
10125 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
10126 if (XEXP (op1, 1) == const0_rtx)
10127 *cost += 1;
10128 if (speed)
10129 {
10130 machine_mode mode = GET_MODE (XEXP (op1, 0));
10131 const struct cpu_cost_table *extra_cost
10132 = aarch64_tune_params.insn_extra_cost;
10133
10134 if (GET_MODE_CLASS (mode) == MODE_INT)
10135 *cost += extra_cost->alu.arith;
10136 else
10137 *cost += extra_cost->fp[mode == DFmode].compare;
10138 }
10139 return true;
10140 }
10141
10142 /* It's a conditional operation based on the status flags,
10143 so it must be some flavor of CSEL. */
10144
10145 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
10146 if (GET_CODE (op1) == NEG
10147 || GET_CODE (op1) == NOT
10148 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
10149 op1 = XEXP (op1, 0);
10150 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
10151 {
10152 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
10153 op1 = XEXP (op1, 0);
10154 op2 = XEXP (op2, 0);
10155 }
10156
10157 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
10158 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
10159 return true;
10160 }
10161
10162 /* We don't know what this is, cost all operands. */
10163 return false;
10164 }
10165
10166 /* Check whether X is a bitfield operation of the form shift + extend that
10167 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
10168 operand to which the bitfield operation is applied. Otherwise return
10169 NULL_RTX. */
10170
10171 static rtx
10172 aarch64_extend_bitfield_pattern_p (rtx x)
10173 {
10174 rtx_code outer_code = GET_CODE (x);
10175 machine_mode outer_mode = GET_MODE (x);
10176
10177 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
10178 && outer_mode != SImode && outer_mode != DImode)
10179 return NULL_RTX;
10180
10181 rtx inner = XEXP (x, 0);
10182 rtx_code inner_code = GET_CODE (inner);
10183 machine_mode inner_mode = GET_MODE (inner);
10184 rtx op = NULL_RTX;
10185
10186 switch (inner_code)
10187 {
10188 case ASHIFT:
10189 if (CONST_INT_P (XEXP (inner, 1))
10190 && (inner_mode == QImode || inner_mode == HImode))
10191 op = XEXP (inner, 0);
10192 break;
10193 case LSHIFTRT:
10194 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
10195 && (inner_mode == QImode || inner_mode == HImode))
10196 op = XEXP (inner, 0);
10197 break;
10198 case ASHIFTRT:
10199 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
10200 && (inner_mode == QImode || inner_mode == HImode))
10201 op = XEXP (inner, 0);
10202 break;
10203 default:
10204 break;
10205 }
10206
10207 return op;
10208 }
10209
10210 /* Return true if the mask and a shift amount from an RTX of the form
10211 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
10212 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
10213
10214 bool
10215 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
10216 rtx shft_amnt)
10217 {
10218 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
10219 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
10220 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
10221 && (INTVAL (mask)
10222 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
10223 }
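
/* Editorial sketch (not part of aarch64.c): the UBFIZ legality test above on
   plain 32-bit values.  The mask must have no bits below SHIFT and must be a
   contiguous run of ones starting at bit SHIFT, i.e. it covers exactly the
   bits that (x << SHIFT) can populate from a narrower field.  The sketch_*
   names are made up for illustration.  */
#include <assert.h>
#include <stdint.h>

static int
sketch_ubfiz_ok32 (uint32_t mask, unsigned int shift)
{
  if (shift >= 32)
    return 0;
  if ((mask & ((1u << shift) - 1)) != 0)	/* Bits below the shift point.  */
    return 0;
  uint32_t t = (mask >> shift) + 1;		/* Power of two iff contiguous.  */
  return t != 0 && (t & (t - 1)) == 0;
}

static void
sketch_ubfiz_selfcheck (void)
{
  assert (sketch_ubfiz_ok32 (0x3f0, 4));	/* UBFIZ width 6, lsb 4.  */
  assert (!sketch_ubfiz_ok32 (0x3f8, 4));	/* Stray bit below the shift.  */
  assert (!sketch_ubfiz_ok32 (0x5f0, 4));	/* Not contiguous.  */
}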
10224
10225 /* Return true if the masks and a shift amount from an RTX of the form
10226 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
10227    a BFI instruction of mode MODE.  See the *aarch64_bfi patterns.  */
10228
10229 bool
10230 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
10231 unsigned HOST_WIDE_INT mask1,
10232 unsigned HOST_WIDE_INT shft_amnt,
10233 unsigned HOST_WIDE_INT mask2)
10234 {
10235 unsigned HOST_WIDE_INT t;
10236
10237 /* Verify that there is no overlap in what bits are set in the two masks. */
10238 if (mask1 != ~mask2)
10239 return false;
10240
10241 /* Verify that mask2 is not all zeros or ones. */
10242 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
10243 return false;
10244
10245 /* The shift amount should always be less than the mode size. */
10246 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
10247
10248 /* Verify that the mask being shifted is contiguous and would be in the
10249 least significant bits after shifting by shft_amnt. */
10250 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
10251 return (t == (t & -t));
10252 }
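
/* Editorial sketch (not part of aarch64.c): why the final power-of-two test
   above proves MASK2 is a contiguous field based at bit SHFT_AMNT.  Adding
   1 << SHFT_AMNT to such a mask carries all the way up to a single set bit,
   while any hole or stray bit leaves more than one bit set.  The sketch_*
   names are made up for illustration.  */
#include <assert.h>
#include <stdint.h>

static int
sketch_contiguous_from (uint64_t mask2, unsigned int shft_amnt)
{
  uint64_t t = mask2 + ((uint64_t) 1 << shft_amnt);
  return t == (t & -t);				/* At most one bit set.  */
}

static void
sketch_bfi_selfcheck (void)
{
  assert (sketch_contiguous_from (0x0ff0, 4));	/* 8-bit field at lsb 4: BFI.  */
  assert (!sketch_contiguous_from (0x0f0f, 4));	/* Bits below lsb 4.  */
  assert (!sketch_contiguous_from (0x0d70, 4));	/* Hole in the field.  */
}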
10253
10254 /* Calculate the cost of calculating X, storing it in *COST. Result
10255 is true if the total cost of the operation has now been calculated. */
10256 static bool
10257 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
10258 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
10259 {
10260 rtx op0, op1, op2;
10261 const struct cpu_cost_table *extra_cost
10262 = aarch64_tune_params.insn_extra_cost;
10263 int code = GET_CODE (x);
10264 scalar_int_mode int_mode;
10265
10266 /* By default, assume that everything has equivalent cost to the
10267 cheapest instruction. Any additional costs are applied as a delta
10268 above this default. */
10269 *cost = COSTS_N_INSNS (1);
10270
10271 switch (code)
10272 {
10273 case SET:
10274 /* The cost depends entirely on the operands to SET. */
10275 *cost = 0;
10276 op0 = SET_DEST (x);
10277 op1 = SET_SRC (x);
10278
10279 switch (GET_CODE (op0))
10280 {
10281 case MEM:
10282 if (speed)
10283 {
10284 rtx address = XEXP (op0, 0);
10285 if (VECTOR_MODE_P (mode))
10286 *cost += extra_cost->ldst.storev;
10287 else if (GET_MODE_CLASS (mode) == MODE_INT)
10288 *cost += extra_cost->ldst.store;
10289 else if (mode == SFmode)
10290 *cost += extra_cost->ldst.storef;
10291 else if (mode == DFmode)
10292 *cost += extra_cost->ldst.stored;
10293
10294 *cost +=
10295 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10296 0, speed));
10297 }
10298
10299 *cost += rtx_cost (op1, mode, SET, 1, speed);
10300 return true;
10301
10302 case SUBREG:
10303 if (! REG_P (SUBREG_REG (op0)))
10304 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
10305
10306 /* Fall through. */
10307 case REG:
10308 /* The cost is one per vector-register copied. */
10309 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
10310 {
10311 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
10312 *cost = COSTS_N_INSNS (nregs);
10313 }
10314 /* const0_rtx is in general free, but we will use an
10315 instruction to set a register to 0. */
10316 else if (REG_P (op1) || op1 == const0_rtx)
10317 {
10318 /* The cost is 1 per register copied. */
10319 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
10320 *cost = COSTS_N_INSNS (nregs);
10321 }
10322 else
10323 /* Cost is just the cost of the RHS of the set. */
10324 *cost += rtx_cost (op1, mode, SET, 1, speed);
10325 return true;
10326
10327 case ZERO_EXTRACT:
10328 case SIGN_EXTRACT:
10329 /* Bit-field insertion. Strip any redundant widening of
10330 the RHS to meet the width of the target. */
10331 if (GET_CODE (op1) == SUBREG)
10332 op1 = SUBREG_REG (op1);
10333 if ((GET_CODE (op1) == ZERO_EXTEND
10334 || GET_CODE (op1) == SIGN_EXTEND)
10335 && CONST_INT_P (XEXP (op0, 1))
10336 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
10337 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
10338 op1 = XEXP (op1, 0);
10339
10340 if (CONST_INT_P (op1))
10341 {
10342 /* MOV immediate is assumed to always be cheap. */
10343 *cost = COSTS_N_INSNS (1);
10344 }
10345 else
10346 {
10347 /* BFM. */
10348 if (speed)
10349 *cost += extra_cost->alu.bfi;
10350 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
10351 }
10352
10353 return true;
10354
10355 default:
10356 /* We can't make sense of this, assume default cost. */
10357 *cost = COSTS_N_INSNS (1);
10358 return false;
10359 }
10360 return false;
10361
10362 case CONST_INT:
10363 /* If an instruction can incorporate a constant within the
10364 instruction, the instruction's expression avoids calling
10365 rtx_cost() on the constant. If rtx_cost() is called on a
10366 constant, then it is usually because the constant must be
10367 moved into a register by one or more instructions.
10368
10369 The exception is constant 0, which can be expressed
10370 	 as XZR/WZR and is therefore free.  The exception to that, in turn,
10371 	 is a (set (reg) (const0_rtx)), in which case we must cost
10372 the move. However, we can catch that when we cost the SET, so
10373 we don't need to consider that here. */
10374 if (x == const0_rtx)
10375 *cost = 0;
10376 else
10377 {
10378 /* To an approximation, building any other constant is
10379 proportionally expensive to the number of instructions
10380 required to build that constant. This is true whether we
10381 are compiling for SPEED or otherwise. */
10382 if (!is_a <scalar_int_mode> (mode, &int_mode))
10383 int_mode = word_mode;
10384 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
10385 (NULL_RTX, x, false, int_mode));
10386 }
10387 return true;
10388
10389 case CONST_DOUBLE:
10390
10391 /* First determine number of instructions to do the move
10392 as an integer constant. */
10393 if (!aarch64_float_const_representable_p (x)
10394 && !aarch64_can_const_movi_rtx_p (x, mode)
10395 && aarch64_float_const_rtx_p (x))
10396 {
10397 unsigned HOST_WIDE_INT ival;
10398 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
10399 gcc_assert (succeed);
10400
10401 scalar_int_mode imode = (mode == HFmode
10402 ? SImode
10403 : int_mode_for_mode (mode).require ());
10404 int ncost = aarch64_internal_mov_immediate
10405 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
10406 *cost += COSTS_N_INSNS (ncost);
10407 return true;
10408 }
10409
10410 if (speed)
10411 {
10412 /* mov[df,sf]_aarch64. */
10413 if (aarch64_float_const_representable_p (x))
10414 /* FMOV (scalar immediate). */
10415 *cost += extra_cost->fp[mode == DFmode].fpconst;
10416 else if (!aarch64_float_const_zero_rtx_p (x))
10417 {
10418 /* This will be a load from memory. */
10419 if (mode == DFmode)
10420 *cost += extra_cost->ldst.loadd;
10421 else
10422 *cost += extra_cost->ldst.loadf;
10423 }
10424 else
10425 /* Otherwise this is +0.0. We get this using MOVI d0, #0
10426 	       or MOV v0.s[0], wzr - neither of which is modeled by the
10427 cost tables. Just use the default cost. */
10428 {
10429 }
10430 }
10431
10432 return true;
10433
10434 case MEM:
10435 if (speed)
10436 {
10437 /* For loads we want the base cost of a load, plus an
10438 approximation for the additional cost of the addressing
10439 mode. */
10440 rtx address = XEXP (x, 0);
10441 if (VECTOR_MODE_P (mode))
10442 *cost += extra_cost->ldst.loadv;
10443 else if (GET_MODE_CLASS (mode) == MODE_INT)
10444 *cost += extra_cost->ldst.load;
10445 else if (mode == SFmode)
10446 *cost += extra_cost->ldst.loadf;
10447 else if (mode == DFmode)
10448 *cost += extra_cost->ldst.loadd;
10449
10450 *cost +=
10451 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10452 0, speed));
10453 }
10454
10455 return true;
10456
10457 case NEG:
10458 op0 = XEXP (x, 0);
10459
10460 if (VECTOR_MODE_P (mode))
10461 {
10462 if (speed)
10463 {
10464 /* FNEG. */
10465 *cost += extra_cost->vect.alu;
10466 }
10467 return false;
10468 }
10469
10470 if (GET_MODE_CLASS (mode) == MODE_INT)
10471 {
10472 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10473 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10474 {
10475 /* CSETM. */
10476 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
10477 return true;
10478 }
10479
10480 /* Cost this as SUB wzr, X. */
10481 op0 = CONST0_RTX (mode);
10482 op1 = XEXP (x, 0);
10483 goto cost_minus;
10484 }
10485
10486 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10487 {
10488 /* Support (neg(fma...)) as a single instruction only if
10489 sign of zeros is unimportant. This matches the decision
10490 making in aarch64.md. */
10491 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
10492 {
10493 /* FNMADD. */
10494 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10495 return true;
10496 }
10497 if (GET_CODE (op0) == MULT)
10498 {
10499 /* FNMUL. */
10500 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10501 return true;
10502 }
10503 if (speed)
10504 /* FNEG. */
10505 *cost += extra_cost->fp[mode == DFmode].neg;
10506 return false;
10507 }
10508
10509 return false;
10510
10511 case CLRSB:
10512 case CLZ:
10513 if (speed)
10514 {
10515 if (VECTOR_MODE_P (mode))
10516 *cost += extra_cost->vect.alu;
10517 else
10518 *cost += extra_cost->alu.clz;
10519 }
10520
10521 return false;
10522
10523 case COMPARE:
10524 op0 = XEXP (x, 0);
10525 op1 = XEXP (x, 1);
10526
10527 if (op1 == const0_rtx
10528 && GET_CODE (op0) == AND)
10529 {
10530 x = op0;
10531 mode = GET_MODE (op0);
10532 goto cost_logic;
10533 }
10534
10535 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
10536 {
10537 /* TODO: A write to the CC flags possibly costs extra, this
10538 needs encoding in the cost tables. */
10539
10540 mode = GET_MODE (op0);
10541 /* ANDS. */
10542 if (GET_CODE (op0) == AND)
10543 {
10544 x = op0;
10545 goto cost_logic;
10546 }
10547
10548 if (GET_CODE (op0) == PLUS)
10549 {
10550 /* ADDS (and CMN alias). */
10551 x = op0;
10552 goto cost_plus;
10553 }
10554
10555 if (GET_CODE (op0) == MINUS)
10556 {
10557 /* SUBS. */
10558 x = op0;
10559 goto cost_minus;
10560 }
10561
10562 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
10563 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
10564 && CONST_INT_P (XEXP (op0, 2)))
10565 {
10566 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
10567 Handle it here directly rather than going to cost_logic
10568 since we know the immediate generated for the TST is valid
10569 so we can avoid creating an intermediate rtx for it only
10570 for costing purposes. */
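	      /* For illustration (hypothetical operands): a compare such as
		 (compare:CC_NZ (zero_extract:DI (reg) (const_int 8)
						 (const_int 0))
				(const_int 0))
		 corresponds to TST x, #0xff.  */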
10571 if (speed)
10572 *cost += extra_cost->alu.logical;
10573
10574 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
10575 ZERO_EXTRACT, 0, speed);
10576 return true;
10577 }
10578
10579 if (GET_CODE (op1) == NEG)
10580 {
10581 /* CMN. */
10582 if (speed)
10583 *cost += extra_cost->alu.arith;
10584
10585 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
10586 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
10587 return true;
10588 }
10589
10590 /* CMP.
10591
10592 Compare can freely swap the order of operands, and
10593 canonicalization puts the more complex operation first.
10594 But the integer MINUS logic expects the shift/extend
10595 operation in op1. */
10596 if (! (REG_P (op0)
10597 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
10598 {
10599 op0 = XEXP (x, 1);
10600 op1 = XEXP (x, 0);
10601 }
10602 goto cost_minus;
10603 }
10604
10605 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
10606 {
10607 /* FCMP. */
10608 if (speed)
10609 *cost += extra_cost->fp[mode == DFmode].compare;
10610
10611 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
10612 {
10613 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
10614 /* FCMP supports constant 0.0 for no extra cost. */
10615 return true;
10616 }
10617 return false;
10618 }
10619
10620 if (VECTOR_MODE_P (mode))
10621 {
10622 /* Vector compare. */
10623 if (speed)
10624 *cost += extra_cost->vect.alu;
10625
10626 if (aarch64_float_const_zero_rtx_p (op1))
10627 {
10628 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
10629 cost. */
10630 return true;
10631 }
10632 return false;
10633 }
10634 return false;
10635
10636 case MINUS:
10637 {
10638 op0 = XEXP (x, 0);
10639 op1 = XEXP (x, 1);
10640
10641 cost_minus:
10642 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
10643
10644 /* Detect valid immediates. */
10645 if ((GET_MODE_CLASS (mode) == MODE_INT
10646 || (GET_MODE_CLASS (mode) == MODE_CC
10647 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
10648 && CONST_INT_P (op1)
10649 && aarch64_uimm12_shift (INTVAL (op1)))
10650 {
10651 if (speed)
10652 /* SUB(S) (immediate). */
10653 *cost += extra_cost->alu.arith;
10654 return true;
10655 }
10656
10657 /* Look for SUB (extended register). */
10658 if (is_a <scalar_int_mode> (mode, &int_mode)
10659 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
10660 {
10661 if (speed)
10662 *cost += extra_cost->alu.extend_arith;
10663
10664 op1 = aarch64_strip_extend (op1, true);
10665 *cost += rtx_cost (op1, VOIDmode,
10666 (enum rtx_code) GET_CODE (op1), 0, speed);
10667 return true;
10668 }
10669
10670 rtx new_op1 = aarch64_strip_extend (op1, false);
10671
10672 /* Cost this as an FMA-alike operation. */
10673 if ((GET_CODE (new_op1) == MULT
10674 || aarch64_shift_p (GET_CODE (new_op1)))
10675 && code != COMPARE)
10676 {
10677 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
10678 (enum rtx_code) code,
10679 speed);
10680 return true;
10681 }
10682
10683 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
10684
10685 if (speed)
10686 {
10687 if (VECTOR_MODE_P (mode))
10688 {
10689 /* Vector SUB. */
10690 *cost += extra_cost->vect.alu;
10691 }
10692 else if (GET_MODE_CLASS (mode) == MODE_INT)
10693 {
10694 /* SUB(S). */
10695 *cost += extra_cost->alu.arith;
10696 }
10697 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10698 {
10699 /* FSUB. */
10700 *cost += extra_cost->fp[mode == DFmode].addsub;
10701 }
10702 }
10703 return true;
10704 }
10705
10706 case PLUS:
10707 {
10708 rtx new_op0;
10709
10710 op0 = XEXP (x, 0);
10711 op1 = XEXP (x, 1);
10712
10713 cost_plus:
10714 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10715 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10716 {
10717 /* CSINC. */
10718 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
10719 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10720 return true;
10721 }
10722
10723 if (GET_MODE_CLASS (mode) == MODE_INT
10724 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
10725 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
10726 {
10727 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
10728
10729 if (speed)
10730 /* ADD (immediate). */
10731 *cost += extra_cost->alu.arith;
10732 return true;
10733 }
10734
10735 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10736
10737 /* Look for ADD (extended register). */
10738 if (is_a <scalar_int_mode> (mode, &int_mode)
10739 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
10740 {
10741 if (speed)
10742 *cost += extra_cost->alu.extend_arith;
10743
10744 op0 = aarch64_strip_extend (op0, true);
10745 *cost += rtx_cost (op0, VOIDmode,
10746 (enum rtx_code) GET_CODE (op0), 0, speed);
10747 return true;
10748 }
10749
10750 /* Strip any extend, leave shifts behind as we will
10751 cost them through mult_cost. */
10752 new_op0 = aarch64_strip_extend (op0, false);
10753
10754 if (GET_CODE (new_op0) == MULT
10755 || aarch64_shift_p (GET_CODE (new_op0)))
10756 {
10757 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
10758 speed);
10759 return true;
10760 }
10761
10762 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
10763
10764 if (speed)
10765 {
10766 if (VECTOR_MODE_P (mode))
10767 {
10768 /* Vector ADD. */
10769 *cost += extra_cost->vect.alu;
10770 }
10771 else if (GET_MODE_CLASS (mode) == MODE_INT)
10772 {
10773 /* ADD. */
10774 *cost += extra_cost->alu.arith;
10775 }
10776 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10777 {
10778 /* FADD. */
10779 *cost += extra_cost->fp[mode == DFmode].addsub;
10780 }
10781 }
10782 return true;
10783 }
10784
10785 case BSWAP:
10786 *cost = COSTS_N_INSNS (1);
10787
10788 if (speed)
10789 {
10790 if (VECTOR_MODE_P (mode))
10791 *cost += extra_cost->vect.alu;
10792 else
10793 *cost += extra_cost->alu.rev;
10794 }
10795 return false;
10796
10797 case IOR:
10798 if (aarch_rev16_p (x))
10799 {
10800 *cost = COSTS_N_INSNS (1);
10801
10802 if (speed)
10803 {
10804 if (VECTOR_MODE_P (mode))
10805 *cost += extra_cost->vect.alu;
10806 else
10807 *cost += extra_cost->alu.rev;
10808 }
10809 return true;
10810 }
10811
10812 if (aarch64_extr_rtx_p (x, &op0, &op1))
10813 {
10814 *cost += rtx_cost (op0, mode, IOR, 0, speed);
10815 *cost += rtx_cost (op1, mode, IOR, 1, speed);
10816 if (speed)
10817 *cost += extra_cost->alu.shift;
10818
10819 return true;
10820 }
10821 /* Fall through. */
10822 case XOR:
10823 case AND:
10824 cost_logic:
10825 op0 = XEXP (x, 0);
10826 op1 = XEXP (x, 1);
10827
10828 if (VECTOR_MODE_P (mode))
10829 {
10830 if (speed)
10831 *cost += extra_cost->vect.alu;
10832 return true;
10833 }
10834
10835 if (code == AND
10836 && GET_CODE (op0) == MULT
10837 && CONST_INT_P (XEXP (op0, 1))
10838 && CONST_INT_P (op1)
10839 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
10840 INTVAL (op1)) != 0)
10841 {
10842 /* This is a UBFM/SBFM. */
10843 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
10844 if (speed)
10845 *cost += extra_cost->alu.bfx;
10846 return true;
10847 }
10848
10849 if (is_int_mode (mode, &int_mode))
10850 {
10851 if (CONST_INT_P (op1))
10852 {
10853 /* We have a mask + shift version of a UBFIZ
10854 i.e. the *andim_ashift<mode>_bfiz pattern. */
10855 if (GET_CODE (op0) == ASHIFT
10856 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
10857 XEXP (op0, 1)))
10858 {
10859 *cost += rtx_cost (XEXP (op0, 0), int_mode,
10860 (enum rtx_code) code, 0, speed);
10861 if (speed)
10862 *cost += extra_cost->alu.bfx;
10863
10864 return true;
10865 }
10866 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
10867 {
10868 	      /* We may get the immediate for free; this is not
10869 modelled. */
10870 *cost += rtx_cost (op0, int_mode,
10871 (enum rtx_code) code, 0, speed);
10872 if (speed)
10873 *cost += extra_cost->alu.logical;
10874
10875 return true;
10876 }
10877 }
10878 else
10879 {
10880 rtx new_op0 = op0;
10881
10882 /* Handle ORN, EON, or BIC. */
10883 if (GET_CODE (op0) == NOT)
10884 op0 = XEXP (op0, 0);
10885
10886 new_op0 = aarch64_strip_shift (op0);
10887
10888 /* If we had a shift on op0 then this is a logical-shift-
10889 by-register/immediate operation. Otherwise, this is just
10890 a logical operation. */
10891 if (speed)
10892 {
10893 if (new_op0 != op0)
10894 {
10895 /* Shift by immediate. */
10896 if (CONST_INT_P (XEXP (op0, 1)))
10897 *cost += extra_cost->alu.log_shift;
10898 else
10899 *cost += extra_cost->alu.log_shift_reg;
10900 }
10901 else
10902 *cost += extra_cost->alu.logical;
10903 }
10904
10905 /* In both cases we want to cost both operands. */
10906 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
10907 0, speed);
10908 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
10909 1, speed);
10910
10911 return true;
10912 }
10913 }
10914 return false;
10915
10916 case NOT:
10917 x = XEXP (x, 0);
10918 op0 = aarch64_strip_shift (x);
10919
10920 if (VECTOR_MODE_P (mode))
10921 {
10922 /* Vector NOT. */
10923 *cost += extra_cost->vect.alu;
10924 return false;
10925 }
10926
10927 /* MVN-shifted-reg. */
10928 if (op0 != x)
10929 {
10930 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10931
10932 if (speed)
10933 *cost += extra_cost->alu.log_shift;
10934
10935 return true;
10936 }
10937 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10938 Handle the second form here taking care that 'a' in the above can
10939 be a shift. */
10940 else if (GET_CODE (op0) == XOR)
10941 {
10942 rtx newop0 = XEXP (op0, 0);
10943 rtx newop1 = XEXP (op0, 1);
10944 rtx op0_stripped = aarch64_strip_shift (newop0);
10945
10946 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10947 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10948
10949 if (speed)
10950 {
10951 if (op0_stripped != newop0)
10952 *cost += extra_cost->alu.log_shift;
10953 else
10954 *cost += extra_cost->alu.logical;
10955 }
10956
10957 return true;
10958 }
10959 /* MVN. */
10960 if (speed)
10961 *cost += extra_cost->alu.logical;
10962
10963 return false;
10964
10965 case ZERO_EXTEND:
10966
10967 op0 = XEXP (x, 0);
10968 /* If a value is written in SI mode, then zero extended to DI
10969 mode, the operation will in general be free as a write to
10970 a 'w' register implicitly zeroes the upper bits of an 'x'
10971 register. However, if this is
10972
10973 (set (reg) (zero_extend (reg)))
10974
10975 we must cost the explicit register move. */
10976 if (mode == DImode
10977 && GET_MODE (op0) == SImode
10978 && outer == SET)
10979 {
10980 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
10981
10982 /* If OP_COST is non-zero, then the cost of the zero extend
10983 is effectively the cost of the inner operation. Otherwise
10984 we have a MOV instruction and we take the cost from the MOV
10985 itself. This is true independently of whether we are
10986 optimizing for space or time. */
10987 if (op_cost)
10988 *cost = op_cost;
10989
10990 return true;
10991 }
10992 else if (MEM_P (op0))
10993 {
10994 /* All loads can zero extend to any size for free. */
10995 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
10996 return true;
10997 }
10998
10999 op0 = aarch64_extend_bitfield_pattern_p (x);
11000 if (op0)
11001 {
11002 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
11003 if (speed)
11004 *cost += extra_cost->alu.bfx;
11005 return true;
11006 }
11007
11008 if (speed)
11009 {
11010 if (VECTOR_MODE_P (mode))
11011 {
11012 /* UMOV. */
11013 *cost += extra_cost->vect.alu;
11014 }
11015 else
11016 {
11017 /* We generate an AND instead of UXTB/UXTH. */
11018 *cost += extra_cost->alu.logical;
11019 }
11020 }
11021 return false;
11022
11023 case SIGN_EXTEND:
11024 if (MEM_P (XEXP (x, 0)))
11025 {
11026 /* LDRSH. */
11027 if (speed)
11028 {
11029 rtx address = XEXP (XEXP (x, 0), 0);
11030 *cost += extra_cost->ldst.load_sign_extend;
11031
11032 *cost +=
11033 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11034 0, speed));
11035 }
11036 return true;
11037 }
11038
11039 op0 = aarch64_extend_bitfield_pattern_p (x);
11040 if (op0)
11041 {
11042 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
11043 if (speed)
11044 *cost += extra_cost->alu.bfx;
11045 return true;
11046 }
11047
11048 if (speed)
11049 {
11050 if (VECTOR_MODE_P (mode))
11051 *cost += extra_cost->vect.alu;
11052 else
11053 *cost += extra_cost->alu.extend;
11054 }
11055 return false;
11056
11057 case ASHIFT:
11058 op0 = XEXP (x, 0);
11059 op1 = XEXP (x, 1);
11060
11061 if (CONST_INT_P (op1))
11062 {
11063 if (speed)
11064 {
11065 if (VECTOR_MODE_P (mode))
11066 {
11067 /* Vector shift (immediate). */
11068 *cost += extra_cost->vect.alu;
11069 }
11070 else
11071 {
11072 	      /* LSL (immediate), UBFM, UBFIZ and friends.  These are all
11073 aliases. */
11074 *cost += extra_cost->alu.shift;
11075 }
11076 }
11077
11078 /* We can incorporate zero/sign extend for free. */
11079 if (GET_CODE (op0) == ZERO_EXTEND
11080 || GET_CODE (op0) == SIGN_EXTEND)
11081 op0 = XEXP (op0, 0);
11082
11083 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
11084 return true;
11085 }
11086 else
11087 {
11088 if (VECTOR_MODE_P (mode))
11089 {
11090 if (speed)
11091 /* Vector shift (register). */
11092 *cost += extra_cost->vect.alu;
11093 }
11094 else
11095 {
11096 if (speed)
11097 /* LSLV. */
11098 *cost += extra_cost->alu.shift_reg;
11099
11100 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11101 && CONST_INT_P (XEXP (op1, 1))
11102 && known_eq (INTVAL (XEXP (op1, 1)),
11103 GET_MODE_BITSIZE (mode) - 1))
11104 {
11105 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11106 /* We already demanded XEXP (op1, 0) to be REG_P, so
11107 don't recurse into it. */
11108 return true;
11109 }
11110 }
11111 return false; /* All arguments need to be in registers. */
11112 }
11113
11114 case ROTATE:
11115 case ROTATERT:
11116 case LSHIFTRT:
11117 case ASHIFTRT:
11118 op0 = XEXP (x, 0);
11119 op1 = XEXP (x, 1);
11120
11121 if (CONST_INT_P (op1))
11122 {
11123 /* ASR (immediate) and friends. */
11124 if (speed)
11125 {
11126 if (VECTOR_MODE_P (mode))
11127 *cost += extra_cost->vect.alu;
11128 else
11129 *cost += extra_cost->alu.shift;
11130 }
11131
11132 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
11133 return true;
11134 }
11135 else
11136 {
11137 if (VECTOR_MODE_P (mode))
11138 {
11139 if (speed)
11140 /* Vector shift (register). */
11141 *cost += extra_cost->vect.alu;
11142 }
11143 else
11144 {
11145 if (speed)
11146 /* ASR (register) and friends. */
11147 *cost += extra_cost->alu.shift_reg;
11148
11149 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11150 && CONST_INT_P (XEXP (op1, 1))
11151 && known_eq (INTVAL (XEXP (op1, 1)),
11152 GET_MODE_BITSIZE (mode) - 1))
11153 {
11154 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11155 /* We already demanded XEXP (op1, 0) to be REG_P, so
11156 don't recurse into it. */
11157 return true;
11158 }
11159 }
11160 return false; /* All arguments need to be in registers. */
11161 }
11162
11163 case SYMBOL_REF:
11164
11165 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
11166 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
11167 {
11168 /* LDR. */
11169 if (speed)
11170 *cost += extra_cost->ldst.load;
11171 }
11172 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
11173 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
11174 {
11175 /* ADRP, followed by ADD. */
11176 *cost += COSTS_N_INSNS (1);
11177 if (speed)
11178 *cost += 2 * extra_cost->alu.arith;
11179 }
11180 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
11181 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11182 {
11183 /* ADR. */
11184 if (speed)
11185 *cost += extra_cost->alu.arith;
11186 }
11187
11188 if (flag_pic)
11189 {
11190 /* One extra load instruction, after accessing the GOT. */
11191 *cost += COSTS_N_INSNS (1);
11192 if (speed)
11193 *cost += extra_cost->ldst.load;
11194 }
11195 return true;
11196
11197 case HIGH:
11198 case LO_SUM:
11199 /* ADRP/ADD (immediate). */
11200 if (speed)
11201 *cost += extra_cost->alu.arith;
11202 return true;
11203
11204 case ZERO_EXTRACT:
11205 case SIGN_EXTRACT:
11206 /* UBFX/SBFX. */
11207 if (speed)
11208 {
11209 if (VECTOR_MODE_P (mode))
11210 *cost += extra_cost->vect.alu;
11211 else
11212 *cost += extra_cost->alu.bfx;
11213 }
11214
11215 /* We can trust that the immediates used will be correct (there
11216 are no by-register forms), so we need only cost op0. */
11217 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
11218 return true;
11219
11220 case MULT:
11221 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
11222 /* aarch64_rtx_mult_cost always handles recursion to its
11223 operands. */
11224 return true;
11225
11226 case MOD:
11227 /* We can expand signed mod by power of 2 using a NEGS, two parallel
11228 	 ANDs and a CSNEG.  Assume here that a CSNEG costs the same as an
11229 	 unconditional negate.  This case should only ever be reached through
11230 the set_smod_pow2_cheap check in expmed.c. */
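      /* For illustration, and assuming the usual expansion, a signed x % 4
	 in SImode becomes a sequence along the lines of
	     negs  w1, w0
	     and   w0, w0, 3
	     and   w1, w1, 3
	     csneg w0, w0, w1, mi
	 i.e. two logical and two arithmetic operations, matching the cost
	 added below.  */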
11231 if (CONST_INT_P (XEXP (x, 1))
11232 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
11233 && (mode == SImode || mode == DImode))
11234 {
11235 /* We expand to 4 instructions. Reset the baseline. */
11236 *cost = COSTS_N_INSNS (4);
11237
11238 if (speed)
11239 *cost += 2 * extra_cost->alu.logical
11240 + 2 * extra_cost->alu.arith;
11241
11242 return true;
11243 }
11244
11245 /* Fall-through. */
11246 case UMOD:
11247 if (speed)
11248 {
11249 	/* Slightly prefer UMOD over SMOD.  */
11250 if (VECTOR_MODE_P (mode))
11251 *cost += extra_cost->vect.alu;
11252 else if (GET_MODE_CLASS (mode) == MODE_INT)
11253 *cost += (extra_cost->mult[mode == DImode].add
11254 + extra_cost->mult[mode == DImode].idiv
11255 + (code == MOD ? 1 : 0));
11256 }
11257 return false; /* All arguments need to be in registers. */
11258
11259 case DIV:
11260 case UDIV:
11261 case SQRT:
11262 if (speed)
11263 {
11264 if (VECTOR_MODE_P (mode))
11265 *cost += extra_cost->vect.alu;
11266 else if (GET_MODE_CLASS (mode) == MODE_INT)
11267 /* There is no integer SQRT, so only DIV and UDIV can get
11268 here. */
11269 *cost += (extra_cost->mult[mode == DImode].idiv
11270 		    /* Slightly prefer UDIV over SDIV.  */
11271 + (code == DIV ? 1 : 0));
11272 else
11273 *cost += extra_cost->fp[mode == DFmode].div;
11274 }
11275 return false; /* All arguments need to be in registers. */
11276
11277 case IF_THEN_ELSE:
11278 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
11279 XEXP (x, 2), cost, speed);
11280
11281 case EQ:
11282 case NE:
11283 case GT:
11284 case GTU:
11285 case LT:
11286 case LTU:
11287 case GE:
11288 case GEU:
11289 case LE:
11290 case LEU:
11291
11292 return false; /* All arguments must be in registers. */
11293
11294 case FMA:
11295 op0 = XEXP (x, 0);
11296 op1 = XEXP (x, 1);
11297 op2 = XEXP (x, 2);
11298
11299 if (speed)
11300 {
11301 if (VECTOR_MODE_P (mode))
11302 *cost += extra_cost->vect.alu;
11303 else
11304 *cost += extra_cost->fp[mode == DFmode].fma;
11305 }
11306
11307 /* FMSUB, FNMADD, and FNMSUB are free. */
11308 if (GET_CODE (op0) == NEG)
11309 op0 = XEXP (op0, 0);
11310
11311 if (GET_CODE (op2) == NEG)
11312 op2 = XEXP (op2, 0);
11313
11314 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
11315 and the by-element operand as operand 0. */
11316 if (GET_CODE (op1) == NEG)
11317 op1 = XEXP (op1, 0);
11318
11319 /* Catch vector-by-element operations. The by-element operand can
11320 either be (vec_duplicate (vec_select (x))) or just
11321 (vec_select (x)), depending on whether we are multiplying by
11322 a vector or a scalar.
11323
11324 Canonicalization is not very good in these cases, FMA4 will put the
11325 by-element operand as operand 0, FNMA4 will have it as operand 1. */
11326 if (GET_CODE (op0) == VEC_DUPLICATE)
11327 op0 = XEXP (op0, 0);
11328 else if (GET_CODE (op1) == VEC_DUPLICATE)
11329 op1 = XEXP (op1, 0);
11330
11331 if (GET_CODE (op0) == VEC_SELECT)
11332 op0 = XEXP (op0, 0);
11333 else if (GET_CODE (op1) == VEC_SELECT)
11334 op1 = XEXP (op1, 0);
11335
11336 /* If the remaining parameters are not registers,
11337 get the cost to put them into registers. */
11338 *cost += rtx_cost (op0, mode, FMA, 0, speed);
11339 *cost += rtx_cost (op1, mode, FMA, 1, speed);
11340 *cost += rtx_cost (op2, mode, FMA, 2, speed);
11341 return true;
11342
11343 case FLOAT:
11344 case UNSIGNED_FLOAT:
11345 if (speed)
11346 *cost += extra_cost->fp[mode == DFmode].fromint;
11347 return false;
11348
11349 case FLOAT_EXTEND:
11350 if (speed)
11351 {
11352 if (VECTOR_MODE_P (mode))
11353 {
11354 	    /* Vector widen.  */
11355 *cost += extra_cost->vect.alu;
11356 }
11357 else
11358 *cost += extra_cost->fp[mode == DFmode].widen;
11359 }
11360 return false;
11361
11362 case FLOAT_TRUNCATE:
11363 if (speed)
11364 {
11365 if (VECTOR_MODE_P (mode))
11366 {
11367 	    /* Vector narrow.  */
11368 *cost += extra_cost->vect.alu;
11369 }
11370 else
11371 *cost += extra_cost->fp[mode == DFmode].narrow;
11372 }
11373 return false;
11374
11375 case FIX:
11376 case UNSIGNED_FIX:
11377 x = XEXP (x, 0);
11378 /* Strip the rounding part. They will all be implemented
11379 by the fcvt* family of instructions anyway. */
11380 if (GET_CODE (x) == UNSPEC)
11381 {
11382 unsigned int uns_code = XINT (x, 1);
11383
11384 if (uns_code == UNSPEC_FRINTA
11385 || uns_code == UNSPEC_FRINTM
11386 || uns_code == UNSPEC_FRINTN
11387 || uns_code == UNSPEC_FRINTP
11388 || uns_code == UNSPEC_FRINTZ)
11389 x = XVECEXP (x, 0, 0);
11390 }
11391
11392 if (speed)
11393 {
11394 if (VECTOR_MODE_P (mode))
11395 *cost += extra_cost->vect.alu;
11396 else
11397 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
11398 }
11399
11400 /* We can combine fmul by a power of 2 followed by a fcvt into a single
11401 fixed-point fcvt. */
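      /* For illustration, and assuming the mode supports the fixed-point
	 form of FCVTZS: (fix (mult x 16.0)) can become a single
	     fcvtzs  w0, s0, #4
	 i.e. a convert with four fractional bits.  */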
11402 if (GET_CODE (x) == MULT
11403 && ((VECTOR_MODE_P (mode)
11404 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
11405 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
11406 {
11407 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
11408 0, speed);
11409 return true;
11410 }
11411
11412 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
11413 return true;
11414
11415 case ABS:
11416 if (VECTOR_MODE_P (mode))
11417 {
11418 /* ABS (vector). */
11419 if (speed)
11420 *cost += extra_cost->vect.alu;
11421 }
11422 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11423 {
11424 op0 = XEXP (x, 0);
11425
11426 /* FABD, which is analogous to FADD. */
11427 if (GET_CODE (op0) == MINUS)
11428 {
11429 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
11430 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
11431 if (speed)
11432 *cost += extra_cost->fp[mode == DFmode].addsub;
11433
11434 return true;
11435 }
11436 /* Simple FABS is analogous to FNEG. */
11437 if (speed)
11438 *cost += extra_cost->fp[mode == DFmode].neg;
11439 }
11440 else
11441 {
11442 	  /* Integer ABS will either be split into
11443 two arithmetic instructions, or will be an ABS
11444 (scalar), which we don't model. */
11445 *cost = COSTS_N_INSNS (2);
11446 if (speed)
11447 *cost += 2 * extra_cost->alu.arith;
11448 }
11449 return false;
11450
11451 case SMAX:
11452 case SMIN:
11453 if (speed)
11454 {
11455 if (VECTOR_MODE_P (mode))
11456 *cost += extra_cost->vect.alu;
11457 else
11458 {
11459 /* FMAXNM/FMINNM/FMAX/FMIN.
11460 TODO: This may not be accurate for all implementations, but
11461 we do not model this in the cost tables. */
11462 *cost += extra_cost->fp[mode == DFmode].addsub;
11463 }
11464 }
11465 return false;
11466
11467 case UNSPEC:
11468 /* The floating point round to integer frint* instructions. */
11469 if (aarch64_frint_unspec_p (XINT (x, 1)))
11470 {
11471 if (speed)
11472 *cost += extra_cost->fp[mode == DFmode].roundint;
11473
11474 return false;
11475 }
11476
11477 if (XINT (x, 1) == UNSPEC_RBIT)
11478 {
11479 if (speed)
11480 *cost += extra_cost->alu.rev;
11481
11482 return false;
11483 }
11484 break;
11485
11486 case TRUNCATE:
11487
11488 /* Decompose <su>muldi3_highpart. */
11489 if (/* (truncate:DI */
11490 mode == DImode
11491 /* (lshiftrt:TI */
11492 && GET_MODE (XEXP (x, 0)) == TImode
11493 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
11494 /* (mult:TI */
11495 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
11496 /* (ANY_EXTEND:TI (reg:DI))
11497 (ANY_EXTEND:TI (reg:DI))) */
11498 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
11499 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
11500 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
11501 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
11502 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
11503 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
11504 /* (const_int 64) */
11505 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
11506 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
11507 {
11508 /* UMULH/SMULH. */
11509 if (speed)
11510 *cost += extra_cost->mult[mode == DImode].extend;
11511 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
11512 mode, MULT, 0, speed);
11513 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
11514 mode, MULT, 1, speed);
11515 return true;
11516 }
11517
11518 /* Fall through. */
11519 default:
11520 break;
11521 }
11522
11523 if (dump_file
11524 && flag_aarch64_verbose_cost)
11525 fprintf (dump_file,
11526 "\nFailed to cost RTX. Assuming default cost.\n");
11527
11528 return true;
11529 }
11530
11531 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
11532 calculated for X. This cost is stored in *COST. Returns true
11533 if the total cost of X was calculated. */
11534 static bool
11535 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
11536 int param, int *cost, bool speed)
11537 {
11538 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
11539
11540 if (dump_file
11541 && flag_aarch64_verbose_cost)
11542 {
11543 print_rtl_single (dump_file, x);
11544 fprintf (dump_file, "\n%s cost: %d (%s)\n",
11545 speed ? "Hot" : "Cold",
11546 *cost, result ? "final" : "partial");
11547 }
11548
11549 return result;
11550 }
11551
11552 static int
11553 aarch64_register_move_cost (machine_mode mode,
11554 reg_class_t from_i, reg_class_t to_i)
11555 {
11556 enum reg_class from = (enum reg_class) from_i;
11557 enum reg_class to = (enum reg_class) to_i;
11558 const struct cpu_regmove_cost *regmove_cost
11559 = aarch64_tune_params.regmove_cost;
11560
11561 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
11562 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
11563 to = GENERAL_REGS;
11564
11565 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
11566 from = GENERAL_REGS;
11567
11568 /* Moving between GPR and stack cost is the same as GP2GP. */
11569 if ((from == GENERAL_REGS && to == STACK_REG)
11570 || (to == GENERAL_REGS && from == STACK_REG))
11571 return regmove_cost->GP2GP;
11572
11573 /* To/From the stack register, we move via the gprs. */
11574 if (to == STACK_REG || from == STACK_REG)
11575 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
11576 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
11577
11578 if (known_eq (GET_MODE_SIZE (mode), 16))
11579 {
11580 /* 128-bit operations on general registers require 2 instructions. */
11581 if (from == GENERAL_REGS && to == GENERAL_REGS)
11582 return regmove_cost->GP2GP * 2;
11583 else if (from == GENERAL_REGS)
11584 return regmove_cost->GP2FP * 2;
11585 else if (to == GENERAL_REGS)
11586 return regmove_cost->FP2GP * 2;
11587
11588 /* When AdvSIMD instructions are disabled it is not possible to move
11589 a 128-bit value directly between Q registers. This is handled in
11590 secondary reload. A general register is used as a scratch to move
11591 the upper DI value and the lower DI value is moved directly,
11592 hence the cost is the sum of three moves. */
11593 if (! TARGET_SIMD)
11594 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
11595
11596 return regmove_cost->FP2FP;
11597 }
11598
11599 if (from == GENERAL_REGS && to == GENERAL_REGS)
11600 return regmove_cost->GP2GP;
11601 else if (from == GENERAL_REGS)
11602 return regmove_cost->GP2FP;
11603 else if (to == GENERAL_REGS)
11604 return regmove_cost->FP2GP;
11605
11606 return regmove_cost->FP2FP;
11607 }
11608
11609 static int
11610 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
11611 reg_class_t rclass ATTRIBUTE_UNUSED,
11612 bool in ATTRIBUTE_UNUSED)
11613 {
11614 return aarch64_tune_params.memmov_cost;
11615 }
11616
11617 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
11618 to optimize 1.0/sqrt. */
11619
11620 static bool
11621 use_rsqrt_p (machine_mode mode)
11622 {
11623 return (!flag_trapping_math
11624 && flag_unsafe_math_optimizations
11625 && ((aarch64_tune_params.approx_modes->recip_sqrt
11626 & AARCH64_APPROX_MODE (mode))
11627 || flag_mrecip_low_precision_sqrt));
11628 }
11629
11630 /* Function to decide when to use the approximate reciprocal square root
11631 builtin. */
11632
11633 static tree
11634 aarch64_builtin_reciprocal (tree fndecl)
11635 {
11636 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
11637
11638 if (!use_rsqrt_p (mode))
11639 return NULL_TREE;
11640 return aarch64_builtin_rsqrt (DECL_MD_FUNCTION_CODE (fndecl));
11641 }
11642
11643 /* Emit instruction sequence to compute either the approximate square root
11644 or its approximate reciprocal, depending on the flag RECP, and return
11645 whether the sequence was emitted or not. */
11646
11647 bool
11648 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
11649 {
11650 machine_mode mode = GET_MODE (dst);
11651
11652 if (GET_MODE_INNER (mode) == HFmode)
11653 {
11654 gcc_assert (!recp);
11655 return false;
11656 }
11657
11658 if (!recp)
11659 {
11660 if (!(flag_mlow_precision_sqrt
11661 || (aarch64_tune_params.approx_modes->sqrt
11662 & AARCH64_APPROX_MODE (mode))))
11663 return false;
11664
11665 if (flag_finite_math_only
11666 || flag_trapping_math
11667 || !flag_unsafe_math_optimizations
11668 || optimize_function_for_size_p (cfun))
11669 return false;
11670 }
11671 else
11672 /* Caller assumes we cannot fail. */
11673 gcc_assert (use_rsqrt_p (mode));
11674
11675 machine_mode mmsk = mode_for_int_vector (mode).require ();
11676 rtx xmsk = gen_reg_rtx (mmsk);
11677 if (!recp)
11678 /* When calculating the approximate square root, compare the
11679 argument with 0.0 and create a mask. */
11680 emit_insn (gen_rtx_SET (xmsk,
11681 gen_rtx_NEG (mmsk,
11682 gen_rtx_EQ (mmsk, src,
11683 CONST0_RTX (mode)))));
11684
11685 /* Estimate the approximate reciprocal square root. */
11686 rtx xdst = gen_reg_rtx (mode);
11687 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
11688
11689 /* Iterate over the series twice for SF and thrice for DF. */
11690 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11691
11692   /* Optionally run one fewer iteration of the series for faster performance
11693      at the cost of some accuracy.  */
11694 if ((recp && flag_mrecip_low_precision_sqrt)
11695 || (!recp && flag_mlow_precision_sqrt))
11696 iterations--;
11697
11698 /* Iterate over the series to calculate the approximate reciprocal square
11699 root. */
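  /* Illustratively, each pass below is the Newton-Raphson step for
     1/sqrt(d), implemented with FRSQRTS (which computes (3 - a*b) / 2):
	 x2 = x * x
	 x1 = (3 - d * x2) / 2
	 x  = x * x1
     with the final multiply by x1 deferred to the finalizing step.  */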
11700 rtx x1 = gen_reg_rtx (mode);
11701 while (iterations--)
11702 {
11703 rtx x2 = gen_reg_rtx (mode);
11704 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
11705
11706 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
11707
11708 if (iterations > 0)
11709 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
11710 }
11711
11712 if (!recp)
11713 {
11714 /* Qualify the approximate reciprocal square root when the argument is
11715 	 0.0 by squashing the intermediate result to 0.0.  */
11716 rtx xtmp = gen_reg_rtx (mmsk);
11717 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
11718 gen_rtx_SUBREG (mmsk, xdst, 0)));
11719 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
11720
11721 /* Calculate the approximate square root. */
11722 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
11723 }
11724
11725 /* Finalize the approximation. */
11726 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
11727
11728 return true;
11729 }
11730
11731 /* Emit the instruction sequence to compute the approximation for the division
11732 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
11733
11734 bool
11735 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
11736 {
11737 machine_mode mode = GET_MODE (quo);
11738
11739 if (GET_MODE_INNER (mode) == HFmode)
11740 return false;
11741
11742 bool use_approx_division_p = (flag_mlow_precision_div
11743 || (aarch64_tune_params.approx_modes->division
11744 & AARCH64_APPROX_MODE (mode)));
11745
11746 if (!flag_finite_math_only
11747 || flag_trapping_math
11748 || !flag_unsafe_math_optimizations
11749 || optimize_function_for_size_p (cfun)
11750 || !use_approx_division_p)
11751 return false;
11752
11753 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
11754 return false;
11755
11756 /* Estimate the approximate reciprocal. */
11757 rtx xrcp = gen_reg_rtx (mode);
11758 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
11759
11760 /* Iterate over the series twice for SF and thrice for DF. */
11761 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11762
11763   /* Optionally run one fewer iteration of the series for faster performance,
11764      at the cost of some accuracy.  */
11765 if (flag_mlow_precision_div)
11766 iterations--;
11767
11768 /* Iterate over the series to calculate the approximate reciprocal. */
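  /* Illustratively, each pass below is the Newton-Raphson step for 1/d,
     implemented with FRECPS (which computes 2 - a*b):
	 t = 2 - d * x
	 x = x * t
     with the final multiply by t deferred to the finalizing step.  */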
11769 rtx xtmp = gen_reg_rtx (mode);
11770 while (iterations--)
11771 {
11772 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
11773
11774 if (iterations > 0)
11775 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
11776 }
11777
11778 if (num != CONST1_RTX (mode))
11779 {
11780 /* As the approximate reciprocal of DEN is already calculated, only
11781 calculate the approximate division when NUM is not 1.0. */
11782 rtx xnum = force_reg (mode, num);
11783 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
11784 }
11785
11786 /* Finalize the approximation. */
11787 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
11788 return true;
11789 }
11790
11791 /* Return the number of instructions that can be issued per cycle. */
11792 static int
11793 aarch64_sched_issue_rate (void)
11794 {
11795 return aarch64_tune_params.issue_rate;
11796 }
11797
11798 static int
11799 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
11800 {
11801 int issue_rate = aarch64_sched_issue_rate ();
11802
11803 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
11804 }
11805
11806
11807 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
11808 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
11809 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
11810
11811 static int
11812 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
11813 int ready_index)
11814 {
11815 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
11816 }
11817
11818
11819 /* Vectorizer cost model target hooks. */
11820
11821 /* Implement targetm.vectorize.builtin_vectorization_cost. */
11822 static int
11823 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
11824 tree vectype,
11825 int misalign ATTRIBUTE_UNUSED)
11826 {
11827 unsigned elements;
11828 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
11829 bool fp = false;
11830
11831 if (vectype != NULL)
11832 fp = FLOAT_TYPE_P (vectype);
11833
11834 switch (type_of_cost)
11835 {
11836 case scalar_stmt:
11837 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
11838
11839 case scalar_load:
11840 return costs->scalar_load_cost;
11841
11842 case scalar_store:
11843 return costs->scalar_store_cost;
11844
11845 case vector_stmt:
11846 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11847
11848 case vector_load:
11849 return costs->vec_align_load_cost;
11850
11851 case vector_store:
11852 return costs->vec_store_cost;
11853
11854 case vec_to_scalar:
11855 return costs->vec_to_scalar_cost;
11856
11857 case scalar_to_vec:
11858 return costs->scalar_to_vec_cost;
11859
11860 case unaligned_load:
11861 case vector_gather_load:
11862 return costs->vec_unalign_load_cost;
11863
11864 case unaligned_store:
11865 case vector_scatter_store:
11866 return costs->vec_unalign_store_cost;
11867
11868 case cond_branch_taken:
11869 return costs->cond_taken_branch_cost;
11870
11871 case cond_branch_not_taken:
11872 return costs->cond_not_taken_branch_cost;
11873
11874 case vec_perm:
11875 return costs->vec_permute_cost;
11876
11877 case vec_promote_demote:
11878 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11879
11880 case vec_construct:
11881 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
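	/* As a rough heuristic: for a (hypothetical) 4-element vector this
	   returns 4 / 2 + 1 = 3, i.e. about one statement per pair of
	   elements plus one.  */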
11882 return elements / 2 + 1;
11883
11884 default:
11885 gcc_unreachable ();
11886 }
11887 }
11888
11889 /* Implement targetm.vectorize.add_stmt_cost. */
11890 static unsigned
11891 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
11892 struct _stmt_vec_info *stmt_info, int misalign,
11893 enum vect_cost_model_location where)
11894 {
11895 unsigned *cost = (unsigned *) data;
11896 unsigned retval = 0;
11897
11898 if (flag_vect_cost_model)
11899 {
11900 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
11901 int stmt_cost =
11902 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
11903
11904 /* Statements in an inner loop relative to the loop being
11905 vectorized are weighted more heavily. The value here is
11906 arbitrary and could potentially be improved with analysis. */
11907 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
11908 count *= 50; /* FIXME */
11909
11910 retval = (unsigned) (count * stmt_cost);
11911 cost[where] += retval;
11912 }
11913
11914 return retval;
11915 }
11916
11917 static void initialize_aarch64_code_model (struct gcc_options *);
11918
11919 /* Parse the TO_PARSE string and put the architecture struct that it
11920 selects into RES and the architectural features into ISA_FLAGS.
11921 Return an aarch64_parse_opt_result describing the parse result.
11922 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11923 When the TO_PARSE string contains an invalid extension,
11924 a copy of the string is created and stored to INVALID_EXTENSION. */
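/* For illustration, TO_PARSE is an architecture name optionally followed by
   "+extension" suffixes, e.g. "armv8.2-a" or "armv8.2-a+sve".  */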
11925
11926 static enum aarch64_parse_opt_result
11927 aarch64_parse_arch (const char *to_parse, const struct processor **res,
11928 uint64_t *isa_flags, std::string *invalid_extension)
11929 {
11930 const char *ext;
11931 const struct processor *arch;
11932 size_t len;
11933
11934 ext = strchr (to_parse, '+');
11935
11936 if (ext != NULL)
11937 len = ext - to_parse;
11938 else
11939 len = strlen (to_parse);
11940
11941 if (len == 0)
11942 return AARCH64_PARSE_MISSING_ARG;
11943
11944
11945 /* Loop through the list of supported ARCHes to find a match. */
11946 for (arch = all_architectures; arch->name != NULL; arch++)
11947 {
11948 if (strlen (arch->name) == len
11949 && strncmp (arch->name, to_parse, len) == 0)
11950 {
11951 uint64_t isa_temp = arch->flags;
11952
11953 if (ext != NULL)
11954 {
11955 /* TO_PARSE string contains at least one extension. */
11956 enum aarch64_parse_opt_result ext_res
11957 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11958
11959 if (ext_res != AARCH64_PARSE_OK)
11960 return ext_res;
11961 }
11962 /* Extension parsing was successful. Confirm the result
11963 arch and ISA flags. */
11964 *res = arch;
11965 *isa_flags = isa_temp;
11966 return AARCH64_PARSE_OK;
11967 }
11968 }
11969
11970 /* ARCH name not found in list. */
11971 return AARCH64_PARSE_INVALID_ARG;
11972 }
11973
11974 /* Parse the TO_PARSE string and put the result tuning in RES and the
11975 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
11976 describing the parse result. If there is an error parsing, RES and
11977 ISA_FLAGS are left unchanged.
11978 When the TO_PARSE string contains an invalid extension,
11979 a copy of the string is created and stored to INVALID_EXTENSION. */
11980
11981 static enum aarch64_parse_opt_result
11982 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
11983 uint64_t *isa_flags, std::string *invalid_extension)
11984 {
11985 const char *ext;
11986 const struct processor *cpu;
11987 size_t len;
11988
11989 ext = strchr (to_parse, '+');
11990
11991 if (ext != NULL)
11992 len = ext - to_parse;
11993 else
11994 len = strlen (to_parse);
11995
11996 if (len == 0)
11997 return AARCH64_PARSE_MISSING_ARG;
11998
11999
12000 /* Loop through the list of supported CPUs to find a match. */
12001 for (cpu = all_cores; cpu->name != NULL; cpu++)
12002 {
12003 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
12004 {
12005 uint64_t isa_temp = cpu->flags;
12006
12007
12008 if (ext != NULL)
12009 {
12010 /* TO_PARSE string contains at least one extension. */
12011 enum aarch64_parse_opt_result ext_res
12012 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
12013
12014 if (ext_res != AARCH64_PARSE_OK)
12015 return ext_res;
12016 }
12017 	  /* Extension parsing was successful.  Confirm the result
12018 cpu and ISA flags. */
12019 *res = cpu;
12020 *isa_flags = isa_temp;
12021 return AARCH64_PARSE_OK;
12022 }
12023 }
12024
12025 /* CPU name not found in list. */
12026 return AARCH64_PARSE_INVALID_ARG;
12027 }
12028
12029 /* Parse the TO_PARSE string and put the cpu it selects into RES.
12030 Return an aarch64_parse_opt_result describing the parse result.
12031    If parsing fails, RES is left unchanged.  */
12032
12033 static enum aarch64_parse_opt_result
12034 aarch64_parse_tune (const char *to_parse, const struct processor **res)
12035 {
12036 const struct processor *cpu;
12037
12038 /* Loop through the list of supported CPUs to find a match. */
12039 for (cpu = all_cores; cpu->name != NULL; cpu++)
12040 {
12041 if (strcmp (cpu->name, to_parse) == 0)
12042 {
12043 *res = cpu;
12044 return AARCH64_PARSE_OK;
12045 }
12046 }
12047
12048 /* CPU name not found in list. */
12049 return AARCH64_PARSE_INVALID_ARG;
12050 }
12051
12052 /* Parse TOKEN, which has length LENGTH, to see if it is an option
12053 described in FLAG. If it is, return the index bit for that fusion type.
12054 If not, error (printing OPTION_NAME) and return zero. */
12055
12056 static unsigned int
12057 aarch64_parse_one_option_token (const char *token,
12058 size_t length,
12059 const struct aarch64_flag_desc *flag,
12060 const char *option_name)
12061 {
12062 for (; flag->name != NULL; flag++)
12063 {
12064 if (length == strlen (flag->name)
12065 && !strncmp (flag->name, token, length))
12066 return flag->flag;
12067 }
12068
12069 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
12070 return 0;
12071 }
12072
12073 /* Parse OPTION, which is a dot-separated list of flags to enable.
12074 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
12075 default state we inherit from the CPU tuning structures. OPTION_NAME
12076 gives the top-level option we are parsing in the -moverride string,
12077 for use in error messages. */
12078
12079 static unsigned int
12080 aarch64_parse_boolean_options (const char *option,
12081 const struct aarch64_flag_desc *flags,
12082 unsigned int initial_state,
12083 const char *option_name)
12084 {
12085 const char separator = '.';
12086 const char* specs = option;
12087 const char* ntoken = option;
12088 unsigned int found_flags = initial_state;
12089
12090 while ((ntoken = strchr (specs, separator)))
12091 {
12092 size_t token_length = ntoken - specs;
12093 unsigned token_ops = aarch64_parse_one_option_token (specs,
12094 token_length,
12095 flags,
12096 option_name);
12097 /* If we find "none" (or, for simplicity's sake, an error) anywhere
12098 in the token stream, reset the supported operations. So:
12099
12100 adrp+add.cmp+branch.none.adrp+add
12101
12102 would have the result of turning on only adrp+add fusion. */
12103 if (!token_ops)
12104 found_flags = 0;
12105
12106 found_flags |= token_ops;
12107 specs = ++ntoken;
12108 }
12109
12110   /* The string ended with a trailing separator; diagnose it.  */
12111 if (!(*specs))
12112 {
12113 error ("%s string ill-formed\n", option_name);
12114 return 0;
12115 }
12116
12117 /* We still have one more token to parse. */
12118 size_t token_length = strlen (specs);
12119 unsigned token_ops = aarch64_parse_one_option_token (specs,
12120 token_length,
12121 flags,
12122 option_name);
12123 if (!token_ops)
12124 found_flags = 0;
12125
12126 found_flags |= token_ops;
12127 return found_flags;
12128 }
12129
12130 /* Support for overriding instruction fusion. */
12131
12132 static void
12133 aarch64_parse_fuse_string (const char *fuse_string,
12134 struct tune_params *tune)
12135 {
12136 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
12137 aarch64_fusible_pairs,
12138 tune->fusible_ops,
12139 "fuse=");
12140 }
12141
12142 /* Support for overriding other tuning flags. */
12143
12144 static void
12145 aarch64_parse_tune_string (const char *tune_string,
12146 struct tune_params *tune)
12147 {
12148 tune->extra_tuning_flags
12149 = aarch64_parse_boolean_options (tune_string,
12150 aarch64_tuning_flags,
12151 tune->extra_tuning_flags,
12152 "tune=");
12153 }
12154
12155 /* Parse the sve_width tuning moverride string in TUNE_STRING.
12156 Accept the valid SVE vector widths allowed by
12157 aarch64_sve_vector_bits_enum and use it to override sve_width
12158 in TUNE. */
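/* For example (illustrative), "-moverride=sve_width=256" sets sve_width to
   SVE_256, i.e. tunes for 256-bit SVE vectors.  */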
12159
12160 static void
12161 aarch64_parse_sve_width_string (const char *tune_string,
12162 struct tune_params *tune)
12163 {
12164 int width = -1;
12165
12166 int n = sscanf (tune_string, "%d", &width);
12167 if (n == EOF)
12168 {
12169 error ("invalid format for sve_width");
12170 return;
12171 }
12172 switch (width)
12173 {
12174 case SVE_128:
12175 case SVE_256:
12176 case SVE_512:
12177 case SVE_1024:
12178 case SVE_2048:
12179 break;
12180 default:
12181 error ("invalid sve_width value: %d", width);
12182 }
12183 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
12184 }
12185
12186 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
12187    we understand.  If it is, extract the option string and hand off to
12188 the appropriate function. */
12189
12190 void
12191 aarch64_parse_one_override_token (const char* token,
12192 size_t length,
12193 struct tune_params *tune)
12194 {
12195 const struct aarch64_tuning_override_function *fn
12196 = aarch64_tuning_override_functions;
12197
12198 const char *option_part = strchr (token, '=');
12199 if (!option_part)
12200 {
12201 error ("tuning string missing in option (%s)", token);
12202 return;
12203 }
12204
12205 /* Get the length of the option name. */
12206 length = option_part - token;
12207 /* Skip the '=' to get to the option string. */
12208 option_part++;
12209
12210 for (; fn->name != NULL; fn++)
12211 {
12212 if (!strncmp (fn->name, token, length))
12213 {
12214 fn->parse_override (option_part, tune);
12215 return;
12216 }
12217 }
12218
12219   error ("unknown tuning option (%s)", token);
12220 return;
12221 }
12222
12223 /* Set the default TLS size and clamp it to what the code model allows.  */
12224
12225 static void
12226 initialize_aarch64_tls_size (struct gcc_options *opts)
12227 {
12228 if (aarch64_tls_size == 0)
12229 aarch64_tls_size = 24;
12230
12231 switch (opts->x_aarch64_cmodel_var)
12232 {
12233 case AARCH64_CMODEL_TINY:
12234       /* Both the default and maximum TLS size allowed under tiny are 1M, which
12235 	 needs two instructions to address, so we clamp the size to 24 bits.  */
12236 if (aarch64_tls_size > 24)
12237 aarch64_tls_size = 24;
12238 break;
12239 case AARCH64_CMODEL_SMALL:
12240 /* The maximum TLS size allowed under small is 4G. */
12241 if (aarch64_tls_size > 32)
12242 aarch64_tls_size = 32;
12243 break;
12244 case AARCH64_CMODEL_LARGE:
12245 /* The maximum TLS size allowed under large is 16E.
12246 	 FIXME: 16E should be 64-bit; we only support a 48-bit offset now.  */
12247 if (aarch64_tls_size > 48)
12248 aarch64_tls_size = 48;
12249 break;
12250 default:
12251 gcc_unreachable ();
12252 }
12253
12254 return;
12255 }
12256
12257 /* Parse STRING looking for options in the format:
12258 string :: option:string
12259 option :: name=substring
12260 name :: {a-z}
12261 substring :: defined by option. */
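/* An illustrative -moverride string combining the handlers above would be
   "-moverride=fuse=adrp+add.cmp+branch:sve_width=256", which selects two
   fusion pairs and a 256-bit SVE tuning width.  */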
12262
12263 static void
12264 aarch64_parse_override_string (const char* input_string,
12265 struct tune_params* tune)
12266 {
12267 const char separator = ':';
12268 size_t string_length = strlen (input_string) + 1;
12269 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
12270 char *string = string_root;
12271 strncpy (string, input_string, string_length);
12272 string[string_length - 1] = '\0';
12273
12274 char* ntoken = string;
12275
12276 while ((ntoken = strchr (string, separator)))
12277 {
12278 size_t token_length = ntoken - string;
12279 /* Make this substring look like a string. */
12280 *ntoken = '\0';
12281 aarch64_parse_one_override_token (string, token_length, tune);
12282 string = ++ntoken;
12283 }
12284
12285 /* One last option to parse. */
12286 aarch64_parse_one_override_token (string, strlen (string), tune);
12287 free (string_root);
12288 }
12289
12290
12291 static void
12292 aarch64_override_options_after_change_1 (struct gcc_options *opts)
12293 {
12294 if (accepted_branch_protection_string)
12295 {
12296 opts->x_aarch64_branch_protection_string
12297 = xstrdup (accepted_branch_protection_string);
12298 }
12299
12300 /* PR 70044: We have to be careful about being called multiple times for the
12301 same function. This means all changes should be repeatable. */
12302
12303 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
12304 Disable the frame pointer flag so the mid-end will not use a frame
12305 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
12306 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
12307 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
12308 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
12309 if (opts->x_flag_omit_frame_pointer == 0)
12310 opts->x_flag_omit_frame_pointer = 2;
12311
12312 /* If not optimizing for size, set the default
12313 alignment to what the target wants. */
12314 if (!opts->x_optimize_size)
12315 {
12316 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
12317 opts->x_str_align_loops = aarch64_tune_params.loop_align;
12318 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
12319 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
12320 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
12321 opts->x_str_align_functions = aarch64_tune_params.function_align;
12322 }
12323
12324 /* We default to no pc-relative literal loads. */
12325
12326 aarch64_pcrelative_literal_loads = false;
12327
12328 /* If -mpc-relative-literal-loads is set on the command line, this
12329 implies that the user asked for PC relative literal loads. */
12330 if (opts->x_pcrelative_literal_loads == 1)
12331 aarch64_pcrelative_literal_loads = true;
12332
12333 /* In the tiny memory model it makes no sense to disallow PC relative
12334 literal pool loads. */
12335 if (aarch64_cmodel == AARCH64_CMODEL_TINY
12336 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12337 aarch64_pcrelative_literal_loads = true;
12338
12339 /* When enabling the lower precision Newton series for the square root, also
12340 enable it for the reciprocal square root, since the latter is an
12341 intermediary step for the former. */
12342 if (flag_mlow_precision_sqrt)
12343 flag_mrecip_low_precision_sqrt = true;
12344 }
12345
12346 /* 'Unpack' the internal tuning structs and update the options
12347 in OPTS. The caller must have set up selected_tune and selected_arch
12348 as all the other target-specific codegen decisions are
12349 derived from them. */
12350
12351 void
12352 aarch64_override_options_internal (struct gcc_options *opts)
12353 {
12354 aarch64_tune_flags = selected_tune->flags;
12355 aarch64_tune = selected_tune->sched_core;
12356 /* Make a copy of the tuning parameters attached to the core, which
12357 we may later overwrite. */
12358 aarch64_tune_params = *(selected_tune->tune);
12359 aarch64_architecture_version = selected_arch->architecture_version;
12360
12361 if (opts->x_aarch64_override_tune_string)
12362 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
12363 &aarch64_tune_params);
12364
12365 /* This target defaults to strict volatile bitfields. */
12366 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
12367 opts->x_flag_strict_volatile_bitfields = 1;
12368
12369 if (aarch64_stack_protector_guard == SSP_GLOBAL
12370 && opts->x_aarch64_stack_protector_guard_offset_str)
12371 {
12372 error ("incompatible options %<-mstack-protector-guard=global%> and "
12373 "%<-mstack-protector-guard-offset=%s%>",
12374 aarch64_stack_protector_guard_offset_str);
12375 }
12376
12377 if (aarch64_stack_protector_guard == SSP_SYSREG
12378 && !(opts->x_aarch64_stack_protector_guard_offset_str
12379 && opts->x_aarch64_stack_protector_guard_reg_str))
12380 {
12381 error ("both %<-mstack-protector-guard-offset%> and "
12382 "%<-mstack-protector-guard-reg%> must be used "
12383 "with %<-mstack-protector-guard=sysreg%>");
12384 }
12385
12386 if (opts->x_aarch64_stack_protector_guard_reg_str)
12387 {
12388 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
12389 error ("specify a system register with a small string length.");
12390 }
12391
12392 if (opts->x_aarch64_stack_protector_guard_offset_str)
12393 {
12394 char *end;
12395 const char *str = aarch64_stack_protector_guard_offset_str;
12396 errno = 0;
12397 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
12398 if (!*str || *end || errno)
12399 error ("%qs is not a valid offset in %qs", str,
12400 "-mstack-protector-guard-offset=");
12401 aarch64_stack_protector_guard_offset = offs;
12402 }
12403
12404 initialize_aarch64_code_model (opts);
12405 initialize_aarch64_tls_size (opts);
12406
12407 int queue_depth = 0;
12408 switch (aarch64_tune_params.autoprefetcher_model)
12409 {
12410 case tune_params::AUTOPREFETCHER_OFF:
12411 queue_depth = -1;
12412 break;
12413 case tune_params::AUTOPREFETCHER_WEAK:
12414 queue_depth = 0;
12415 break;
12416 case tune_params::AUTOPREFETCHER_STRONG:
12417 queue_depth = max_insn_queue_index + 1;
12418 break;
12419 default:
12420 gcc_unreachable ();
12421 }
12422
12423 /* We don't mind passing in global_options_set here as we don't use
12424 the *options_set structs anyway. */
12425 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
12426 queue_depth,
12427 opts->x_param_values,
12428 global_options_set.x_param_values);
12429
12430 /* Set up parameters to be used in prefetching algorithm. Do not
12431 override the defaults unless we are tuning for a core we have
12432 researched values for. */
12433 if (aarch64_tune_params.prefetch->num_slots > 0)
12434 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
12435 aarch64_tune_params.prefetch->num_slots,
12436 opts->x_param_values,
12437 global_options_set.x_param_values);
12438 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
12439 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
12440 aarch64_tune_params.prefetch->l1_cache_size,
12441 opts->x_param_values,
12442 global_options_set.x_param_values);
12443 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
12444 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
12445 aarch64_tune_params.prefetch->l1_cache_line_size,
12446 opts->x_param_values,
12447 global_options_set.x_param_values);
12448 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
12449 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
12450 aarch64_tune_params.prefetch->l2_cache_size,
12451 opts->x_param_values,
12452 global_options_set.x_param_values);
12453 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
12454 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
12455 0,
12456 opts->x_param_values,
12457 global_options_set.x_param_values);
12458 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
12459 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
12460 aarch64_tune_params.prefetch->minimum_stride,
12461 opts->x_param_values,
12462 global_options_set.x_param_values);
12463
12464 /* Use the alternative scheduling-pressure algorithm by default. */
12465 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
12466 opts->x_param_values,
12467 global_options_set.x_param_values);
12468
12469 /* If the user hasn't changed it via configure, then set the default to 64 KB
12470 for the backend. */
12471 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
12472 DEFAULT_STK_CLASH_GUARD_SIZE == 0
12473 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
12474 opts->x_param_values,
12475 global_options_set.x_param_values);
12476
12477 /* Validate the guard size. */
12478 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
12479
12480 /* Enforce that the probing interval is the same as the guard size so the
12481 mid-end does the right thing. */
12482 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
12483 guard_size,
12484 opts->x_param_values,
12485 global_options_set.x_param_values);
12486
12487 /* The maybe_set calls won't update the value if the user has explicitly set
12488 one. Which means we need to validate that probing interval and guard size
12489 are equal. */
12490 int probe_interval
12491 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
12492 if (guard_size != probe_interval)
12493 error ("stack clash guard size %<%d%> must be equal to probing interval "
12494 "%<%d%>", guard_size, probe_interval);
12495
12496 /* Enable software prefetching at the specified optimization level for
12497 cores that have prefetch tuning parameters, provided the user has not
12498 set -fprefetch-loop-arrays explicitly and we are not optimizing for size. */
12499 if (opts->x_flag_prefetch_loop_arrays < 0
12500 && !opts->x_optimize_size
12501 && aarch64_tune_params.prefetch->default_opt_level >= 0
12502 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
12503 opts->x_flag_prefetch_loop_arrays = 1;
12504
12505 if (opts->x_aarch64_arch_string == NULL)
12506 opts->x_aarch64_arch_string = selected_arch->name;
12507 if (opts->x_aarch64_cpu_string == NULL)
12508 opts->x_aarch64_cpu_string = selected_cpu->name;
12509 if (opts->x_aarch64_tune_string == NULL)
12510 opts->x_aarch64_tune_string = selected_tune->name;
12511
12512 aarch64_override_options_after_change_1 (opts);
12513 }
12514
12515 /* Print a hint with a suggestion for a core or architecture name that
12516 most closely resembles what the user passed in STR. ARCH is true if
12517 the user is asking for an architecture name. ARCH is false if the user
12518 is asking for a core name. */
12519
12520 static void
12521 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
12522 {
12523 auto_vec<const char *> candidates;
12524 const struct processor *entry = arch ? all_architectures : all_cores;
12525 for (; entry->name != NULL; entry++)
12526 candidates.safe_push (entry->name);
12527
12528 #ifdef HAVE_LOCAL_CPU_DETECT
12529 /* Also add "native" as a possible value. */
12530 if (arch)
12531 candidates.safe_push ("native");
12532 #endif
12533
12534 char *s;
12535 const char *hint = candidates_list_and_hint (str, s, candidates);
12536 if (hint)
12537 inform (input_location, "valid arguments are: %s;"
12538 " did you mean %qs?", s, hint);
12539 else
12540 inform (input_location, "valid arguments are: %s", s);
12541
12542 XDELETEVEC (s);
12543 }
12544
12545 /* Print a hint with a suggestion for a core name that most closely resembles
12546 what the user passed in STR. */
12547
12548 inline static void
12549 aarch64_print_hint_for_core (const char *str)
12550 {
12551 aarch64_print_hint_for_core_or_arch (str, false);
12552 }
12553
12554 /* Print a hint with a suggestion for an architecture name that most closely
12555 resembles what the user passed in STR. */
12556
12557 inline static void
12558 aarch64_print_hint_for_arch (const char *str)
12559 {
12560 aarch64_print_hint_for_core_or_arch (str, true);
12561 }
12562
12563
12564 /* Print a hint with a suggestion for an extension name
12565 that most closely resembles what the user passed in STR. */
12566
12567 void
12568 aarch64_print_hint_for_extensions (const std::string &str)
12569 {
12570 auto_vec<const char *> candidates;
12571 aarch64_get_all_extension_candidates (&candidates);
12572 char *s;
12573 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
12574 if (hint)
12575 inform (input_location, "valid arguments are: %s;"
12576 " did you mean %qs?", s, hint);
12577 else
12578 inform (input_location, "valid arguments are: %s", s);
12579
12580 XDELETEVEC (s);
12581 }
12582
12583 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
12584 specified in STR and throw errors if appropriate.  Put the results,
12585 if they are valid, in RES and ISA_FLAGS.  Return whether the option is
12586 valid. */
12587
12588 static bool
12589 aarch64_validate_mcpu (const char *str, const struct processor **res,
12590 uint64_t *isa_flags)
12591 {
12592 std::string invalid_extension;
12593 enum aarch64_parse_opt_result parse_res
12594 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
12595
12596 if (parse_res == AARCH64_PARSE_OK)
12597 return true;
12598
12599 switch (parse_res)
12600 {
12601 case AARCH64_PARSE_MISSING_ARG:
12602 error ("missing cpu name in %<-mcpu=%s%>", str);
12603 break;
12604 case AARCH64_PARSE_INVALID_ARG:
12605 error ("unknown value %qs for %<-mcpu%>", str);
12606 aarch64_print_hint_for_core (str);
12607 break;
12608 case AARCH64_PARSE_INVALID_FEATURE:
12609 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
12610 invalid_extension.c_str (), str);
12611 aarch64_print_hint_for_extensions (invalid_extension);
12612 break;
12613 default:
12614 gcc_unreachable ();
12615 }
12616
12617 return false;
12618 }
12619
12620 /* Parses CONST_STR for branch protection features specified in
12621 aarch64_branch_protect_types, and sets any global variables required.
12622 Returns the parsing result and copies the last processed token from
12623 CONST_STR into LAST_STR so that it can be used for error reporting. */
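/* For example, a string such as "standard" or "pac-ret+leaf+b-key+bti" is
   split on '+': "pac-ret" matches a top-level type, "leaf" and "b-key"
   match its subtypes, and "bti" matches another top-level type.  The exact
   set of accepted names is defined by aarch64_branch_protect_types.  */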
12624
12625 static enum
12626 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
12627 char** last_str)
12628 {
12629 char *str_root = xstrdup (const_str);
12630 char* token_save = NULL;
12631 char *str = strtok_r (str_root, "+", &token_save);
12632 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
12633 if (!str)
12634 res = AARCH64_PARSE_MISSING_ARG;
12635 else
12636 {
12637 char *next_str = strtok_r (NULL, "+", &token_save);
12638 /* Reset the branch protection features to their defaults. */
12639 aarch64_handle_no_branch_protection (NULL, NULL);
12640
12641 while (str && res == AARCH64_PARSE_OK)
12642 {
12643 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
12644 bool found = false;
12645 /* Search for this type. */
12646 while (type && type->name && !found && res == AARCH64_PARSE_OK)
12647 {
12648 if (strcmp (str, type->name) == 0)
12649 {
12650 found = true;
12651 res = type->handler (str, next_str);
12652 str = next_str;
12653 next_str = strtok_r (NULL, "+", &token_save);
12654 }
12655 else
12656 type++;
12657 }
12658 if (found && res == AARCH64_PARSE_OK)
12659 {
12660 bool found_subtype = true;
12661 /* Loop through each token until we find one that isn't a
12662 subtype. */
12663 while (found_subtype)
12664 {
12665 found_subtype = false;
12666 const aarch64_branch_protect_type *subtype = type->subtypes;
12667 /* Search for the subtype. */
12668 while (str && subtype && subtype->name && !found_subtype
12669 && res == AARCH64_PARSE_OK)
12670 {
12671 if (strcmp (str, subtype->name) == 0)
12672 {
12673 found_subtype = true;
12674 res = subtype->handler (str, next_str);
12675 str = next_str;
12676 next_str = strtok_r (NULL, "+", &token_save);
12677 }
12678 else
12679 subtype++;
12680 }
12681 }
12682 }
12683 else if (!found)
12684 res = AARCH64_PARSE_INVALID_ARG;
12685 }
12686 }
12687 /* Copy the last processed token into the argument to pass it back.
12688 Used by option and attribute validation to print the offending token. */
12689 if (last_str)
12690 {
12691 if (str) strcpy (*last_str, str);
12692 else *last_str = NULL;
12693 }
12694 if (res == AARCH64_PARSE_OK)
12695 {
12696 /* If needed, allocate the accepted string and then copy in const_str.
12697 Used by aarch64_override_options_after_change_1. */
12698 if (!accepted_branch_protection_string)
12699 accepted_branch_protection_string = (char *) xmalloc (
12700 BRANCH_PROTECT_STR_MAX
12701 + 1);
12702 strncpy (accepted_branch_protection_string, const_str,
12703 BRANCH_PROTECT_STR_MAX + 1);
12704 /* Forcibly null-terminate. */
12705 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
12706 }
12707 return res;
12708 }
12709
12710 static bool
12711 aarch64_validate_mbranch_protection (const char *const_str)
12712 {
12713 char *str = (char *) xmalloc (strlen (const_str) + 1);
12714 enum aarch64_parse_opt_result res =
12715 aarch64_parse_branch_protection (const_str, &str);
12716 if (res == AARCH64_PARSE_INVALID_ARG)
12717 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
12718 else if (res == AARCH64_PARSE_MISSING_ARG)
12719 error ("missing argument for %<-mbranch-protection=%>");
12720 free (str);
12721 return res == AARCH64_PARSE_OK;
12722 }
12723
12724 /* Validate a command-line -march option. Parse the arch and extensions
12725 (if any) specified in STR and throw errors if appropriate. Put the
12726 results, if they are valid, in RES and ISA_FLAGS. Return whether the
12727 option is valid. */
12728
12729 static bool
12730 aarch64_validate_march (const char *str, const struct processor **res,
12731 uint64_t *isa_flags)
12732 {
12733 std::string invalid_extension;
12734 enum aarch64_parse_opt_result parse_res
12735 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
12736
12737 if (parse_res == AARCH64_PARSE_OK)
12738 return true;
12739
12740 switch (parse_res)
12741 {
12742 case AARCH64_PARSE_MISSING_ARG:
12743 error ("missing arch name in %<-march=%s%>", str);
12744 break;
12745 case AARCH64_PARSE_INVALID_ARG:
12746 error ("unknown value %qs for %<-march%>", str);
12747 aarch64_print_hint_for_arch (str);
12748 break;
12749 case AARCH64_PARSE_INVALID_FEATURE:
12750 error ("invalid feature modifier %qs in %<-march=%s%>",
12751 invalid_extension.c_str (), str);
12752 aarch64_print_hint_for_extensions (invalid_extension);
12753 break;
12754 default:
12755 gcc_unreachable ();
12756 }
12757
12758 return false;
12759 }
12760
12761 /* Validate a command-line -mtune option. Parse the cpu
12762 specified in STR and throw errors if appropriate. Put the
12763 result, if it is valid, in RES. Return whether the option is
12764 valid. */
12765
12766 static bool
12767 aarch64_validate_mtune (const char *str, const struct processor **res)
12768 {
12769 enum aarch64_parse_opt_result parse_res
12770 = aarch64_parse_tune (str, res);
12771
12772 if (parse_res == AARCH64_PARSE_OK)
12773 return true;
12774
12775 switch (parse_res)
12776 {
12777 case AARCH64_PARSE_MISSING_ARG:
12778 error ("missing cpu name in %<-mtune=%s%>", str);
12779 break;
12780 case AARCH64_PARSE_INVALID_ARG:
12781 error ("unknown value %qs for %<-mtune%>", str);
12782 aarch64_print_hint_for_core (str);
12783 break;
12784 default:
12785 gcc_unreachable ();
12786 }
12787 return false;
12788 }
12789
12790 /* Return the CPU corresponding to the enum CPU.
12791 If it doesn't specify a cpu, return the default. */
12792
12793 static const struct processor *
12794 aarch64_get_tune_cpu (enum aarch64_processor cpu)
12795 {
12796 if (cpu != aarch64_none)
12797 return &all_cores[cpu];
12798
12799 /* The & 0x3f is to extract the bottom 6 bits that encode the
12800 default cpu as selected by the --with-cpu GCC configure option
12801 in config.gcc.
12802 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
12803 flags mechanism should be reworked to make it more sane. */
12804 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12805 }
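/* A sketch of the assumed TARGET_CPU_DEFAULT encoding, inferred from the
   "& 0x3f" here and the ">> 6" in aarch64_override_options below: bits
   [5:0] hold the enum aarch64_processor value of the configure-time CPU,
   and the remaining bits hold that CPU's default ISA flags.  */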
12806
12807 /* Return the architecture corresponding to the enum ARCH.
12808 If it doesn't specify a valid architecture, return the default. */
12809
12810 static const struct processor *
12811 aarch64_get_arch (enum aarch64_arch arch)
12812 {
12813 if (arch != aarch64_no_arch)
12814 return &all_architectures[arch];
12815
12816 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12817
12818 return &all_architectures[cpu->arch];
12819 }
12820
12821 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
12822
12823 static poly_uint16
12824 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
12825 {
12826 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
12827 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
12828 deciding which .md file patterns to use and when deciding whether
12829 something is a legitimate address or constant. */
12830 if (value == SVE_SCALABLE || value == SVE_128)
12831 return poly_uint16 (2, 2);
12832 else
12833 return (int) value / 64;
12834 }
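/* Examples of the mapping implemented above (VG is the number of 64-bit
   granules in an SVE vector):
     -msve-vector-bits=scalable -> poly_uint16 (2, 2)  (VL-agnostic)
     -msve-vector-bits=128      -> poly_uint16 (2, 2)  (also VL-agnostic)
     -msve-vector-bits=256      -> 4   (256 / 64)
     -msve-vector-bits=512      -> 8   (512 / 64)  */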
12835
12836 /* Implement TARGET_OPTION_OVERRIDE.  This is called once at the beginning
12837 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
12838 tuning structs. In particular it must set selected_tune and
12839 aarch64_isa_flags that define the available ISA features and tuning
12840 decisions. It must also set selected_arch as this will be used to
12841 output the .arch asm tags for each function. */
12842
12843 static void
12844 aarch64_override_options (void)
12845 {
12846 uint64_t cpu_isa = 0;
12847 uint64_t arch_isa = 0;
12848 aarch64_isa_flags = 0;
12849
12850 bool valid_cpu = true;
12851 bool valid_tune = true;
12852 bool valid_arch = true;
12853
12854 selected_cpu = NULL;
12855 selected_arch = NULL;
12856 selected_tune = NULL;
12857
12858 if (aarch64_branch_protection_string)
12859 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
12860
12861 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
12862 If either of -march or -mtune is given, they override their
12863 respective component of -mcpu. */
12864 if (aarch64_cpu_string)
12865 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
12866 &cpu_isa);
12867
12868 if (aarch64_arch_string)
12869 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
12870 &arch_isa);
12871
12872 if (aarch64_tune_string)
12873 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
12874
12875 #ifdef SUBTARGET_OVERRIDE_OPTIONS
12876 SUBTARGET_OVERRIDE_OPTIONS;
12877 #endif
12878
12879 /* If the user did not specify a processor, choose the default
12880 one for them. This will be the CPU set during configuration using
12881 --with-cpu, otherwise it is "generic". */
12882 if (!selected_cpu)
12883 {
12884 if (selected_arch)
12885 {
12886 selected_cpu = &all_cores[selected_arch->ident];
12887 aarch64_isa_flags = arch_isa;
12888 explicit_arch = selected_arch->arch;
12889 }
12890 else
12891 {
12892 /* Get default configure-time CPU. */
12893 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
12894 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
12895 }
12896
12897 if (selected_tune)
12898 explicit_tune_core = selected_tune->ident;
12899 }
12900 /* If both -mcpu and -march are specified check that they are architecturally
12901 compatible, warn if they're not and prefer the -march ISA flags. */
12902 else if (selected_arch)
12903 {
12904 if (selected_arch->arch != selected_cpu->arch)
12905 {
12906 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
12907 all_architectures[selected_cpu->arch].name,
12908 selected_arch->name);
12909 }
12910 aarch64_isa_flags = arch_isa;
12911 explicit_arch = selected_arch->arch;
12912 explicit_tune_core = selected_tune ? selected_tune->ident
12913 : selected_cpu->ident;
12914 }
12915 else
12916 {
12917 /* -mcpu but no -march. */
12918 aarch64_isa_flags = cpu_isa;
12919 explicit_tune_core = selected_tune ? selected_tune->ident
12920 : selected_cpu->ident;
12921 gcc_assert (selected_cpu);
12922 selected_arch = &all_architectures[selected_cpu->arch];
12923 explicit_arch = selected_arch->arch;
12924 }
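/* A sketch of how the cases above combine (the CPU and arch names are only
   examples; Cortex-A57 is an Armv8-A core in all_cores):
     no -mcpu/-march:                   configure-time default, else generic.
     -march=armv8-a only:               selected_cpu comes from the matching
                                        all_cores entry for that arch, ISA
                                        flags come from the arch.
     -mcpu=cortex-a57 only:             arch, tune and ISA all follow the CPU.
     -mcpu=cortex-a57 -march=armv8.2-a: mismatch warning above; the -march
                                        ISA flags and arch win, while tuning
                                        stays with the CPU.  */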
12925
12926 /* Set the arch as well, as we will need it when outputting
12927 the .arch directive in assembly. */
12928 if (!selected_arch)
12929 {
12930 gcc_assert (selected_cpu);
12931 selected_arch = &all_architectures[selected_cpu->arch];
12932 }
12933
12934 if (!selected_tune)
12935 selected_tune = selected_cpu;
12936
12937 if (aarch64_enable_bti == 2)
12938 {
12939 #ifdef TARGET_ENABLE_BTI
12940 aarch64_enable_bti = 1;
12941 #else
12942 aarch64_enable_bti = 0;
12943 #endif
12944 }
12945
12946 /* Return address signing is currently not supported for ILP32 targets. For
12947 LP64 targets use the configured option in the absence of a command-line
12948 option for -mbranch-protection. */
12949 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
12950 {
12951 #ifdef TARGET_ENABLE_PAC_RET
12952 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
12953 #else
12954 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
12955 #endif
12956 }
12957
12958 #ifndef HAVE_AS_MABI_OPTION
12959 /* The compiler may have been configured with 2.23.* binutils, which does
12960 not have support for ILP32. */
12961 if (TARGET_ILP32)
12962 error ("assembler does not support %<-mabi=ilp32%>");
12963 #endif
12964
12965 /* Convert -msve-vector-bits to a VG count. */
12966 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
12967
12968 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
12969 sorry ("return address signing is only supported for %<-mabi=lp64%>");
12970
12971 /* Make sure we properly set up the explicit options. */
12972 if ((aarch64_cpu_string && valid_cpu)
12973 || (aarch64_tune_string && valid_tune))
12974 gcc_assert (explicit_tune_core != aarch64_none);
12975
12976 if ((aarch64_cpu_string && valid_cpu)
12977 || (aarch64_arch_string && valid_arch))
12978 gcc_assert (explicit_arch != aarch64_no_arch);
12979
12980 /* The pass to insert speculation tracking runs before
12981 shrink-wrapping and the latter does not know how to update the
12982 tracking status. So disable it in this case. */
12983 if (aarch64_track_speculation)
12984 flag_shrink_wrap = 0;
12985
12986 aarch64_override_options_internal (&global_options);
12987
12988 /* Save these options as the default ones in case we push and pop them later
12989 while processing functions with potential target attributes. */
12990 target_option_default_node = target_option_current_node
12991 = build_target_option_node (&global_options);
12992 }
12993
12994 /* Implement targetm.override_options_after_change. */
12995
12996 static void
12997 aarch64_override_options_after_change (void)
12998 {
12999 aarch64_override_options_after_change_1 (&global_options);
13000 }
13001
13002 static struct machine_function *
13003 aarch64_init_machine_status (void)
13004 {
13005 struct machine_function *machine;
13006 machine = ggc_cleared_alloc<machine_function> ();
13007 return machine;
13008 }
13009
13010 void
13011 aarch64_init_expanders (void)
13012 {
13013 init_machine_status = aarch64_init_machine_status;
13014 }
13015
13016 /* Select the code model: set aarch64_cmodel from OPTS, adjusting for PIC. */
13017 static void
13018 initialize_aarch64_code_model (struct gcc_options *opts)
13019 {
13020 if (opts->x_flag_pic)
13021 {
13022 switch (opts->x_aarch64_cmodel_var)
13023 {
13024 case AARCH64_CMODEL_TINY:
13025 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
13026 break;
13027 case AARCH64_CMODEL_SMALL:
13028 #ifdef HAVE_AS_SMALL_PIC_RELOCS
13029 aarch64_cmodel = (flag_pic == 2
13030 ? AARCH64_CMODEL_SMALL_PIC
13031 : AARCH64_CMODEL_SMALL_SPIC);
13032 #else
13033 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
13034 #endif
13035 break;
13036 case AARCH64_CMODEL_LARGE:
13037 sorry ("code model %qs with %<-f%s%>", "large",
13038 opts->x_flag_pic > 1 ? "PIC" : "pic");
13039 break;
13040 default:
13041 gcc_unreachable ();
13042 }
13043 }
13044 else
13045 aarch64_cmodel = opts->x_aarch64_cmodel_var;
13046 }
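/* Resulting model for the combinations handled above (assuming an assembler
   with HAVE_AS_SMALL_PIC_RELOCS):
     -mcmodel=tiny  -fpic/-fPIC -> AARCH64_CMODEL_TINY_PIC
     -mcmodel=small -fpic       -> AARCH64_CMODEL_SMALL_SPIC
     -mcmodel=small -fPIC       -> AARCH64_CMODEL_SMALL_PIC
     -mcmodel=large -fpic/-fPIC -> rejected with sorry ()
     no -fpic/-fPIC             -> the requested model is used unchanged.  */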
13047
13048 /* Implement TARGET_OPTION_SAVE. */
13049
13050 static void
13051 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
13052 {
13053 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
13054 ptr->x_aarch64_branch_protection_string
13055 = opts->x_aarch64_branch_protection_string;
13056 }
13057
13058 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
13059 using the information saved in PTR. */
13060
13061 static void
13062 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
13063 {
13064 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
13065 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13066 opts->x_explicit_arch = ptr->x_explicit_arch;
13067 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
13068 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
13069 opts->x_aarch64_branch_protection_string
13070 = ptr->x_aarch64_branch_protection_string;
13071 if (opts->x_aarch64_branch_protection_string)
13072 {
13073 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
13074 NULL);
13075 }
13076
13077 aarch64_override_options_internal (opts);
13078 }
13079
13080 /* Implement TARGET_OPTION_PRINT. */
13081
13082 static void
13083 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
13084 {
13085 const struct processor *cpu
13086 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13087 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
13088 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
13089 std::string extension
13090 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
13091
13092 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
13093 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
13094 arch->name, extension.c_str ());
13095 }
13096
13097 static GTY(()) tree aarch64_previous_fndecl;
13098
13099 void
13100 aarch64_reset_previous_fndecl (void)
13101 {
13102 aarch64_previous_fndecl = NULL;
13103 }
13104
13105 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
13106 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
13107 make sure optab availability predicates are recomputed when necessary. */
13108
13109 void
13110 aarch64_save_restore_target_globals (tree new_tree)
13111 {
13112 if (TREE_TARGET_GLOBALS (new_tree))
13113 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
13114 else if (new_tree == target_option_default_node)
13115 restore_target_globals (&default_target_globals);
13116 else
13117 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
13118 }
13119
13120 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
13121 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
13122 of the function, if such exists. This function may be called multiple
13123 times on a single function so use aarch64_previous_fndecl to avoid
13124 setting up identical state. */
13125
13126 static void
13127 aarch64_set_current_function (tree fndecl)
13128 {
13129 if (!fndecl || fndecl == aarch64_previous_fndecl)
13130 return;
13131
13132 tree old_tree = (aarch64_previous_fndecl
13133 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
13134 : NULL_TREE);
13135
13136 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13137
13138 /* If current function has no attributes but the previous one did,
13139 use the default node. */
13140 if (!new_tree && old_tree)
13141 new_tree = target_option_default_node;
13142
13143 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
13144 the default have been handled by aarch64_save_restore_target_globals from
13145 aarch64_pragma_target_parse. */
13146 if (old_tree == new_tree)
13147 return;
13148
13149 aarch64_previous_fndecl = fndecl;
13150
13151 /* First set the target options. */
13152 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
13153
13154 aarch64_save_restore_target_globals (new_tree);
13155 }
13156
13157 /* Enum describing the various ways we can handle attributes.
13158 In many cases we can reuse the generic option handling machinery. */
13159
13160 enum aarch64_attr_opt_type
13161 {
13162 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
13163 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
13164 aarch64_attr_enum, /* Attribute sets an enum variable. */
13165 aarch64_attr_custom /* Attribute requires a custom handling function. */
13166 };
13167
13168 /* All the information needed to handle a target attribute.
13169 NAME is the name of the attribute.
13170 ATTR_TYPE specifies the type of behavior of the attribute as described
13171 in the definition of enum aarch64_attr_opt_type.
13172 ALLOW_NEG is true if the attribute supports a "no-" form.
13173 HANDLER is the function that takes the attribute string as an argument.
13174 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
13175 OPT_NUM is the enum specifying the option that the attribute modifies.
13176 This is needed for attributes that mirror the behavior of a command-line
13177 option, that is, those whose ATTR_TYPE is aarch64_attr_mask,
13178 aarch64_attr_bool or aarch64_attr_enum. */
13179
13180 struct aarch64_attribute_info
13181 {
13182 const char *name;
13183 enum aarch64_attr_opt_type attr_type;
13184 bool allow_neg;
13185 bool (*handler) (const char *);
13186 enum opt_code opt_num;
13187 };
13188
13189 /* Handle the ARCH_STR argument to the arch= target attribute. */
13190
13191 static bool
13192 aarch64_handle_attr_arch (const char *str)
13193 {
13194 const struct processor *tmp_arch = NULL;
13195 std::string invalid_extension;
13196 enum aarch64_parse_opt_result parse_res
13197 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
13198
13199 if (parse_res == AARCH64_PARSE_OK)
13200 {
13201 gcc_assert (tmp_arch);
13202 selected_arch = tmp_arch;
13203 explicit_arch = selected_arch->arch;
13204 return true;
13205 }
13206
13207 switch (parse_res)
13208 {
13209 case AARCH64_PARSE_MISSING_ARG:
13210 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
13211 break;
13212 case AARCH64_PARSE_INVALID_ARG:
13213 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
13214 aarch64_print_hint_for_arch (str);
13215 break;
13216 case AARCH64_PARSE_INVALID_FEATURE:
13217 error ("invalid feature modifier %s of value (\"%s\") in "
13218 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13219 aarch64_print_hint_for_extensions (invalid_extension);
13220 break;
13221 default:
13222 gcc_unreachable ();
13223 }
13224
13225 return false;
13226 }
13227
13228 /* Handle the argument CPU_STR to the cpu= target attribute. */
13229
13230 static bool
13231 aarch64_handle_attr_cpu (const char *str)
13232 {
13233 const struct processor *tmp_cpu = NULL;
13234 std::string invalid_extension;
13235 enum aarch64_parse_opt_result parse_res
13236 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
13237
13238 if (parse_res == AARCH64_PARSE_OK)
13239 {
13240 gcc_assert (tmp_cpu);
13241 selected_tune = tmp_cpu;
13242 explicit_tune_core = selected_tune->ident;
13243
13244 selected_arch = &all_architectures[tmp_cpu->arch];
13245 explicit_arch = selected_arch->arch;
13246 return true;
13247 }
13248
13249 switch (parse_res)
13250 {
13251 case AARCH64_PARSE_MISSING_ARG:
13252 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
13253 break;
13254 case AARCH64_PARSE_INVALID_ARG:
13255 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
13256 aarch64_print_hint_for_core (str);
13257 break;
13258 case AARCH64_PARSE_INVALID_FEATURE:
13259 error ("invalid feature modifier %s of value (\"%s\") in "
13260 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13261 aarch64_print_hint_for_extensions (invalid_extension);
13262 break;
13263 default:
13264 gcc_unreachable ();
13265 }
13266
13267 return false;
13268 }
13269
13270 /* Handle the argument STR to the branch-protection= attribute. */
13271
13272 static bool
13273 aarch64_handle_attr_branch_protection (const char* str)
13274 {
13275 char *err_str = (char *) xmalloc (strlen (str) + 1);
13276 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
13277 &err_str);
13278 bool success = false;
13279 switch (res)
13280 {
13281 case AARCH64_PARSE_MISSING_ARG:
13282 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
13283 " attribute");
13284 break;
13285 case AARCH64_PARSE_INVALID_ARG:
13286 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
13287 "=\")%> pragma or attribute", err_str);
13288 break;
13289 case AARCH64_PARSE_OK:
13290 success = true;
13291 /* Fall through. */
13292 case AARCH64_PARSE_INVALID_FEATURE:
13293 break;
13294 default:
13295 gcc_unreachable ();
13296 }
13297 free (err_str);
13298 return success;
13299 }
13300
13301 /* Handle the argument STR to the tune= target attribute. */
13302
13303 static bool
13304 aarch64_handle_attr_tune (const char *str)
13305 {
13306 const struct processor *tmp_tune = NULL;
13307 enum aarch64_parse_opt_result parse_res
13308 = aarch64_parse_tune (str, &tmp_tune);
13309
13310 if (parse_res == AARCH64_PARSE_OK)
13311 {
13312 gcc_assert (tmp_tune);
13313 selected_tune = tmp_tune;
13314 explicit_tune_core = selected_tune->ident;
13315 return true;
13316 }
13317
13318 switch (parse_res)
13319 {
13320 case AARCH64_PARSE_INVALID_ARG:
13321 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
13322 aarch64_print_hint_for_core (str);
13323 break;
13324 default:
13325 gcc_unreachable ();
13326 }
13327
13328 return false;
13329 }
13330
13331 /* Parse an architecture extensions target attribute string specified in STR.
13332 For example "+fp+nosimd". Show any errors if needed. Return TRUE
13333 if successful. Update aarch64_isa_flags to reflect the ISA features
13334 modified. */
13335
13336 static bool
13337 aarch64_handle_attr_isa_flags (char *str)
13338 {
13339 enum aarch64_parse_opt_result parse_res;
13340 uint64_t isa_flags = aarch64_isa_flags;
13341
13342 /* We allow "+nothing" at the beginning to clear out all architectural
13343 features if the user wants to handpick specific features. */
13344 if (strncmp ("+nothing", str, 8) == 0)
13345 {
13346 isa_flags = 0;
13347 str += 8;
13348 }
13349
13350 std::string invalid_extension;
13351 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
13352
13353 if (parse_res == AARCH64_PARSE_OK)
13354 {
13355 aarch64_isa_flags = isa_flags;
13356 return true;
13357 }
13358
13359 switch (parse_res)
13360 {
13361 case AARCH64_PARSE_MISSING_ARG:
13362 error ("missing value in %<target()%> pragma or attribute");
13363 break;
13364
13365 case AARCH64_PARSE_INVALID_FEATURE:
13366 error ("invalid feature modifier %s of value (\"%s\") in "
13367 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13368 break;
13369
13370 default:
13371 gcc_unreachable ();
13372 }
13373
13374 return false;
13375 }
13376
13377 /* The target attributes that we support. On top of these we also support just
13378 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
13379 handled explicitly in aarch64_process_one_target_attr. */
13380
13381 static const struct aarch64_attribute_info aarch64_attributes[] =
13382 {
13383 { "general-regs-only", aarch64_attr_mask, false, NULL,
13384 OPT_mgeneral_regs_only },
13385 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
13386 OPT_mfix_cortex_a53_835769 },
13387 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
13388 OPT_mfix_cortex_a53_843419 },
13389 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
13390 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
13391 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
13392 OPT_momit_leaf_frame_pointer },
13393 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
13394 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
13395 OPT_march_ },
13396 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
13397 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
13398 OPT_mtune_ },
13399 { "branch-protection", aarch64_attr_custom, false,
13400 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
13401 { "sign-return-address", aarch64_attr_enum, false, NULL,
13402 OPT_msign_return_address_ },
13403 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
13404 };
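/* Some illustrative uses of the entries above:
     __attribute__ ((target ("arch=armv8-a+crc")))  custom handler (arch=).
     __attribute__ ((target ("no-strict-align")))   mask attribute, negated.
     __attribute__ ((target ("cmodel=small")))      enum attribute.
     __attribute__ ((target ("+crc")))              bare ISA extension, handled
                                                    separately in
                                                    aarch64_process_one_target_attr.  */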
13405
13406 /* Parse ARG_STR which contains the definition of one target attribute.
13407 Show appropriate errors if any or return true if the attribute is valid. */
13408
13409 static bool
13410 aarch64_process_one_target_attr (char *arg_str)
13411 {
13412 bool invert = false;
13413
13414 size_t len = strlen (arg_str);
13415
13416 if (len == 0)
13417 {
13418 error ("malformed %<target()%> pragma or attribute");
13419 return false;
13420 }
13421
13422 char *str_to_check = (char *) alloca (len + 1);
13423 strcpy (str_to_check, arg_str);
13424
13425 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
13426 It is easier to detect and handle it explicitly here rather than going
13427 through the machinery for the rest of the target attributes in this
13428 function. */
13429 if (*str_to_check == '+')
13430 return aarch64_handle_attr_isa_flags (str_to_check);
13431
13432 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
13433 {
13434 invert = true;
13435 str_to_check += 3;
13436 }
13437 char *arg = strchr (str_to_check, '=');
13438
13439 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
13440 and point ARG to "foo". */
13441 if (arg)
13442 {
13443 *arg = '\0';
13444 arg++;
13445 }
13446 const struct aarch64_attribute_info *p_attr;
13447 bool found = false;
13448 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
13449 {
13450 /* If the names don't match up, or the user has given an argument
13451 to an attribute that doesn't accept one, or didn't give an argument
13452 to an attribute that expects one, fail to match. */
13453 if (strcmp (str_to_check, p_attr->name) != 0)
13454 continue;
13455
13456 found = true;
13457 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
13458 || p_attr->attr_type == aarch64_attr_enum;
13459
13460 if (attr_need_arg_p ^ (arg != NULL))
13461 {
13462 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
13463 return false;
13464 }
13465
13466 /* If the name matches but the attribute does not allow "no-" versions
13467 then we can't match. */
13468 if (invert && !p_attr->allow_neg)
13469 {
13470 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
13471 return false;
13472 }
13473
13474 switch (p_attr->attr_type)
13475 {
13476 /* Has a custom handler registered.
13477 For example, cpu=, arch=, tune=. */
13478 case aarch64_attr_custom:
13479 gcc_assert (p_attr->handler);
13480 if (!p_attr->handler (arg))
13481 return false;
13482 break;
13483
13484 /* Either set or unset a boolean option. */
13485 case aarch64_attr_bool:
13486 {
13487 struct cl_decoded_option decoded;
13488
13489 generate_option (p_attr->opt_num, NULL, !invert,
13490 CL_TARGET, &decoded);
13491 aarch64_handle_option (&global_options, &global_options_set,
13492 &decoded, input_location);
13493 break;
13494 }
13495 /* Set or unset a bit in the target_flags. aarch64_handle_option
13496 should know what mask to apply given the option number. */
13497 case aarch64_attr_mask:
13498 {
13499 struct cl_decoded_option decoded;
13500 /* We only need to specify the option number.
13501 aarch64_handle_option will know which mask to apply. */
13502 decoded.opt_index = p_attr->opt_num;
13503 decoded.value = !invert;
13504 aarch64_handle_option (&global_options, &global_options_set,
13505 &decoded, input_location);
13506 break;
13507 }
13508 /* Use the option setting machinery to set an option to an enum. */
13509 case aarch64_attr_enum:
13510 {
13511 gcc_assert (arg);
13512 bool valid;
13513 int value;
13514 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
13515 &value, CL_TARGET);
13516 if (valid)
13517 {
13518 set_option (&global_options, NULL, p_attr->opt_num, value,
13519 NULL, DK_UNSPECIFIED, input_location,
13520 global_dc);
13521 }
13522 else
13523 {
13524 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
13525 }
13526 break;
13527 }
13528 default:
13529 gcc_unreachable ();
13530 }
13531 }
13532
13533 /* If we reached here we either have found an attribute and validated
13534 it or didn't match any. If we matched an attribute but its arguments
13535 were malformed we will have returned false already. */
13536 return found;
13537 }
13538
13539 /* Count how many times the character C appears in
13540 NULL-terminated string STR. */
13541
13542 static unsigned int
13543 num_occurences_in_str (char c, char *str)
13544 {
13545 unsigned int res = 0;
13546 while (*str != '\0')
13547 {
13548 if (*str == c)
13549 res++;
13550
13551 str++;
13552 }
13553
13554 return res;
13555 }
13556
13557 /* Parse the tree in ARGS that contains the target attribute information
13558 and update the global target options space. */
13559
13560 bool
13561 aarch64_process_target_attr (tree args)
13562 {
13563 if (TREE_CODE (args) == TREE_LIST)
13564 {
13565 do
13566 {
13567 tree head = TREE_VALUE (args);
13568 if (head)
13569 {
13570 if (!aarch64_process_target_attr (head))
13571 return false;
13572 }
13573 args = TREE_CHAIN (args);
13574 } while (args);
13575
13576 return true;
13577 }
13578
13579 if (TREE_CODE (args) != STRING_CST)
13580 {
13581 error ("attribute %<target%> argument not a string");
13582 return false;
13583 }
13584
13585 size_t len = strlen (TREE_STRING_POINTER (args));
13586 char *str_to_check = (char *) alloca (len + 1);
13587 strcpy (str_to_check, TREE_STRING_POINTER (args));
13588
13589 if (len == 0)
13590 {
13591 error ("malformed %<target()%> pragma or attribute");
13592 return false;
13593 }
13594
13595 /* Used to catch empty spaces between commas, e.g.
13596 attribute ((target ("attr1,,attr2"))). */
13597 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
13598
13599 /* Handle multiple target attributes separated by ','. */
13600 char *token = strtok_r (str_to_check, ",", &str_to_check);
13601
13602 unsigned int num_attrs = 0;
13603 while (token)
13604 {
13605 num_attrs++;
13606 if (!aarch64_process_one_target_attr (token))
13607 {
13608 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
13609 return false;
13610 }
13611
13612 token = strtok_r (NULL, ",", &str_to_check);
13613 }
13614
13615 if (num_attrs != num_commas + 1)
13616 {
13617 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
13618 return false;
13619 }
13620
13621 return true;
13622 }
13623
13624 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
13625 process attribute ((target ("..."))). */
13626
13627 static bool
13628 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
13629 {
13630 struct cl_target_option cur_target;
13631 bool ret;
13632 tree old_optimize;
13633 tree new_target, new_optimize;
13634 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13635
13636 /* If what we're processing is the current pragma string then the
13637 target option node is already stored in target_option_current_node
13638 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
13639 having to re-parse the string. This is especially useful to keep
13640 arm_neon.h compile times down since that header contains a lot
13641 of intrinsics enclosed in pragmas. */
13642 if (!existing_target && args == current_target_pragma)
13643 {
13644 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
13645 return true;
13646 }
13647 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13648
13649 old_optimize = build_optimization_node (&global_options);
13650 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13651
13652 /* If the function changed the optimization levels as well as setting
13653 target options, start with the optimizations specified. */
13654 if (func_optimize && func_optimize != old_optimize)
13655 cl_optimization_restore (&global_options,
13656 TREE_OPTIMIZATION (func_optimize));
13657
13658 /* Save the current target options to restore at the end. */
13659 cl_target_option_save (&cur_target, &global_options);
13660
13661 /* If fndecl already has some target attributes applied to it, unpack
13662 them so that we add this attribute on top of them, rather than
13663 overwriting them. */
13664 if (existing_target)
13665 {
13666 struct cl_target_option *existing_options
13667 = TREE_TARGET_OPTION (existing_target);
13668
13669 if (existing_options)
13670 cl_target_option_restore (&global_options, existing_options);
13671 }
13672 else
13673 cl_target_option_restore (&global_options,
13674 TREE_TARGET_OPTION (target_option_current_node));
13675
13676 ret = aarch64_process_target_attr (args);
13677
13678 /* Set up any additional state. */
13679 if (ret)
13680 {
13681 aarch64_override_options_internal (&global_options);
13682 /* Initialize SIMD builtins if we haven't already.
13683 Set current_target_pragma to NULL for the duration so that
13684 the builtin initialization code doesn't try to tag the functions
13685 being built with the attributes specified by any current pragma, thus
13686 going into an infinite recursion. */
13687 if (TARGET_SIMD)
13688 {
13689 tree saved_current_target_pragma = current_target_pragma;
13690 current_target_pragma = NULL;
13691 aarch64_init_simd_builtins ();
13692 current_target_pragma = saved_current_target_pragma;
13693 }
13694 new_target = build_target_option_node (&global_options);
13695 }
13696 else
13697 new_target = NULL;
13698
13699 new_optimize = build_optimization_node (&global_options);
13700
13701 if (fndecl && ret)
13702 {
13703 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
13704
13705 if (old_optimize != new_optimize)
13706 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
13707 }
13708
13709 cl_target_option_restore (&global_options, &cur_target);
13710
13711 if (old_optimize != new_optimize)
13712 cl_optimization_restore (&global_options,
13713 TREE_OPTIMIZATION (old_optimize));
13714 return ret;
13715 }
13716
13717 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
13718 tri-bool options (yes, no, don't care) and the default value is
13719 DEF, determine whether to reject inlining. */
13720
13721 static bool
13722 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
13723 int dont_care, int def)
13724 {
13725 /* If the callee doesn't care, always allow inlining. */
13726 if (callee == dont_care)
13727 return true;
13728
13729 /* If the caller doesn't care, always allow inlining. */
13730 if (caller == dont_care)
13731 return true;
13732
13733 /* Otherwise, allow inlining if either the callee and caller values
13734 agree, or if the callee is using the default value. */
13735 return (callee == caller || callee == def);
13736 }
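/* For example, with DONT_CARE == 2 and DEF == 1 (the values used for the
   -momit-leaf-frame-pointer check below): caller=1/callee=2 and
   caller=0/callee=1 both allow inlining, whereas caller=1/callee=0 rejects
   it, because the callee explicitly asked for the non-default setting.  */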
13737
13738 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
13739 to inline CALLEE into CALLER based on target-specific info.
13740 Make sure that the caller and callee have compatible architectural
13741 features. Then go through the other possible target attributes
13742 and see if they can block inlining. Try not to reject always_inline
13743 callees unless they are incompatible architecturally. */
13744
13745 static bool
13746 aarch64_can_inline_p (tree caller, tree callee)
13747 {
13748 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
13749 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
13750
13751 struct cl_target_option *caller_opts
13752 = TREE_TARGET_OPTION (caller_tree ? caller_tree
13753 : target_option_default_node);
13754
13755 struct cl_target_option *callee_opts
13756 = TREE_TARGET_OPTION (callee_tree ? callee_tree
13757 : target_option_default_node);
13758
13759 /* Callee's ISA flags should be a subset of the caller's. */
13760 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
13761 != callee_opts->x_aarch64_isa_flags)
13762 return false;
13763
13764 /* Allow non-strict aligned functions inlining into strict
13765 aligned ones. */
13766 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
13767 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
13768 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
13769 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
13770 return false;
13771
13772 bool always_inline = lookup_attribute ("always_inline",
13773 DECL_ATTRIBUTES (callee));
13774
13775 /* If the architectural features match up and the callee is always_inline
13776 then the other attributes don't matter. */
13777 if (always_inline)
13778 return true;
13779
13780 if (caller_opts->x_aarch64_cmodel_var
13781 != callee_opts->x_aarch64_cmodel_var)
13782 return false;
13783
13784 if (caller_opts->x_aarch64_tls_dialect
13785 != callee_opts->x_aarch64_tls_dialect)
13786 return false;
13787
13788 /* Honour explicit requests to workaround errata. */
13789 if (!aarch64_tribools_ok_for_inlining_p (
13790 caller_opts->x_aarch64_fix_a53_err835769,
13791 callee_opts->x_aarch64_fix_a53_err835769,
13792 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
13793 return false;
13794
13795 if (!aarch64_tribools_ok_for_inlining_p (
13796 caller_opts->x_aarch64_fix_a53_err843419,
13797 callee_opts->x_aarch64_fix_a53_err843419,
13798 2, TARGET_FIX_ERR_A53_843419))
13799 return false;
13800
13801 /* If the user explicitly specified -momit-leaf-frame-pointer for the
13802 caller and callee and they don't match up, reject inlining. */
13803 if (!aarch64_tribools_ok_for_inlining_p (
13804 caller_opts->x_flag_omit_leaf_frame_pointer,
13805 callee_opts->x_flag_omit_leaf_frame_pointer,
13806 2, 1))
13807 return false;
13808
13809 /* If the callee has specific tuning overrides, respect them. */
13810 if (callee_opts->x_aarch64_override_tune_string != NULL
13811 && caller_opts->x_aarch64_override_tune_string == NULL)
13812 return false;
13813
13814 /* If the user specified tuning override strings for the
13815 caller and callee and they don't match up, reject inlining.
13816 We just do a string compare here, we don't analyze the meaning
13817 of the string, as it would be too costly for little gain. */
13818 if (callee_opts->x_aarch64_override_tune_string
13819 && caller_opts->x_aarch64_override_tune_string
13820 && (strcmp (callee_opts->x_aarch64_override_tune_string,
13821 caller_opts->x_aarch64_override_tune_string) != 0))
13822 return false;
13823
13824 return true;
13825 }
13826
13827 /* Return true if SYMBOL_REF X binds locally. */
13828
13829 static bool
13830 aarch64_symbol_binds_local_p (const_rtx x)
13831 {
13832 return (SYMBOL_REF_DECL (x)
13833 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
13834 : SYMBOL_REF_LOCAL_P (x));
13835 }
13836
13837 /* Return true if SYMBOL_REF X is thread local. */
13838 static bool
13839 aarch64_tls_symbol_p (rtx x)
13840 {
13841 if (! TARGET_HAVE_TLS)
13842 return false;
13843
13844 if (GET_CODE (x) != SYMBOL_REF)
13845 return false;
13846
13847 return SYMBOL_REF_TLS_MODEL (x) != 0;
13848 }
13849
13850 /* Classify a TLS symbol into one of the TLS kinds. */
13851 enum aarch64_symbol_type
13852 aarch64_classify_tls_symbol (rtx x)
13853 {
13854 enum tls_model tls_kind = tls_symbolic_operand_type (x);
13855
13856 switch (tls_kind)
13857 {
13858 case TLS_MODEL_GLOBAL_DYNAMIC:
13859 case TLS_MODEL_LOCAL_DYNAMIC:
13860 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
13861
13862 case TLS_MODEL_INITIAL_EXEC:
13863 switch (aarch64_cmodel)
13864 {
13865 case AARCH64_CMODEL_TINY:
13866 case AARCH64_CMODEL_TINY_PIC:
13867 return SYMBOL_TINY_TLSIE;
13868 default:
13869 return SYMBOL_SMALL_TLSIE;
13870 }
13871
13872 case TLS_MODEL_LOCAL_EXEC:
13873 if (aarch64_tls_size == 12)
13874 return SYMBOL_TLSLE12;
13875 else if (aarch64_tls_size == 24)
13876 return SYMBOL_TLSLE24;
13877 else if (aarch64_tls_size == 32)
13878 return SYMBOL_TLSLE32;
13879 else if (aarch64_tls_size == 48)
13880 return SYMBOL_TLSLE48;
13881 else
13882 gcc_unreachable ();
13883
13884 case TLS_MODEL_EMULATED:
13885 case TLS_MODEL_NONE:
13886 return SYMBOL_FORCE_TO_MEM;
13887
13888 default:
13889 gcc_unreachable ();
13890 }
13891 }
13892
13893 /* Return the correct method for accessing X + OFFSET, where X is either
13894 a SYMBOL_REF or LABEL_REF. */
13895
13896 enum aarch64_symbol_type
13897 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
13898 {
13899 if (GET_CODE (x) == LABEL_REF)
13900 {
13901 switch (aarch64_cmodel)
13902 {
13903 case AARCH64_CMODEL_LARGE:
13904 return SYMBOL_FORCE_TO_MEM;
13905
13906 case AARCH64_CMODEL_TINY_PIC:
13907 case AARCH64_CMODEL_TINY:
13908 return SYMBOL_TINY_ABSOLUTE;
13909
13910 case AARCH64_CMODEL_SMALL_SPIC:
13911 case AARCH64_CMODEL_SMALL_PIC:
13912 case AARCH64_CMODEL_SMALL:
13913 return SYMBOL_SMALL_ABSOLUTE;
13914
13915 default:
13916 gcc_unreachable ();
13917 }
13918 }
13919
13920 if (GET_CODE (x) == SYMBOL_REF)
13921 {
13922 if (aarch64_tls_symbol_p (x))
13923 return aarch64_classify_tls_symbol (x);
13924
13925 switch (aarch64_cmodel)
13926 {
13927 case AARCH64_CMODEL_TINY:
13928 /* When we retrieve symbol + offset address, we have to make sure
13929 the offset does not cause overflow of the final address. But
13930 we have no way of knowing the address of symbol at compile time
13931 so we can't accurately say if the distance between the PC and
13932 symbol + offset is outside the addressable range of +/-1M in the
13933 TINY code model.  So we rely on images not being greater than 1M
13934 and cap the offset at 1M; anything beyond that will have to be
13935 loaded using an alternative mechanism.  Furthermore, if the
13936 symbol is a weak reference to something that isn't known to
13937 resolve to a symbol in this module, then force to memory. */
13938 if ((SYMBOL_REF_WEAK (x)
13939 && !aarch64_symbol_binds_local_p (x))
13940 || !IN_RANGE (offset, -1048575, 1048575))
13941 return SYMBOL_FORCE_TO_MEM;
13942 return SYMBOL_TINY_ABSOLUTE;
13943
13944 case AARCH64_CMODEL_SMALL:
13945 /* Same reasoning as the tiny code model, but the offset cap here is
13946 4G. */
13947 if ((SYMBOL_REF_WEAK (x)
13948 && !aarch64_symbol_binds_local_p (x))
13949 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
13950 HOST_WIDE_INT_C (4294967264)))
13951 return SYMBOL_FORCE_TO_MEM;
13952 return SYMBOL_SMALL_ABSOLUTE;
13953
13954 case AARCH64_CMODEL_TINY_PIC:
13955 if (!aarch64_symbol_binds_local_p (x))
13956 return SYMBOL_TINY_GOT;
13957 return SYMBOL_TINY_ABSOLUTE;
13958
13959 case AARCH64_CMODEL_SMALL_SPIC:
13960 case AARCH64_CMODEL_SMALL_PIC:
13961 if (!aarch64_symbol_binds_local_p (x))
13962 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
13963 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
13964 return SYMBOL_SMALL_ABSOLUTE;
13965
13966 case AARCH64_CMODEL_LARGE:
13967 /* This is alright even in PIC code as the constant
13968 pool reference is always PC relative and within
13969 the same translation unit. */
13970 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
13971 return SYMBOL_SMALL_ABSOLUTE;
13972 else
13973 return SYMBOL_FORCE_TO_MEM;
13974
13975 default:
13976 gcc_unreachable ();
13977 }
13978 }
13979
13980 /* By default push everything into the constant pool. */
13981 return SYMBOL_FORCE_TO_MEM;
13982 }
13983
13984 bool
13985 aarch64_constant_address_p (rtx x)
13986 {
13987 return (CONSTANT_P (x) && memory_address_p (DImode, x));
13988 }
13989
13990 bool
13991 aarch64_legitimate_pic_operand_p (rtx x)
13992 {
13993 if (GET_CODE (x) == SYMBOL_REF
13994 || (GET_CODE (x) == CONST
13995 && GET_CODE (XEXP (x, 0)) == PLUS
13996 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
13997 return false;
13998
13999 return true;
14000 }
14001
14002 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
14003 that should be rematerialized rather than spilled. */
14004
14005 static bool
14006 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
14007 {
14008 /* Support CSE and rematerialization of common constants. */
14009 if (CONST_INT_P (x)
14010 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
14011 || GET_CODE (x) == CONST_VECTOR)
14012 return true;
14013
14014 /* Do not allow vector struct mode constants for Advanced SIMD.
14015 We could support 0 and -1 easily, but they need support in
14016 aarch64-simd.md. */
14017 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14018 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
14019 return false;
14020
14021 /* Only accept variable-length vector constants if they can be
14022 handled directly.
14023
14024 ??? It would be possible to handle rematerialization of other
14025 constants via secondary reloads. */
14026 if (vec_flags & VEC_ANY_SVE)
14027 return aarch64_simd_valid_immediate (x, NULL);
14028
14029 if (GET_CODE (x) == HIGH)
14030 x = XEXP (x, 0);
14031
14032 /* Accept polynomial constants that can be calculated by using the
14033 destination of a move as the sole temporary. Constants that
14034 require a second temporary cannot be rematerialized (they can't be
14035 forced to memory and also aren't legitimate constants). */
14036 poly_int64 offset;
14037 if (poly_int_rtx_p (x, &offset))
14038 return aarch64_offset_temporaries (false, offset) <= 1;
14039
14040 /* If an offset is being added to something else, we need to allow the
14041 base to be moved into the destination register, meaning that there
14042 are no free temporaries for the offset. */
14043 x = strip_offset (x, &offset);
14044 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
14045 return false;
14046
14047 /* Do not allow const (plus (anchor_symbol, const_int)). */
14048 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
14049 return false;
14050
14051 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
14052 so spilling them is better than rematerialization. */
14053 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
14054 return true;
14055
14056 /* Label references are always constant. */
14057 if (GET_CODE (x) == LABEL_REF)
14058 return true;
14059
14060 return false;
14061 }
14062
14063 rtx
14064 aarch64_load_tp (rtx target)
14065 {
14066 if (!target
14067 || GET_MODE (target) != Pmode
14068 || !register_operand (target, Pmode))
14069 target = gen_reg_rtx (Pmode);
14070
14071 /* Can return in any reg. */
14072 emit_insn (gen_aarch64_load_tp_hard (target));
14073 return target;
14074 }
14075
14076 /* On AAPCS systems, this is the "struct __va_list". */
14077 static GTY(()) tree va_list_type;
14078
14079 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
14080 Return the type to use as __builtin_va_list.
14081
14082 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
14083
14084 struct __va_list
14085 {
14086 void *__stack;
14087 void *__gr_top;
14088 void *__vr_top;
14089 int __gr_offs;
14090 int __vr_offs;
14091 }; */
14092
14093 static tree
14094 aarch64_build_builtin_va_list (void)
14095 {
14096 tree va_list_name;
14097 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14098
14099 /* Create the type. */
14100 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
14101 /* Give it the required name. */
14102 va_list_name = build_decl (BUILTINS_LOCATION,
14103 TYPE_DECL,
14104 get_identifier ("__va_list"),
14105 va_list_type);
14106 DECL_ARTIFICIAL (va_list_name) = 1;
14107 TYPE_NAME (va_list_type) = va_list_name;
14108 TYPE_STUB_DECL (va_list_type) = va_list_name;
14109
14110 /* Create the fields. */
14111 f_stack = build_decl (BUILTINS_LOCATION,
14112 FIELD_DECL, get_identifier ("__stack"),
14113 ptr_type_node);
14114 f_grtop = build_decl (BUILTINS_LOCATION,
14115 FIELD_DECL, get_identifier ("__gr_top"),
14116 ptr_type_node);
14117 f_vrtop = build_decl (BUILTINS_LOCATION,
14118 FIELD_DECL, get_identifier ("__vr_top"),
14119 ptr_type_node);
14120 f_groff = build_decl (BUILTINS_LOCATION,
14121 FIELD_DECL, get_identifier ("__gr_offs"),
14122 integer_type_node);
14123 f_vroff = build_decl (BUILTINS_LOCATION,
14124 FIELD_DECL, get_identifier ("__vr_offs"),
14125 integer_type_node);
14126
14127 /* Tell tree-stdarg pass about our internal offset fields.
14128 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
14129 purposes, to identify whether the code is updating the va_list internal
14130 offset fields in an irregular way. */
14131 va_list_gpr_counter_field = f_groff;
14132 va_list_fpr_counter_field = f_vroff;
14133
14134 DECL_ARTIFICIAL (f_stack) = 1;
14135 DECL_ARTIFICIAL (f_grtop) = 1;
14136 DECL_ARTIFICIAL (f_vrtop) = 1;
14137 DECL_ARTIFICIAL (f_groff) = 1;
14138 DECL_ARTIFICIAL (f_vroff) = 1;
14139
14140 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
14141 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
14142 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
14143 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
14144 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
14145
14146 TYPE_FIELDS (va_list_type) = f_stack;
14147 DECL_CHAIN (f_stack) = f_grtop;
14148 DECL_CHAIN (f_grtop) = f_vrtop;
14149 DECL_CHAIN (f_vrtop) = f_groff;
14150 DECL_CHAIN (f_groff) = f_vroff;
14151
14152 /* Compute its layout. */
14153 layout_type (va_list_type);
14154
14155 return va_list_type;
14156 }
14157
14158 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
14159 static void
14160 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
14161 {
14162 const CUMULATIVE_ARGS *cum;
14163 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14164 tree stack, grtop, vrtop, groff, vroff;
14165 tree t;
14166 int gr_save_area_size = cfun->va_list_gpr_size;
14167 int vr_save_area_size = cfun->va_list_fpr_size;
14168 int vr_offset;
14169
14170 cum = &crtl->args.info;
14171 if (cfun->va_list_gpr_size)
14172 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
14173 cfun->va_list_gpr_size);
14174 if (cfun->va_list_fpr_size)
14175 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
14176 * UNITS_PER_VREG, cfun->va_list_fpr_size);
14177
14178 if (!TARGET_FLOAT)
14179 {
14180 gcc_assert (cum->aapcs_nvrn == 0);
14181 vr_save_area_size = 0;
14182 }
14183
14184 f_stack = TYPE_FIELDS (va_list_type_node);
14185 f_grtop = DECL_CHAIN (f_stack);
14186 f_vrtop = DECL_CHAIN (f_grtop);
14187 f_groff = DECL_CHAIN (f_vrtop);
14188 f_vroff = DECL_CHAIN (f_groff);
14189
14190 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
14191 NULL_TREE);
14192 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
14193 NULL_TREE);
14194 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
14195 NULL_TREE);
14196 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
14197 NULL_TREE);
14198 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
14199 NULL_TREE);
14200
14201 /* Emit code to initialize STACK, which points to the next varargs stack
14202 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
14203 by named arguments. STACK is 8-byte aligned. */
14204 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
14205 if (cum->aapcs_stack_size > 0)
14206 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
14207 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
14208 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14209
14210 /* Emit code to initialize GRTOP, the top of the GR save area.
14211 virtual_incoming_args_rtx should have been 16 byte aligned. */
14212 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
14213 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
14214 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14215
14216 /* Emit code to initialize VRTOP, the top of the VR save area.
14217 This address is gr_save_area_bytes below GRTOP, rounded
14218 down to the next 16-byte boundary. */
14219 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
14220 vr_offset = ROUND_UP (gr_save_area_size,
14221 STACK_BOUNDARY / BITS_PER_UNIT);
14222
14223 if (vr_offset)
14224 t = fold_build_pointer_plus_hwi (t, -vr_offset);
14225 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
14226 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14227
14228 /* Emit code to initialize GROFF, the offset from GRTOP of the
14229 next GPR argument. */
14230 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
14231 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
14232 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14233
14234 /* Likewise emit code to initialize VROFF, the offset from VRTOP
14235 of the next VR argument. */
14236 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
14237 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
14238 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14239 }
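/* A worked example of the expansion above (illustrative only; it assumes
   the usual AAPCS64 values NUM_ARG_REGS == 8, NUM_FP_ARG_REGS == 8,
   UNITS_PER_WORD == 8 and UNITS_PER_VREG == 16, full-sized save areas,
   and TARGET_FLOAT):

     int f (int a, int b, ...);   // named arguments use only x0 and x1

   va_start then initializes the __va_list fields as:

     __stack   = virtual_incoming_args_rtx (no named stack arguments)
     __gr_top  = virtual_incoming_args_rtx
     __vr_top  = __gr_top - ROUND_UP ((8 - 2) * 8, 16) = __gr_top - 48
     __gr_offs = -(8 - 2) * 8  = -48
     __vr_offs = -(8 - 0) * 16 = -128  */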
14240
14241 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
14242
14243 static tree
14244 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
14245 gimple_seq *post_p ATTRIBUTE_UNUSED)
14246 {
14247 tree addr;
14248 bool indirect_p;
14249 bool is_ha; /* is HFA or HVA. */
14250 bool dw_align; /* double-word align. */
14251 machine_mode ag_mode = VOIDmode;
14252 int nregs;
14253 machine_mode mode;
14254
14255 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14256 tree stack, f_top, f_off, off, arg, roundup, on_stack;
14257 HOST_WIDE_INT size, rsize, adjust, align;
14258 tree t, u, cond1, cond2;
14259
14260 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
14261 if (indirect_p)
14262 type = build_pointer_type (type);
14263
14264 mode = TYPE_MODE (type);
14265
14266 f_stack = TYPE_FIELDS (va_list_type_node);
14267 f_grtop = DECL_CHAIN (f_stack);
14268 f_vrtop = DECL_CHAIN (f_grtop);
14269 f_groff = DECL_CHAIN (f_vrtop);
14270 f_vroff = DECL_CHAIN (f_groff);
14271
14272 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
14273 f_stack, NULL_TREE);
14274 size = int_size_in_bytes (type);
14275
14276 bool abi_break;
14277 align
14278 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
14279
14280 dw_align = false;
14281 adjust = 0;
14282 if (aarch64_vfp_is_call_or_return_candidate (mode,
14283 type,
14284 &ag_mode,
14285 &nregs,
14286 &is_ha))
14287 {
14288 /* No frontends can create types with variable-sized modes, so we
14289 shouldn't be asked to pass or return them. */
14290 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
14291
14292 /* TYPE passed in fp/simd registers. */
14293 if (!TARGET_FLOAT)
14294 aarch64_err_no_fpadvsimd (mode);
14295
14296 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
14297 unshare_expr (valist), f_vrtop, NULL_TREE);
14298 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
14299 unshare_expr (valist), f_vroff, NULL_TREE);
14300
14301 rsize = nregs * UNITS_PER_VREG;
14302
14303 if (is_ha)
14304 {
14305 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
14306 adjust = UNITS_PER_VREG - ag_size;
14307 }
14308 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14309 && size < UNITS_PER_VREG)
14310 {
14311 adjust = UNITS_PER_VREG - size;
14312 }
14313 }
14314 else
14315 {
14316 /* TYPE passed in general registers. */
14317 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
14318 unshare_expr (valist), f_grtop, NULL_TREE);
14319 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
14320 unshare_expr (valist), f_groff, NULL_TREE);
14321 rsize = ROUND_UP (size, UNITS_PER_WORD);
14322 nregs = rsize / UNITS_PER_WORD;
14323
14324 if (align > 8)
14325 {
14326 if (abi_break && warn_psabi)
14327 inform (input_location, "parameter passing for argument of type "
14328 "%qT changed in GCC 9.1", type);
14329 dw_align = true;
14330 }
14331
14332 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14333 && size < UNITS_PER_WORD)
14334 {
14335 adjust = UNITS_PER_WORD - size;
14336 }
14337 }
14338
14339 /* Get a local temporary for the field value. */
14340 off = get_initialized_tmp_var (f_off, pre_p, NULL);
14341
14342 /* Emit code to branch if off >= 0. */
14343 t = build2 (GE_EXPR, boolean_type_node, off,
14344 build_int_cst (TREE_TYPE (off), 0));
14345 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
14346
14347 if (dw_align)
14348 {
14349 /* Emit: offs = (offs + 15) & -16. */
14350 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14351 build_int_cst (TREE_TYPE (off), 15));
14352 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
14353 build_int_cst (TREE_TYPE (off), -16));
14354 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
14355 }
14356 else
14357 roundup = NULL;
14358
14359 /* Update ap.__[g|v]r_offs */
14360 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14361 build_int_cst (TREE_TYPE (off), rsize));
14362 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
14363
14364 /* Chain the expressions together. */
14365 if (roundup)
14366 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14367
14368 /* [cond2] if (ap.__[g|v]r_offs > 0) */
14369 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
14370 build_int_cst (TREE_TYPE (f_off), 0));
14371 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
14372
14373 /* Chain up: make sure the assignment happens before the use. */
14374 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
14375 COND_EXPR_ELSE (cond1) = t;
14376
14377 /* Prepare the trees handling the argument that is passed on the stack;
14378 the top-level node is stored in ON_STACK. */
14379 arg = get_initialized_tmp_var (stack, pre_p, NULL);
14380 if (align > 8)
14381 {
14382 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
14383 t = fold_build_pointer_plus_hwi (arg, 15);
14384 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14385 build_int_cst (TREE_TYPE (t), -16));
14386 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
14387 }
14388 else
14389 roundup = NULL;
14390 /* Advance ap.__stack */
14391 t = fold_build_pointer_plus_hwi (arg, size + 7);
14392 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14393 build_int_cst (TREE_TYPE (t), -8));
14394 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
14395 /* Chain the roundup and the advance together. */
14396 if (roundup)
14397 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14398 /* Chain up with ARG. */
14399 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
14400 /* Big-endianness related address adjustment. */
14401 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14402 && size < UNITS_PER_WORD)
14403 {
14404 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
14405 size_int (UNITS_PER_WORD - size));
14406 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
14407 }
14408
14409 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
14410 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
14411
14412 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
14413 t = off;
14414 if (adjust)
14415 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
14416 build_int_cst (TREE_TYPE (off), adjust));
14417
14418 t = fold_convert (sizetype, t);
14419 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
14420
14421 if (is_ha)
14422 {
14423 /* type ha; // treat as "struct {ftype field[n];}"
14424 ... [computing offs]
14425 for (i = 0; i < nregs; ++i, offs += 16)
14426 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
14427 return ha; */
14428 int i;
14429 tree tmp_ha, field_t, field_ptr_t;
14430
14431 /* Declare a local variable. */
14432 tmp_ha = create_tmp_var_raw (type, "ha");
14433 gimple_add_tmp_var (tmp_ha);
14434
14435 /* Establish the base type. */
14436 switch (ag_mode)
14437 {
14438 case E_SFmode:
14439 field_t = float_type_node;
14440 field_ptr_t = float_ptr_type_node;
14441 break;
14442 case E_DFmode:
14443 field_t = double_type_node;
14444 field_ptr_t = double_ptr_type_node;
14445 break;
14446 case E_TFmode:
14447 field_t = long_double_type_node;
14448 field_ptr_t = long_double_ptr_type_node;
14449 break;
14450 case E_HFmode:
14451 field_t = aarch64_fp16_type_node;
14452 field_ptr_t = aarch64_fp16_ptr_type_node;
14453 break;
14454 case E_V2SImode:
14455 case E_V4SImode:
14456 {
14457 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
14458 field_t = build_vector_type_for_mode (innertype, ag_mode);
14459 field_ptr_t = build_pointer_type (field_t);
14460 }
14461 break;
14462 default:
14463 gcc_assert (0);
14464 }
14465
14466 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
14467 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
14468 addr = t;
14469 t = fold_convert (field_ptr_t, addr);
14470 t = build2 (MODIFY_EXPR, field_t,
14471 build1 (INDIRECT_REF, field_t, tmp_ha),
14472 build1 (INDIRECT_REF, field_t, t));
14473
14474 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
14475 for (i = 1; i < nregs; ++i)
14476 {
14477 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
14478 u = fold_convert (field_ptr_t, addr);
14479 u = build2 (MODIFY_EXPR, field_t,
14480 build2 (MEM_REF, field_t, tmp_ha,
14481 build_int_cst (field_ptr_t,
14482 (i *
14483 int_size_in_bytes (field_t)))),
14484 build1 (INDIRECT_REF, field_t, u));
14485 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
14486 }
14487
14488 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
14489 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
14490 }
14491
14492 COND_EXPR_ELSE (cond2) = t;
14493 addr = fold_convert (build_pointer_type (type), cond1);
14494 addr = build_va_arg_indirect_ref (addr);
14495
14496 if (indirect_p)
14497 addr = build_va_arg_indirect_ref (addr);
14498
14499 return addr;
14500 }
14501
14502 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
14503
14504 static void
14505 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
14506 tree type, int *pretend_size ATTRIBUTE_UNUSED,
14507 int no_rtl)
14508 {
14509 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
14510 CUMULATIVE_ARGS local_cum;
14511 int gr_saved = cfun->va_list_gpr_size;
14512 int vr_saved = cfun->va_list_fpr_size;
14513
14514 /* The caller has advanced CUM up to, but not beyond, the last named
14515 argument. Advance a local copy of CUM past the last "real" named
14516 argument, to find out how many registers are left over. */
14517 local_cum = *cum;
14518 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
14519
14520 /* Find out how many registers we need to save.
14521 Honor the tree-stdarg analysis results. */
14522 if (cfun->va_list_gpr_size)
14523 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
14524 cfun->va_list_gpr_size / UNITS_PER_WORD);
14525 if (cfun->va_list_fpr_size)
14526 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
14527 cfun->va_list_fpr_size / UNITS_PER_VREG);
14528
14529 if (!TARGET_FLOAT)
14530 {
14531 gcc_assert (local_cum.aapcs_nvrn == 0);
14532 vr_saved = 0;
14533 }
14534
14535 if (!no_rtl)
14536 {
14537 if (gr_saved > 0)
14538 {
14539 rtx ptr, mem;
14540
14541 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
14542 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
14543 - gr_saved * UNITS_PER_WORD);
14544 mem = gen_frame_mem (BLKmode, ptr);
14545 set_mem_alias_set (mem, get_varargs_alias_set ());
14546
14547 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
14548 mem, gr_saved);
14549 }
14550 if (vr_saved > 0)
14551 {
14552 /* We can't use move_block_from_reg, because it will use
14553 the wrong mode, storing D regs only. */
14554 machine_mode mode = TImode;
14555 int off, i, vr_start;
14556
14557 /* Set OFF to the offset from virtual_incoming_args_rtx of
14558 the first vector register. The VR save area lies below
14559 the GR one, and is aligned to 16 bytes. */
14560 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
14561 STACK_BOUNDARY / BITS_PER_UNIT);
14562 off -= vr_saved * UNITS_PER_VREG;
14563
14564 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
14565 for (i = 0; i < vr_saved; ++i)
14566 {
14567 rtx ptr, mem;
14568
14569 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
14570 mem = gen_frame_mem (mode, ptr);
14571 set_mem_alias_set (mem, get_varargs_alias_set ());
14572 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
14573 off += UNITS_PER_VREG;
14574 }
14575 }
14576 }
14577
14578 /* We don't save the size into *PRETEND_SIZE because we want to avoid
14579 any complication of having crtl->args.pretend_args_size changed. */
14580 cfun->machine->frame.saved_varargs_size
14581 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
14582 STACK_BOUNDARY / BITS_PER_UNIT)
14583 + vr_saved * UNITS_PER_VREG);
14584 }
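/* Continuing the illustrative example above (x0 and x1 named, no named
   FP arguments): gr_saved == 6, so x2-x7 are dumped into the 48 bytes
   ending at virtual_incoming_args_rtx, and vr_saved == 8, so q0-q7 are
   stored (as TImode values) into the 128 bytes immediately below that,
   occupying offsets [-176, -48).  frame.saved_varargs_size is therefore
   48 + 128 == 176.  */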
14585
14586 static void
14587 aarch64_conditional_register_usage (void)
14588 {
14589 int i;
14590 if (!TARGET_FLOAT)
14591 {
14592 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
14593 {
14594 fixed_regs[i] = 1;
14595 call_used_regs[i] = 1;
14596 }
14597 }
14598 if (!TARGET_SVE)
14599 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
14600 {
14601 fixed_regs[i] = 1;
14602 call_used_regs[i] = 1;
14603 }
14604
14605 /* When tracking speculation, we need a couple of call-clobbered registers
14606 to track the speculation state. It would be nice to just use
14607 IP0 and IP1, but currently there are numerous places that just
14608 assume these registers are free for other uses (eg pointer
14609 authentication). */
14610 if (aarch64_track_speculation)
14611 {
14612 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
14613 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
14614 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14615 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14616 }
14617 }
14618
14619 /* Walk down the type tree of TYPE counting consecutive base elements.
14620 If *MODEP is VOIDmode, then set it to the first valid floating point
14621 type. If a non-floating point type is found, or if a floating point
14622 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
14623 otherwise return the count in the sub-tree. */
14624 static int
14625 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
14626 {
14627 machine_mode mode;
14628 HOST_WIDE_INT size;
14629
14630 switch (TREE_CODE (type))
14631 {
14632 case REAL_TYPE:
14633 mode = TYPE_MODE (type);
14634 if (mode != DFmode && mode != SFmode
14635 && mode != TFmode && mode != HFmode)
14636 return -1;
14637
14638 if (*modep == VOIDmode)
14639 *modep = mode;
14640
14641 if (*modep == mode)
14642 return 1;
14643
14644 break;
14645
14646 case COMPLEX_TYPE:
14647 mode = TYPE_MODE (TREE_TYPE (type));
14648 if (mode != DFmode && mode != SFmode
14649 && mode != TFmode && mode != HFmode)
14650 return -1;
14651
14652 if (*modep == VOIDmode)
14653 *modep = mode;
14654
14655 if (*modep == mode)
14656 return 2;
14657
14658 break;
14659
14660 case VECTOR_TYPE:
14661 /* Use V2SImode and V4SImode as representatives of all 64-bit
14662 and 128-bit vector types. */
14663 size = int_size_in_bytes (type);
14664 switch (size)
14665 {
14666 case 8:
14667 mode = V2SImode;
14668 break;
14669 case 16:
14670 mode = V4SImode;
14671 break;
14672 default:
14673 return -1;
14674 }
14675
14676 if (*modep == VOIDmode)
14677 *modep = mode;
14678
14679 /* Vector modes are considered to be opaque: two vectors are
14680 equivalent for the purposes of being homogeneous aggregates
14681 if they are the same size. */
14682 if (*modep == mode)
14683 return 1;
14684
14685 break;
14686
14687 case ARRAY_TYPE:
14688 {
14689 int count;
14690 tree index = TYPE_DOMAIN (type);
14691
14692 /* Can't handle incomplete types nor sizes that are not
14693 fixed. */
14694 if (!COMPLETE_TYPE_P (type)
14695 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14696 return -1;
14697
14698 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
14699 if (count == -1
14700 || !index
14701 || !TYPE_MAX_VALUE (index)
14702 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
14703 || !TYPE_MIN_VALUE (index)
14704 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
14705 || count < 0)
14706 return -1;
14707
14708 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
14709 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
14710
14711 /* There must be no padding. */
14712 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14713 count * GET_MODE_BITSIZE (*modep)))
14714 return -1;
14715
14716 return count;
14717 }
14718
14719 case RECORD_TYPE:
14720 {
14721 int count = 0;
14722 int sub_count;
14723 tree field;
14724
14725 /* Can't handle incomplete types nor sizes that are not
14726 fixed. */
14727 if (!COMPLETE_TYPE_P (type)
14728 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14729 return -1;
14730
14731 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14732 {
14733 if (TREE_CODE (field) != FIELD_DECL)
14734 continue;
14735
14736 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14737 if (sub_count < 0)
14738 return -1;
14739 count += sub_count;
14740 }
14741
14742 /* There must be no padding. */
14743 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14744 count * GET_MODE_BITSIZE (*modep)))
14745 return -1;
14746
14747 return count;
14748 }
14749
14750 case UNION_TYPE:
14751 case QUAL_UNION_TYPE:
14752 {
14753 /* These aren't very interesting except in a degenerate case. */
14754 int count = 0;
14755 int sub_count;
14756 tree field;
14757
14758 /* Can't handle incomplete types nor sizes that are not
14759 fixed. */
14760 if (!COMPLETE_TYPE_P (type)
14761 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14762 return -1;
14763
14764 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14765 {
14766 if (TREE_CODE (field) != FIELD_DECL)
14767 continue;
14768
14769 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14770 if (sub_count < 0)
14771 return -1;
14772 count = count > sub_count ? count : sub_count;
14773 }
14774
14775 /* There must be no padding. */
14776 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14777 count * GET_MODE_BITSIZE (*modep)))
14778 return -1;
14779
14780 return count;
14781 }
14782
14783 default:
14784 break;
14785 }
14786
14787 return -1;
14788 }
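/* Some illustrative results of the walk above:

     struct { float x, y, z; }          -> 3, *MODEP == SFmode
     struct { double r; double i[3]; }  -> 4, *MODEP == DFmode
     struct { _Complex double c; }      -> 2, *MODEP == DFmode
     struct { float f; double d; }      -> -1 (mixed base types)
     struct { float f[5]; }             -> 5, which the caller then rejects
                                           because it exceeds HA_MAX_NUM_FLDS.  */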
14789
14790 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
14791 type as described in AAPCS64 \S 4.1.2.
14792
14793 See the comment above aarch64_composite_type_p for the notes on MODE. */
14794
14795 static bool
14796 aarch64_short_vector_p (const_tree type,
14797 machine_mode mode)
14798 {
14799 poly_int64 size = -1;
14800
14801 if (type && TREE_CODE (type) == VECTOR_TYPE)
14802 size = int_size_in_bytes (type);
14803 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
14804 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
14805 size = GET_MODE_SIZE (mode);
14806
14807 return known_eq (size, 8) || known_eq (size, 16);
14808 }
14809
14810 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
14811 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
14812 array types. The C99 floating-point complex types are also considered
14813 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
14814 types, which are GCC extensions and out of the scope of AAPCS64, are
14815 treated as composite types here as well.
14816
14817 Note that MODE itself is not sufficient in determining whether a type
14818 is such a composite type or not. This is because
14819 stor-layout.c:compute_record_mode may have already changed the MODE
14820 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
14821 structure with only one field may have its MODE set to the mode of the
14822 field. Also an integer mode whose size matches the size of the
14823 RECORD_TYPE type may be used to substitute the original mode
14824 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
14825 solely relied on. */
14826
14827 static bool
14828 aarch64_composite_type_p (const_tree type,
14829 machine_mode mode)
14830 {
14831 if (aarch64_short_vector_p (type, mode))
14832 return false;
14833
14834 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
14835 return true;
14836
14837 if (mode == BLKmode
14838 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
14839 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
14840 return true;
14841
14842 return false;
14843 }
14844
14845 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
14846 shall be passed or returned in simd/fp register(s), provided these
14847 parameter-passing registers are available.
14848
14849 Upon successful return, *COUNT returns the number of needed registers,
14850 *BASE_MODE returns the mode of the individual register and, when IS_HA
14851 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
14852 floating-point aggregate or a homogeneous short-vector aggregate. */
14853
14854 static bool
14855 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
14856 const_tree type,
14857 machine_mode *base_mode,
14858 int *count,
14859 bool *is_ha)
14860 {
14861 machine_mode new_mode = VOIDmode;
14862 bool composite_p = aarch64_composite_type_p (type, mode);
14863
14864 if (is_ha != NULL) *is_ha = false;
14865
14866 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
14867 || aarch64_short_vector_p (type, mode))
14868 {
14869 *count = 1;
14870 new_mode = mode;
14871 }
14872 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
14873 {
14874 if (is_ha != NULL) *is_ha = true;
14875 *count = 2;
14876 new_mode = GET_MODE_INNER (mode);
14877 }
14878 else if (type && composite_p)
14879 {
14880 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
14881
14882 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
14883 {
14884 if (is_ha != NULL) *is_ha = true;
14885 *count = ag_count;
14886 }
14887 else
14888 return false;
14889 }
14890 else
14891 return false;
14892
14893 *base_mode = new_mode;
14894 return true;
14895 }
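/* For illustration, the classification above gives:

     float                         -> *COUNT == 1, *BASE_MODE == SFmode
     float32x4_t (short vector)    -> *COUNT == 1, *BASE_MODE == V4SFmode,
                                      *IS_HA left false
     _Complex double               -> *COUNT == 2, *BASE_MODE == DFmode,
                                      *IS_HA set to true
     struct { float x, y, z; }     -> *COUNT == 3, *BASE_MODE == SFmode,
                                      *IS_HA set to true  */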
14896
14897 /* Implement TARGET_STRUCT_VALUE_RTX. */
14898
14899 static rtx
14900 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
14901 int incoming ATTRIBUTE_UNUSED)
14902 {
14903 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
14904 }
14905
14906 /* Implements target hook vector_mode_supported_p. */
14907 static bool
14908 aarch64_vector_mode_supported_p (machine_mode mode)
14909 {
14910 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14911 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
14912 }
14913
14914 /* Return the full-width SVE vector mode for element mode MODE, if one
14915 exists. */
14916 opt_machine_mode
14917 aarch64_full_sve_mode (scalar_mode mode)
14918 {
14919 switch (mode)
14920 {
14921 case E_DFmode:
14922 return VNx2DFmode;
14923 case E_SFmode:
14924 return VNx4SFmode;
14925 case E_HFmode:
14926 return VNx8HFmode;
14927 case E_DImode:
14928 return VNx2DImode;
14929 case E_SImode:
14930 return VNx4SImode;
14931 case E_HImode:
14932 return VNx8HImode;
14933 case E_QImode:
14934 return VNx16QImode;
14935 default:
14936 return opt_machine_mode ();
14937 }
14938 }
14939
14940 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
14941 if it exists. */
14942 opt_machine_mode
14943 aarch64_vq_mode (scalar_mode mode)
14944 {
14945 switch (mode)
14946 {
14947 case E_DFmode:
14948 return V2DFmode;
14949 case E_SFmode:
14950 return V4SFmode;
14951 case E_HFmode:
14952 return V8HFmode;
14953 case E_SImode:
14954 return V4SImode;
14955 case E_HImode:
14956 return V8HImode;
14957 case E_QImode:
14958 return V16QImode;
14959 case E_DImode:
14960 return V2DImode;
14961 default:
14962 return opt_machine_mode ();
14963 }
14964 }
14965
14966 /* Return appropriate SIMD container
14967 for MODE within a vector of WIDTH bits. */
14968 static machine_mode
14969 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
14970 {
14971 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
14972 return aarch64_full_sve_mode (mode).else_mode (word_mode);
14973
14974 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
14975 if (TARGET_SIMD)
14976 {
14977 if (known_eq (width, 128))
14978 return aarch64_vq_mode (mode).else_mode (word_mode);
14979 else
14980 switch (mode)
14981 {
14982 case E_SFmode:
14983 return V2SFmode;
14984 case E_HFmode:
14985 return V4HFmode;
14986 case E_SImode:
14987 return V2SImode;
14988 case E_HImode:
14989 return V4HImode;
14990 case E_QImode:
14991 return V8QImode;
14992 default:
14993 break;
14994 }
14995 }
14996 return word_mode;
14997 }
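/* For example (illustrative): with TARGET_SIMD, (SFmode, 128) yields
   V4SFmode and (SFmode, 64) yields V2SFmode; with TARGET_SVE and WIDTH
   equal to BITS_PER_SVE_VECTOR, SFmode yields VNx4SFmode.  An element
   mode with no matching container (e.g. DFmode at 64 bits) falls back
   to word_mode.  */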
14998
14999 /* Return 128-bit container as the preferred SIMD mode for MODE. */
15000 static machine_mode
15001 aarch64_preferred_simd_mode (scalar_mode mode)
15002 {
15003 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
15004 return aarch64_simd_container_mode (mode, bits);
15005 }
15006
15007 /* Return a list of possible vector sizes for the vectorizer
15008 to iterate over. */
15009 static void
15010 aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
15011 {
15012 if (TARGET_SVE)
15013 sizes->safe_push (BYTES_PER_SVE_VECTOR);
15014 sizes->safe_push (16);
15015 sizes->safe_push (8);
15016 }
15017
15018 /* Implement TARGET_MANGLE_TYPE. */
15019
15020 static const char *
15021 aarch64_mangle_type (const_tree type)
15022 {
15023 /* The AArch64 ABI documents say that "__va_list" has to be
15024 mangled as if it is in the "std" namespace. */
15025 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
15026 return "St9__va_list";
15027
15028 /* Half-precision float. */
15029 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
15030 return "Dh";
15031
15032 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
15033 builtin types. */
15034 if (TYPE_NAME (type) != NULL)
15035 return aarch64_mangle_builtin_type (type);
15036
15037 /* Use the default mangling. */
15038 return NULL;
15039 }
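/* Mangling example (illustrative): under the rules above, a C++
   declaration such as

     void f (__fp16, __builtin_va_list);

   mangles as _Z1fDhSt9__va_list: __fp16 is a 16-bit REAL_TYPE ("Dh") and
   __va_list is mangled as if it were declared in namespace std.  */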
15040
15041 /* Find the first rtx_insn before insn that will generate an assembly
15042 instruction. */
15043
15044 static rtx_insn *
15045 aarch64_prev_real_insn (rtx_insn *insn)
15046 {
15047 if (!insn)
15048 return NULL;
15049
15050 do
15051 {
15052 insn = prev_real_insn (insn);
15053 }
15054 while (insn && recog_memoized (insn) < 0);
15055
15056 return insn;
15057 }
15058
15059 static bool
15060 is_madd_op (enum attr_type t1)
15061 {
15062 unsigned int i;
15063 /* A number of these may be AArch32 only. */
15064 enum attr_type mlatypes[] = {
15065 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
15066 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
15067 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
15068 };
15069
15070 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
15071 {
15072 if (t1 == mlatypes[i])
15073 return true;
15074 }
15075
15076 return false;
15077 }
15078
15079 /* Check if there is a register dependency between a load and the insn
15080 for which we hold recog_data. */
15081
15082 static bool
15083 dep_between_memop_and_curr (rtx memop)
15084 {
15085 rtx load_reg;
15086 int opno;
15087
15088 gcc_assert (GET_CODE (memop) == SET);
15089
15090 if (!REG_P (SET_DEST (memop)))
15091 return false;
15092
15093 load_reg = SET_DEST (memop);
15094 for (opno = 1; opno < recog_data.n_operands; opno++)
15095 {
15096 rtx operand = recog_data.operand[opno];
15097 if (REG_P (operand)
15098 && reg_overlap_mentioned_p (load_reg, operand))
15099 return true;
15100
15101 }
15102 return false;
15103 }
15104
15105
15106 /* When working around the Cortex-A53 erratum 835769,
15107 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
15108 instruction and has a preceding memory instruction such that a NOP
15109 should be inserted between them. */
15110
15111 bool
15112 aarch64_madd_needs_nop (rtx_insn* insn)
15113 {
15114 enum attr_type attr_type;
15115 rtx_insn *prev;
15116 rtx body;
15117
15118 if (!TARGET_FIX_ERR_A53_835769)
15119 return false;
15120
15121 if (!INSN_P (insn) || recog_memoized (insn) < 0)
15122 return false;
15123
15124 attr_type = get_attr_type (insn);
15125 if (!is_madd_op (attr_type))
15126 return false;
15127
15128 prev = aarch64_prev_real_insn (insn);
15129 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
15130 Restore recog state to INSN to avoid state corruption. */
15131 extract_constrain_insn_cached (insn);
15132
15133 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
15134 return false;
15135
15136 body = single_set (prev);
15137
15138 /* If the previous insn is a memory op and there is no dependency between
15139 it and the DImode madd, emit a NOP between them. If body is NULL then we
15140 have a complex memory operation, probably a load/store pair.
15141 Be conservative for now and emit a NOP. */
15142 if (GET_MODE (recog_data.operand[0]) == DImode
15143 && (!body || !dep_between_memop_and_curr (body)))
15144 return true;
15145
15146 return false;
15147
15148 }
15149
15150
15151 /* Implement FINAL_PRESCAN_INSN. */
15152
15153 void
15154 aarch64_final_prescan_insn (rtx_insn *insn)
15155 {
15156 if (aarch64_madd_needs_nop (insn))
15157 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
15158 }
15159
15160
15161 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
15162 instruction. */
15163
15164 bool
15165 aarch64_sve_index_immediate_p (rtx base_or_step)
15166 {
15167 return (CONST_INT_P (base_or_step)
15168 && IN_RANGE (INTVAL (base_or_step), -16, 15));
15169 }
15170
15171 /* Return true if X is a valid immediate for the SVE ADD and SUB
15172 instructions. Negate X first if NEGATE_P is true. */
15173
15174 bool
15175 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
15176 {
15177 rtx elt;
15178
15179 if (!const_vec_duplicate_p (x, &elt)
15180 || !CONST_INT_P (elt))
15181 return false;
15182
15183 HOST_WIDE_INT val = INTVAL (elt);
15184 if (negate_p)
15185 val = -val;
15186 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
15187
15188 if (val & 0xff)
15189 return IN_RANGE (val, 0, 0xff);
15190 return IN_RANGE (val, 0, 0xff00);
15191 }
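/* For example (illustrative), with repeated 16-bit elements: #7 and
   #0x1f00 are accepted (an 8-bit immediate, optionally shifted left by
   eight), whereas #0x101 is rejected because its low byte is nonzero
   and the value does not fit in 8 bits.  */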
15192
15193 /* Return true if X is a valid immediate operand for an SVE logical
15194 instruction such as AND. */
15195
15196 bool
15197 aarch64_sve_bitmask_immediate_p (rtx x)
15198 {
15199 rtx elt;
15200
15201 return (const_vec_duplicate_p (x, &elt)
15202 && CONST_INT_P (elt)
15203 && aarch64_bitmask_imm (INTVAL (elt),
15204 GET_MODE_INNER (GET_MODE (x))));
15205 }
15206
15207 /* Return true if X is a valid immediate for the SVE DUP and CPY
15208 instructions. */
15209
15210 bool
15211 aarch64_sve_dup_immediate_p (rtx x)
15212 {
15213 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
15214 if (!CONST_INT_P (x))
15215 return false;
15216
15217 HOST_WIDE_INT val = INTVAL (x);
15218 if (val & 0xff)
15219 return IN_RANGE (val, -0x80, 0x7f);
15220 return IN_RANGE (val, -0x8000, 0x7f00);
15221 }
15222
15223 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
15224 SIGNED_P says whether the operand is signed rather than unsigned. */
15225
15226 bool
15227 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
15228 {
15229 rtx elt;
15230
15231 return (const_vec_duplicate_p (x, &elt)
15232 && CONST_INT_P (elt)
15233 && (signed_p
15234 ? IN_RANGE (INTVAL (elt), -16, 15)
15235 : IN_RANGE (INTVAL (elt), 0, 127)));
15236 }
15237
15238 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
15239 instruction. Negate X first if NEGATE_P is true. */
15240
15241 bool
15242 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
15243 {
15244 rtx elt;
15245 REAL_VALUE_TYPE r;
15246
15247 if (!const_vec_duplicate_p (x, &elt)
15248 || GET_CODE (elt) != CONST_DOUBLE)
15249 return false;
15250
15251 r = *CONST_DOUBLE_REAL_VALUE (elt);
15252
15253 if (negate_p)
15254 r = real_value_negate (&r);
15255
15256 if (real_equal (&r, &dconst1))
15257 return true;
15258 if (real_equal (&r, &dconsthalf))
15259 return true;
15260 return false;
15261 }
15262
15263 /* Return true if X is a valid immediate operand for an SVE FMUL
15264 instruction. */
15265
15266 bool
15267 aarch64_sve_float_mul_immediate_p (rtx x)
15268 {
15269 rtx elt;
15270
15271 return (const_vec_duplicate_p (x, &elt)
15272 && GET_CODE (elt) == CONST_DOUBLE
15273 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
15274 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
15275 }
15276
15277 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
15278 for the Advanced SIMD operation described by WHICH and INSN. If INFO
15279 is nonnull, use it to describe valid immediates. */
15280 static bool
15281 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
15282 simd_immediate_info *info,
15283 enum simd_immediate_check which,
15284 simd_immediate_info::insn_type insn)
15285 {
15286 /* Try a 4-byte immediate with LSL. */
15287 for (unsigned int shift = 0; shift < 32; shift += 8)
15288 if ((val32 & (0xff << shift)) == val32)
15289 {
15290 if (info)
15291 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15292 simd_immediate_info::LSL, shift);
15293 return true;
15294 }
15295
15296 /* Try a 2-byte immediate with LSL. */
15297 unsigned int imm16 = val32 & 0xffff;
15298 if (imm16 == (val32 >> 16))
15299 for (unsigned int shift = 0; shift < 16; shift += 8)
15300 if ((imm16 & (0xff << shift)) == imm16)
15301 {
15302 if (info)
15303 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
15304 simd_immediate_info::LSL, shift);
15305 return true;
15306 }
15307
15308 /* Try a 4-byte immediate with MSL, except for cases that MVN
15309 can handle. */
15310 if (which == AARCH64_CHECK_MOV)
15311 for (unsigned int shift = 8; shift < 24; shift += 8)
15312 {
15313 unsigned int low = (1 << shift) - 1;
15314 if (((val32 & (0xff << shift)) | low) == val32)
15315 {
15316 if (info)
15317 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15318 simd_immediate_info::MSL, shift);
15319 return true;
15320 }
15321 }
15322
15323 return false;
15324 }
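/* Illustrative examples: VAL32 == 0x00ab0000 is accepted as the SImode
   immediate 0xab with LSL #16; VAL32 == 0x004b004b is accepted as the
   HImode immediate 0x4b with LSL #0 (both halves match); and, for the
   AARCH64_CHECK_MOV case only, VAL32 == 0x0012ffff is accepted as 0x12
   with MSL #16.  */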
15325
15326 /* Return true if replicating VAL64 is a valid immediate for the
15327 Advanced SIMD operation described by WHICH. If INFO is nonnull,
15328 use it to describe valid immediates. */
15329 static bool
15330 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
15331 simd_immediate_info *info,
15332 enum simd_immediate_check which)
15333 {
15334 unsigned int val32 = val64 & 0xffffffff;
15335 unsigned int val16 = val64 & 0xffff;
15336 unsigned int val8 = val64 & 0xff;
15337
15338 if (val32 == (val64 >> 32))
15339 {
15340 if ((which & AARCH64_CHECK_ORR) != 0
15341 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
15342 simd_immediate_info::MOV))
15343 return true;
15344
15345 if ((which & AARCH64_CHECK_BIC) != 0
15346 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
15347 simd_immediate_info::MVN))
15348 return true;
15349
15350 /* Try using a replicated byte. */
15351 if (which == AARCH64_CHECK_MOV
15352 && val16 == (val32 >> 16)
15353 && val8 == (val16 >> 8))
15354 {
15355 if (info)
15356 *info = simd_immediate_info (QImode, val8);
15357 return true;
15358 }
15359 }
15360
15361 /* Try using a bit-to-bytemask. */
15362 if (which == AARCH64_CHECK_MOV)
15363 {
15364 unsigned int i;
15365 for (i = 0; i < 64; i += 8)
15366 {
15367 unsigned char byte = (val64 >> i) & 0xff;
15368 if (byte != 0 && byte != 0xff)
15369 break;
15370 }
15371 if (i == 64)
15372 {
15373 if (info)
15374 *info = simd_immediate_info (DImode, val64);
15375 return true;
15376 }
15377 }
15378 return false;
15379 }
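/* Further illustrative examples, both for AARCH64_CHECK_MOV:
   VAL64 == 0x2323232323232323 is accepted as the replicated QImode byte
   0x23, and VAL64 == 0x00ff0000ffff00ff is accepted as a DImode
   bit-to-bytemask, since every byte is either 0x00 or 0xff.  */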
15380
15381 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
15382 instruction. If INFO is nonnull, use it to describe valid immediates. */
15383
15384 static bool
15385 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
15386 simd_immediate_info *info)
15387 {
15388 scalar_int_mode mode = DImode;
15389 unsigned int val32 = val64 & 0xffffffff;
15390 if (val32 == (val64 >> 32))
15391 {
15392 mode = SImode;
15393 unsigned int val16 = val32 & 0xffff;
15394 if (val16 == (val32 >> 16))
15395 {
15396 mode = HImode;
15397 unsigned int val8 = val16 & 0xff;
15398 if (val8 == (val16 >> 8))
15399 mode = QImode;
15400 }
15401 }
15402 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
15403 if (IN_RANGE (val, -0x80, 0x7f))
15404 {
15405 /* DUP with no shift. */
15406 if (info)
15407 *info = simd_immediate_info (mode, val);
15408 return true;
15409 }
15410 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
15411 {
15412 /* DUP with LSL #8. */
15413 if (info)
15414 *info = simd_immediate_info (mode, val);
15415 return true;
15416 }
15417 if (aarch64_bitmask_imm (val64, mode))
15418 {
15419 /* DUPM. */
15420 if (info)
15421 *info = simd_immediate_info (mode, val);
15422 return true;
15423 }
15424 return false;
15425 }
15426
15427 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
15428 it to describe valid immediates. */
15429
15430 static bool
15431 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
15432 {
15433 if (x == CONST0_RTX (GET_MODE (x)))
15434 {
15435 if (info)
15436 *info = simd_immediate_info (DImode, 0);
15437 return true;
15438 }
15439
15440 /* Analyze the value as a VNx16BImode. This should be relatively
15441 efficient, since rtx_vector_builder has enough built-in capacity
15442 to store all VLA predicate constants without needing the heap. */
15443 rtx_vector_builder builder;
15444 if (!aarch64_get_sve_pred_bits (builder, x))
15445 return false;
15446
15447 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
15448 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
15449 {
15450 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
15451 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
15452 if (pattern != AARCH64_NUM_SVPATTERNS)
15453 {
15454 if (info)
15455 {
15456 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
15457 *info = simd_immediate_info (int_mode, pattern);
15458 }
15459 return true;
15460 }
15461 }
15462 return false;
15463 }
15464
15465 /* Return true if OP is a valid SIMD immediate for the operation
15466 described by WHICH. If INFO is nonnull, use it to describe valid
15467 immediates. */
15468 bool
15469 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
15470 enum simd_immediate_check which)
15471 {
15472 machine_mode mode = GET_MODE (op);
15473 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15474 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15475 return false;
15476
15477 if (vec_flags & VEC_SVE_PRED)
15478 return aarch64_sve_pred_valid_immediate (op, info);
15479
15480 scalar_mode elt_mode = GET_MODE_INNER (mode);
15481 rtx base, step;
15482 unsigned int n_elts;
15483 if (GET_CODE (op) == CONST_VECTOR
15484 && CONST_VECTOR_DUPLICATE_P (op))
15485 n_elts = CONST_VECTOR_NPATTERNS (op);
15486 else if ((vec_flags & VEC_SVE_DATA)
15487 && const_vec_series_p (op, &base, &step))
15488 {
15489 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
15490 if (!aarch64_sve_index_immediate_p (base)
15491 || !aarch64_sve_index_immediate_p (step))
15492 return false;
15493
15494 if (info)
15495 *info = simd_immediate_info (elt_mode, base, step);
15496 return true;
15497 }
15498 else if (GET_CODE (op) == CONST_VECTOR
15499 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
15500 /* N_ELTS set above. */;
15501 else
15502 return false;
15503
15504 scalar_float_mode elt_float_mode;
15505 if (n_elts == 1
15506 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
15507 {
15508 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
15509 if (aarch64_float_const_zero_rtx_p (elt)
15510 || aarch64_float_const_representable_p (elt))
15511 {
15512 if (info)
15513 *info = simd_immediate_info (elt_float_mode, elt);
15514 return true;
15515 }
15516 }
15517
15518 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
15519 if (elt_size > 8)
15520 return false;
15521
15522 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
15523
15524 /* Expand the vector constant out into a byte vector, with the least
15525 significant byte of the register first. */
15526 auto_vec<unsigned char, 16> bytes;
15527 bytes.reserve (n_elts * elt_size);
15528 for (unsigned int i = 0; i < n_elts; i++)
15529 {
15530 /* The vector is provided in GCC's endian-neutral fashion.
15531 For aarch64_be Advanced SIMD, it must be laid out in the vector
15532 register in reverse order. */
15533 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
15534 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
15535
15536 if (elt_mode != elt_int_mode)
15537 elt = gen_lowpart (elt_int_mode, elt);
15538
15539 if (!CONST_INT_P (elt))
15540 return false;
15541
15542 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
15543 for (unsigned int byte = 0; byte < elt_size; byte++)
15544 {
15545 bytes.quick_push (elt_val & 0xff);
15546 elt_val >>= BITS_PER_UNIT;
15547 }
15548 }
15549
15550 /* The immediate must repeat every eight bytes. */
15551 unsigned int nbytes = bytes.length ();
15552 for (unsigned i = 8; i < nbytes; ++i)
15553 if (bytes[i] != bytes[i - 8])
15554 return false;
15555
15556 /* Get the repeating 8-byte value as an integer. No endian correction
15557 is needed here because bytes is already in lsb-first order. */
15558 unsigned HOST_WIDE_INT val64 = 0;
15559 for (unsigned int i = 0; i < 8; i++)
15560 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
15561 << (i * BITS_PER_UNIT));
15562
15563 if (vec_flags & VEC_SVE_DATA)
15564 return aarch64_sve_valid_immediate (val64, info);
15565 else
15566 return aarch64_advsimd_valid_immediate (val64, info, which);
15567 }
15568
15569 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
15570 has a step in the range of the SVE INDEX instruction. Return the step
15571 if so, otherwise return null. */
15572 rtx
15573 aarch64_check_zero_based_sve_index_immediate (rtx x)
15574 {
15575 rtx base, step;
15576 if (const_vec_series_p (x, &base, &step)
15577 && base == const0_rtx
15578 && aarch64_sve_index_immediate_p (step))
15579 return step;
15580 return NULL_RTX;
15581 }
15582
15583 /* Check whether immediate shift constants are within range. */
15584 bool
15585 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
15586 {
15587 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
15588 if (left)
15589 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
15590 else
15591 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
15592 }
15593
15594 /* Return the bitmask CONST_INT to select the bits required by a zero extract
15595 operation of width WIDTH at bit position POS. */
15596
15597 rtx
15598 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
15599 {
15600 gcc_assert (CONST_INT_P (width));
15601 gcc_assert (CONST_INT_P (pos));
15602
15603 unsigned HOST_WIDE_INT mask
15604 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
15605 return GEN_INT (mask << UINTVAL (pos));
15606 }
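/* For instance (illustrative), WIDTH == 8 and POS == 16 give
   ((unsigned HOST_WIDE_INT) 1 << 8) - 1 == 0xff, shifted left by 16,
   i.e. the mask 0xff0000 selecting the bits of a zero_extract of width
   8 at bit position 16.  */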
15607
15608 bool
15609 aarch64_mov_operand_p (rtx x, machine_mode mode)
15610 {
15611 if (GET_CODE (x) == HIGH
15612 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
15613 return true;
15614
15615 if (CONST_INT_P (x))
15616 return true;
15617
15618 if (VECTOR_MODE_P (GET_MODE (x)))
15619 {
15620 /* Require predicate constants to be VNx16BI before RA, so that we
15621 force everything to have a canonical form. */
15622 if (!lra_in_progress
15623 && !reload_completed
15624 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
15625 && GET_MODE (x) != VNx16BImode)
15626 return false;
15627
15628 return aarch64_simd_valid_immediate (x, NULL);
15629 }
15630
15631 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
15632 return true;
15633
15634 if (aarch64_sve_cnt_immediate_p (x))
15635 return true;
15636
15637 return aarch64_classify_symbolic_expression (x)
15638 == SYMBOL_TINY_ABSOLUTE;
15639 }
15640
15641 /* Return a const_int vector of VAL. */
15642 rtx
15643 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
15644 {
15645 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
15646 return gen_const_vec_duplicate (mode, c);
15647 }
15648
15649 /* Check OP is a legal scalar immediate for the MOVI instruction. */
15650
15651 bool
15652 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
15653 {
15654 machine_mode vmode;
15655
15656 vmode = aarch64_simd_container_mode (mode, 64);
15657 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
15658 return aarch64_simd_valid_immediate (op_v, NULL);
15659 }
15660
15661 /* Construct and return a PARALLEL RTX vector with elements numbering the
15662 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
15663 the vector - from the perspective of the architecture. This does not
15664 line up with GCC's perspective on lane numbers, so we end up with
15665 different masks depending on our target endian-ness. The diagram
15666 below may help. We must draw the distinction when building masks
15667 which select one half of the vector. An instruction selecting
15668 architectural low-lanes for a big-endian target, must be described using
15669 a mask selecting GCC high-lanes.
15670
15671 Big-Endian Little-Endian
15672
15673 GCC 0 1 2 3 3 2 1 0
15674 | x | x | x | x | | x | x | x | x |
15675 Architecture 3 2 1 0 3 2 1 0
15676
15677 Low Mask: { 2, 3 } { 0, 1 }
15678 High Mask: { 0, 1 } { 2, 3 }
15679
15680 MODE is the mode of the vector and NUNITS is the number of units in it. */
15681
15682 rtx
15683 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
15684 {
15685 rtvec v = rtvec_alloc (nunits / 2);
15686 int high_base = nunits / 2;
15687 int low_base = 0;
15688 int base;
15689 rtx t1;
15690 int i;
15691
15692 if (BYTES_BIG_ENDIAN)
15693 base = high ? low_base : high_base;
15694 else
15695 base = high ? high_base : low_base;
15696
15697 for (i = 0; i < nunits / 2; i++)
15698 RTVEC_ELT (v, i) = GEN_INT (base + i);
15699
15700 t1 = gen_rtx_PARALLEL (mode, v);
15701 return t1;
15702 }
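/* Concretely (illustrative): for V4SImode with NUNITS == 4 and
   HIGH == true, this returns the PARALLEL [2, 3] on little-endian but
   [0, 1] on big-endian, matching the table in the comment above.  */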
15703
15704 /* Check OP for validity as a PARALLEL RTX vector with elements
15705 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
15706 from the perspective of the architecture. See the diagram above
15707 aarch64_simd_vect_par_cnst_half for more details. */
15708
15709 bool
15710 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
15711 bool high)
15712 {
15713 int nelts;
15714 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
15715 return false;
15716
15717 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
15718 HOST_WIDE_INT count_op = XVECLEN (op, 0);
15719 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
15720 int i = 0;
15721
15722 if (count_op != count_ideal)
15723 return false;
15724
15725 for (i = 0; i < count_ideal; i++)
15726 {
15727 rtx elt_op = XVECEXP (op, 0, i);
15728 rtx elt_ideal = XVECEXP (ideal, 0, i);
15729
15730 if (!CONST_INT_P (elt_op)
15731 || INTVAL (elt_ideal) != INTVAL (elt_op))
15732 return false;
15733 }
15734 return true;
15735 }
15736
15737 /* Return a PARALLEL containing NELTS elements, with element I equal
15738 to BASE + I * STEP. */
15739
15740 rtx
15741 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
15742 {
15743 rtvec vec = rtvec_alloc (nelts);
15744 for (unsigned int i = 0; i < nelts; ++i)
15745 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
15746 return gen_rtx_PARALLEL (VOIDmode, vec);
15747 }
15748
15749 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
15750 series with step STEP. */
15751
15752 bool
15753 aarch64_stepped_int_parallel_p (rtx op, int step)
15754 {
15755 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
15756 return false;
15757
15758 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
15759 for (int i = 1; i < XVECLEN (op, 0); ++i)
15760 if (!CONST_INT_P (XVECEXP (op, 0, i))
15761 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
15762 return false;
15763
15764 return true;
15765 }
15766
15767 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
15768 HIGH (exclusive). */
15769 void
15770 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
15771 const_tree exp)
15772 {
15773 HOST_WIDE_INT lane;
15774 gcc_assert (CONST_INT_P (operand));
15775 lane = INTVAL (operand);
15776
15777 if (lane < low || lane >= high)
15778 {
15779 if (exp)
15780 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
15781 else
15782 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
15783 }
15784 }
15785
15786 /* Perform endian correction on lane number N, which indexes a vector
15787 of mode MODE, and return the result as an SImode rtx. */
15788
15789 rtx
15790 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
15791 {
15792 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
15793 }
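/* For example (illustrative, assuming the usual ENDIAN_LANE_N
   definition): GCC lane 0 of a V4SImode vector maps to 0 on
   little-endian but to 3 on big-endian, so that the architectural lane
   number is what gets encoded in the instruction.  */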
15794
15795 /* Return TRUE if OP is a valid vector addressing mode. */
15796
15797 bool
15798 aarch64_simd_mem_operand_p (rtx op)
15799 {
15800 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
15801 || REG_P (XEXP (op, 0)));
15802 }
15803
15804 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
15805
15806 bool
15807 aarch64_sve_ld1r_operand_p (rtx op)
15808 {
15809 struct aarch64_address_info addr;
15810 scalar_mode mode;
15811
15812 return (MEM_P (op)
15813 && is_a <scalar_mode> (GET_MODE (op), &mode)
15814 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
15815 && addr.type == ADDRESS_REG_IMM
15816 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
15817 }
15818
15819 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
15820 bool
15821 aarch64_sve_ld1rq_operand_p (rtx op)
15822 {
15823 struct aarch64_address_info addr;
15824 scalar_mode elem_mode = GET_MODE_INNER (GET_MODE (op));
15825 if (!MEM_P (op)
15826 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
15827 return false;
15828
15829 if (addr.type == ADDRESS_REG_IMM)
15830 return offset_4bit_signed_scaled_p (TImode, addr.const_offset);
15831
15832 if (addr.type == ADDRESS_REG_REG)
15833 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
15834
15835 return false;
15836 }
15837
15838 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
15839 The conditions for STR are the same. */
15840 bool
15841 aarch64_sve_ldr_operand_p (rtx op)
15842 {
15843 struct aarch64_address_info addr;
15844
15845 return (MEM_P (op)
15846 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
15847 false, ADDR_QUERY_ANY)
15848 && addr.type == ADDRESS_REG_IMM);
15849 }
15850
15851 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
15852 We need to be able to access the individual pieces, so the range
15853 is different from LD[234] and ST[234]. */
15854 bool
15855 aarch64_sve_struct_memory_operand_p (rtx op)
15856 {
15857 if (!MEM_P (op))
15858 return false;
15859
15860 machine_mode mode = GET_MODE (op);
15861 struct aarch64_address_info addr;
15862 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
15863 ADDR_QUERY_ANY)
15864 || addr.type != ADDRESS_REG_IMM)
15865 return false;
15866
15867 poly_int64 first = addr.const_offset;
15868 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
15869 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
15870 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
15871 }
15872
15873 /* Emit a register copy from operand to operand, taking care not to
15874 early-clobber source registers in the process.
15875
15876 COUNT is the number of components into which the copy needs to be
15877 decomposed. */
15878 void
15879 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
15880 unsigned int count)
15881 {
15882 unsigned int i;
15883 int rdest = REGNO (operands[0]);
15884 int rsrc = REGNO (operands[1]);
15885
15886 if (!reg_overlap_mentioned_p (operands[0], operands[1])
15887 || rdest < rsrc)
15888 for (i = 0; i < count; i++)
15889 emit_move_insn (gen_rtx_REG (mode, rdest + i),
15890 gen_rtx_REG (mode, rsrc + i));
15891 else
15892 for (i = 0; i < count; i++)
15893 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
15894 gen_rtx_REG (mode, rsrc + count - i - 1));
15895 }
15896
15897 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
15898 one of the VSTRUCT modes: OI, CI, or XI. */
15899 int
15900 aarch64_simd_attr_length_rglist (machine_mode mode)
15901 {
15902 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
15903 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
15904 }
15905
15906 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
15907 alignment of a vector to 128 bits. SVE predicates have an alignment of
15908 16 bits. */
15909 static HOST_WIDE_INT
15910 aarch64_simd_vector_alignment (const_tree type)
15911 {
15912 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15913 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
15914 be set for non-predicate vectors of booleans. Modes are the most
15915 direct way we have of identifying real SVE predicate types. */
15916 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
15917 return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
15918 }
15919
15920 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
15921 static poly_uint64
15922 aarch64_vectorize_preferred_vector_alignment (const_tree type)
15923 {
15924 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
15925 {
15926 /* If the length of the vector is fixed, try to align to that length,
15927 otherwise don't try to align at all. */
15928 HOST_WIDE_INT result;
15929 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
15930 result = TYPE_ALIGN (TREE_TYPE (type));
15931 return result;
15932 }
15933 return TYPE_ALIGN (type);
15934 }
15935
15936 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
15937 static bool
15938 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
15939 {
15940 if (is_packed)
15941 return false;
15942
15943 /* For fixed-length vectors, check that the vectorizer will aim for
15944 full-vector alignment. This isn't true for generic GCC vectors
15945 that are wider than the ABI maximum of 128 bits. */
15946 poly_uint64 preferred_alignment =
15947 aarch64_vectorize_preferred_vector_alignment (type);
15948 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
15949 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
15950 preferred_alignment))
15951 return false;
15952
15953 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
15954 return true;
15955 }
15956
15957 /* Return true if the vector misalignment factor is supported by the
15958 target. */
15959 static bool
15960 aarch64_builtin_support_vector_misalignment (machine_mode mode,
15961 const_tree type, int misalignment,
15962 bool is_packed)
15963 {
15964 if (TARGET_SIMD && STRICT_ALIGNMENT)
15965 {
15966 /* Return if movmisalign pattern is not supported for this mode. */
15967 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
15968 return false;
15969
15970 /* Misalignment factor is unknown at compile time. */
15971 if (misalignment == -1)
15972 return false;
15973 }
15974 return default_builtin_support_vector_misalignment (mode, type, misalignment,
15975 is_packed);
15976 }
15977
15978 /* If VALS is a vector constant that can be loaded into a register
15979 using DUP, generate instructions to do so and return an RTX to
15980 assign to the register. Otherwise return NULL_RTX. */
15981 static rtx
15982 aarch64_simd_dup_constant (rtx vals)
15983 {
15984 machine_mode mode = GET_MODE (vals);
15985 machine_mode inner_mode = GET_MODE_INNER (mode);
15986 rtx x;
15987
15988 if (!const_vec_duplicate_p (vals, &x))
15989 return NULL_RTX;
15990
15991 /* We can load this constant by using DUP and a constant in a
15992 single ARM register. This will be cheaper than a vector
15993 load. */
15994 x = copy_to_mode_reg (inner_mode, x);
15995 return gen_vec_duplicate (mode, x);
15996 }
15997
15998
15999 /* Generate code to load VALS, which is a PARALLEL containing only
16000 constants (for vec_init) or CONST_VECTOR, efficiently into a
16001 register. Returns an RTX to copy into the register, or NULL_RTX
16002 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
16003 static rtx
16004 aarch64_simd_make_constant (rtx vals)
16005 {
16006 machine_mode mode = GET_MODE (vals);
16007 rtx const_dup;
16008 rtx const_vec = NULL_RTX;
16009 int n_const = 0;
16010 int i;
16011
16012 if (GET_CODE (vals) == CONST_VECTOR)
16013 const_vec = vals;
16014 else if (GET_CODE (vals) == PARALLEL)
16015 {
16016 /* A CONST_VECTOR must contain only CONST_INTs and
16017 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
16018 Only store valid constants in a CONST_VECTOR. */
16019 int n_elts = XVECLEN (vals, 0);
16020 for (i = 0; i < n_elts; ++i)
16021 {
16022 rtx x = XVECEXP (vals, 0, i);
16023 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16024 n_const++;
16025 }
16026 if (n_const == n_elts)
16027 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
16028 }
16029 else
16030 gcc_unreachable ();
16031
16032 if (const_vec != NULL_RTX
16033 && aarch64_simd_valid_immediate (const_vec, NULL))
16034 /* Load using MOVI/MVNI. */
16035 return const_vec;
16036 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
16037 /* Loaded using DUP. */
16038 return const_dup;
16039 else if (const_vec != NULL_RTX)
16040 /* Load from constant pool. We cannot take advantage of single-cycle
16041 LD1 because we need a PC-relative addressing mode. */
16042 return const_vec;
16043 else
16044 /* A PARALLEL containing something not valid inside CONST_VECTOR.
16045 We cannot construct an initializer. */
16046 return NULL_RTX;
16047 }
16048
16049 /* Expand a vector initialisation sequence, such that TARGET is
16050 initialised to contain VALS. */
16051
16052 void
16053 aarch64_expand_vector_init (rtx target, rtx vals)
16054 {
16055 machine_mode mode = GET_MODE (target);
16056 scalar_mode inner_mode = GET_MODE_INNER (mode);
16057 /* The number of vector elements. */
16058 int n_elts = XVECLEN (vals, 0);
16059 /* The number of vector elements which are not constant. */
16060 int n_var = 0;
16061 rtx any_const = NULL_RTX;
16062 /* The first element of vals. */
16063 rtx v0 = XVECEXP (vals, 0, 0);
16064 bool all_same = true;
16065
16066 /* This is a special vec_init<M><N> where N is not an element mode but a
16067 vector mode with half the elements of M. We expect to find two entries
16068 of mode N in VALS and we must put their concatenation into TARGET. */
16069 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
16070 {
16071 gcc_assert (known_eq (GET_MODE_SIZE (mode),
16072 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
16073 rtx lo = XVECEXP (vals, 0, 0);
16074 rtx hi = XVECEXP (vals, 0, 1);
16075 machine_mode narrow_mode = GET_MODE (lo);
16076 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
16077 gcc_assert (narrow_mode == GET_MODE (hi));
16078
16079 /* When we want to concatenate a half-width vector with zeroes we can
16080 use the aarch64_combinez[_be] patterns. Just make sure that the
16081 zeroes are in the right half. */
16082 if (BYTES_BIG_ENDIAN
16083 && aarch64_simd_imm_zero (lo, narrow_mode)
16084 && general_operand (hi, narrow_mode))
16085 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
16086 else if (!BYTES_BIG_ENDIAN
16087 && aarch64_simd_imm_zero (hi, narrow_mode)
16088 && general_operand (lo, narrow_mode))
16089 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
16090 else
16091 {
16092 /* Else create the two half-width registers and combine them. */
16093 if (!REG_P (lo))
16094 lo = force_reg (GET_MODE (lo), lo);
16095 if (!REG_P (hi))
16096 hi = force_reg (GET_MODE (hi), hi);
16097
16098 if (BYTES_BIG_ENDIAN)
16099 std::swap (lo, hi);
16100 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
16101 }
16102 return;
16103 }
16104
16105 /* Count the number of variable elements to initialise. */
16106 for (int i = 0; i < n_elts; ++i)
16107 {
16108 rtx x = XVECEXP (vals, 0, i);
16109 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
16110 ++n_var;
16111 else
16112 any_const = x;
16113
16114 all_same &= rtx_equal_p (x, v0);
16115 }
16116
16117 /* No variable elements, hand off to aarch64_simd_make_constant which knows
16118 how best to handle this. */
16119 if (n_var == 0)
16120 {
16121 rtx constant = aarch64_simd_make_constant (vals);
16122 if (constant != NULL_RTX)
16123 {
16124 emit_move_insn (target, constant);
16125 return;
16126 }
16127 }
16128
16129 /* Splat a single non-constant element if we can. */
16130 if (all_same)
16131 {
16132 rtx x = copy_to_mode_reg (inner_mode, v0);
16133 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16134 return;
16135 }
16136
16137 enum insn_code icode = optab_handler (vec_set_optab, mode);
16138 gcc_assert (icode != CODE_FOR_nothing);
16139
16140 /* If there are only variable elements, try to optimize
16141 the insertion using dup for the most common element
16142 followed by insertions. */
16143
16144 /* The algorithm will fill matches[*][0] with the earliest matching element,
16145 and matches[X][1] with the count of duplicate elements (if X is the
16146 earliest element which has duplicates). */
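 /* For example, for VALS = { x, y, x, x } the loop below produces
 matches[0] = { 0, 3 }, matches[1] = { 1, 1 }, matches[2] = { 0, 0 }
 and matches[3] = { 0, 0 }, so element 0 is chosen as the most common
 element, with three occurrences. */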
16147
16148 if (n_var == n_elts && n_elts <= 16)
16149 {
16150 int matches[16][2] = {0};
16151 for (int i = 0; i < n_elts; i++)
16152 {
16153 for (int j = 0; j <= i; j++)
16154 {
16155 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
16156 {
16157 matches[i][0] = j;
16158 matches[j][1]++;
16159 break;
16160 }
16161 }
16162 }
16163 int maxelement = 0;
16164 int maxv = 0;
16165 for (int i = 0; i < n_elts; i++)
16166 if (matches[i][1] > maxv)
16167 {
16168 maxelement = i;
16169 maxv = matches[i][1];
16170 }
16171
16172 /* Create a duplicate of the most common element, unless all elements
16173 are equally useless to us, in which case just immediately set the
16174 vector register using the first element. */
16175
16176 if (maxv == 1)
16177 {
16178 /* For vectors of two 64-bit elements, we can do even better. */
16179 if (n_elts == 2
16180 && (inner_mode == E_DImode
16181 || inner_mode == E_DFmode))
16182
16183 {
16184 rtx x0 = XVECEXP (vals, 0, 0);
16185 rtx x1 = XVECEXP (vals, 0, 1);
16186 /* Combine can pick up this case, but handling it directly
16187 here leaves clearer RTL.
16188
16189 This is load_pair_lanes<mode>, and also gives us a clean-up
16190 for store_pair_lanes<mode>. */
16191 if (memory_operand (x0, inner_mode)
16192 && memory_operand (x1, inner_mode)
16193 && !STRICT_ALIGNMENT
16194 && rtx_equal_p (XEXP (x1, 0),
16195 plus_constant (Pmode,
16196 XEXP (x0, 0),
16197 GET_MODE_SIZE (inner_mode))))
16198 {
16199 rtx t;
16200 if (inner_mode == DFmode)
16201 t = gen_load_pair_lanesdf (target, x0, x1);
16202 else
16203 t = gen_load_pair_lanesdi (target, x0, x1);
16204 emit_insn (t);
16205 return;
16206 }
16207 }
16208 /* The subreg-move sequence below will move into lane zero of the
16209 vector register. For big-endian we want that position to hold
16210 the last element of VALS. */
16211 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
16212 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16213 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
16214 }
16215 else
16216 {
16217 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16218 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16219 }
16220
16221 /* Insert the rest. */
16222 for (int i = 0; i < n_elts; i++)
16223 {
16224 rtx x = XVECEXP (vals, 0, i);
16225 if (matches[i][0] == maxelement)
16226 continue;
16227 x = copy_to_mode_reg (inner_mode, x);
16228 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16229 }
16230 return;
16231 }
16232
16233 /* Initialise a vector which is part-variable. We want to first try
16234 to build those lanes which are constant in the most efficient way we
16235 can. */
16236 if (n_var != n_elts)
16237 {
16238 rtx copy = copy_rtx (vals);
16239
16240 /* Load constant part of vector. We really don't care what goes into the
16241 parts we will overwrite, but we're more likely to be able to load the
16242 constant efficiently if it has fewer, larger, repeating parts
16243 (see aarch64_simd_valid_immediate). */
16244 for (int i = 0; i < n_elts; i++)
16245 {
16246 rtx x = XVECEXP (vals, 0, i);
16247 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16248 continue;
16249 rtx subst = any_const;
16250 for (int bit = n_elts / 2; bit > 0; bit /= 2)
16251 {
16252 /* Look in the copied vector, as more elements are const. */
16253 rtx test = XVECEXP (copy, 0, i ^ bit);
16254 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
16255 {
16256 subst = test;
16257 break;
16258 }
16259 }
16260 XVECEXP (copy, 0, i) = subst;
16261 }
16262 aarch64_expand_vector_init (target, copy);
16263 }
16264
16265 /* Insert the variable lanes directly. */
16266 for (int i = 0; i < n_elts; i++)
16267 {
16268 rtx x = XVECEXP (vals, 0, i);
16269 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16270 continue;
16271 x = copy_to_mode_reg (inner_mode, x);
16272 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16273 }
16274 }
16275
16276 /* Emit RTL corresponding to:
16277 insr TARGET, ELEM. */
16278
16279 static void
16280 emit_insr (rtx target, rtx elem)
16281 {
16282 machine_mode mode = GET_MODE (target);
16283 scalar_mode elem_mode = GET_MODE_INNER (mode);
16284 elem = force_reg (elem_mode, elem);
16285
16286 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
16287 gcc_assert (icode != CODE_FOR_nothing);
16288 emit_insn (GEN_FCN (icode) (target, target, elem));
16289 }
16290
16291 /* Subroutine of aarch64_sve_expand_vector_init for handling
16292 trailing constants.
16293 This function works as follows:
16294 (a) Create a new vector consisting of trailing constants.
16295 (b) Initialize TARGET with the constant vector using emit_move_insn.
16296 (c) Insert remaining elements in TARGET using insr.
16297 NELTS is the total number of elements in the original vector, while
16298 NELTS_REQD is the number of elements that are actually
16299 significant.
16300
16301 ??? The heuristic used is to do the above only if the number of constants
16302 is at least half the total number of elements. May need fine-tuning. */
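/* For example, for BUILDER = { x, y, 1, 2 } (with x and y non-constant)
   and NELTS_REQD == 4, the two trailing constants satisfy the heuristic:
   TARGET is first set from a constant vector whose leading elements are
   { 1, 2, ... }, and the remaining elements are then shifted in from the
   front:
     insr TARGET, y
     insr TARGET, x
   leaving { x, y, 1, 2 } in the first NELTS_REQD lanes. */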
16303
16304 static bool
16305 aarch64_sve_expand_vector_init_handle_trailing_constants
16306 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
16307 {
16308 machine_mode mode = GET_MODE (target);
16309 scalar_mode elem_mode = GET_MODE_INNER (mode);
16310 int n_trailing_constants = 0;
16311
16312 for (int i = nelts_reqd - 1;
16313 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
16314 i--)
16315 n_trailing_constants++;
16316
16317 if (n_trailing_constants >= nelts_reqd / 2)
16318 {
16319 rtx_vector_builder v (mode, 1, nelts);
16320 for (int i = 0; i < nelts; i++)
16321 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
16322 rtx const_vec = v.build ();
16323 emit_move_insn (target, const_vec);
16324
16325 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
16326 emit_insr (target, builder.elt (i));
16327
16328 return true;
16329 }
16330
16331 return false;
16332 }
16333
16334 /* Subroutine of aarch64_sve_expand_vector_init.
16335 Works as follows:
16336 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
16337 (b) Skip trailing elements from BUILDER, which are the same as
16338 element NELTS_REQD - 1.
16339 (c) Insert earlier elements in reverse order in TARGET using insr. */
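/* For example, for BUILDER = { w, x, y, y } and NELTS_REQD == 4, TARGET is
   first set to dup (y) and the remaining elements are then inserted with:
     insr TARGET, x
     insr TARGET, w
   giving { w, x, y, y }. */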
16340
16341 static void
16342 aarch64_sve_expand_vector_init_insert_elems (rtx target,
16343 const rtx_vector_builder &builder,
16344 int nelts_reqd)
16345 {
16346 machine_mode mode = GET_MODE (target);
16347 scalar_mode elem_mode = GET_MODE_INNER (mode);
16348
16349 struct expand_operand ops[2];
16350 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
16351 gcc_assert (icode != CODE_FOR_nothing);
16352
16353 create_output_operand (&ops[0], target, mode);
16354 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
16355 expand_insn (icode, 2, ops);
16356
16357 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16358 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
16359 emit_insr (target, builder.elt (i));
16360 }
16361
16362 /* Subroutine of aarch64_sve_expand_vector_init to handle case
16363 when all trailing elements of builder are same.
16364 This works as follows:
16365 (a) Use expand_insn interface to broadcast last vector element in TARGET.
16366 (b) Insert remaining elements in TARGET using insr.
16367
16368 ??? The heuristic used is to do the above if the number of identical
16369 trailing elements is at least 3/4 of the total number of elements,
 loosely based on
16370 heuristic from mostly_zeros_p. May need fine-tuning. */
16371
16372 static bool
16373 aarch64_sve_expand_vector_init_handle_trailing_same_elem
16374 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
16375 {
16376 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16377 if (ndups >= (3 * nelts_reqd) / 4)
16378 {
16379 aarch64_sve_expand_vector_init_insert_elems (target, builder,
16380 nelts_reqd - ndups + 1);
16381 return true;
16382 }
16383
16384 return false;
16385 }
16386
16387 /* Initialize register TARGET from BUILDER. NELTS is the constant number
16388 of elements in BUILDER.
16389
16390 The function tries to initialize TARGET from BUILDER if it fits one
16391 of the special cases outlined below.
16392
16393 Failing that, the function divides BUILDER into two sub-vectors:
16394 v_even = even elements of BUILDER;
16395 v_odd = odd elements of BUILDER;
16396
16397 and recursively calls itself with v_even and v_odd.
16398
16399 if (recursive call succeeded for v_even or v_odd)
16400 TARGET = zip (v_even, v_odd)
16401
16402 The function returns true if it managed to build TARGET from BUILDER
16403 with one of the special cases, false otherwise.
16404
16405 Example: {a, 1, b, 2, c, 3, d, 4}
16406
16407 The vector gets divided into:
16408 v_even = {a, b, c, d}
16409 v_odd = {1, 2, 3, 4}
16410
16411 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
16412 initializes tmp2 from the constant vector v_odd using emit_move_insn.
16413
16414 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
16415 4 elements, so we construct tmp1 from v_even using insr:
16416 tmp1 = dup(d)
16417 insr tmp1, c
16418 insr tmp1, b
16419 insr tmp1, a
16420
16421 And finally:
16422 TARGET = zip (tmp1, tmp2)
16423 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
16424
16425 static bool
16426 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
16427 int nelts, int nelts_reqd)
16428 {
16429 machine_mode mode = GET_MODE (target);
16430
16431 /* Case 1: Vector contains trailing constants. */
16432
16433 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16434 (target, builder, nelts, nelts_reqd))
16435 return true;
16436
16437 /* Case 2: Vector contains leading constants. */
16438
16439 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
16440 for (int i = 0; i < nelts_reqd; i++)
16441 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
16442 rev_builder.finalize ();
16443
16444 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16445 (target, rev_builder, nelts, nelts_reqd))
16446 {
16447 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16448 return true;
16449 }
16450
16451 /* Case 3: Vector contains trailing same element. */
16452
16453 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16454 (target, builder, nelts_reqd))
16455 return true;
16456
16457 /* Case 4: Vector contains leading same element. */
16458
16459 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16460 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
16461 {
16462 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16463 return true;
16464 }
16465
16466 /* Avoid recursing below 4-elements.
16467 ??? The threshold 4 may need fine-tuning. */
16468
16469 if (nelts_reqd <= 4)
16470 return false;
16471
16472 rtx_vector_builder v_even (mode, 1, nelts);
16473 rtx_vector_builder v_odd (mode, 1, nelts);
16474
16475 for (int i = 0; i < nelts * 2; i += 2)
16476 {
16477 v_even.quick_push (builder.elt (i));
16478 v_odd.quick_push (builder.elt (i + 1));
16479 }
16480
16481 v_even.finalize ();
16482 v_odd.finalize ();
16483
16484 rtx tmp1 = gen_reg_rtx (mode);
16485 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
16486 nelts, nelts_reqd / 2);
16487
16488 rtx tmp2 = gen_reg_rtx (mode);
16489 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
16490 nelts, nelts_reqd / 2);
16491
16492 if (!did_even_p && !did_odd_p)
16493 return false;
16494
16495 /* Initialize v_even and v_odd using INSR if it didn't match any of the
16496 special cases and zip v_even, v_odd. */
16497
16498 if (!did_even_p)
16499 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
16500
16501 if (!did_odd_p)
16502 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
16503
16504 rtvec v = gen_rtvec (2, tmp1, tmp2);
16505 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
16506 return true;
16507 }
16508
16509 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
16510
16511 void
16512 aarch64_sve_expand_vector_init (rtx target, rtx vals)
16513 {
16514 machine_mode mode = GET_MODE (target);
16515 int nelts = XVECLEN (vals, 0);
16516
16517 rtx_vector_builder v (mode, 1, nelts);
16518 for (int i = 0; i < nelts; i++)
16519 v.quick_push (XVECEXP (vals, 0, i));
16520 v.finalize ();
16521
16522 /* If neither sub-vector of v could be initialized specially,
16523 then use INSR to insert all elements from v into TARGET.
16524 ??? This might not be optimal for vectors with large
16525 initializers, such as those with 16 or more elements.
16526 For nelts < 4, it probably isn't useful to handle specially. */
16527
16528 if (nelts < 4
16529 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
16530 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
16531 }
16532
16533 /* Check whether VALUE is a vector constant in which every element
16534 is either a power of 2 or a negated power of 2. If so, return
16535 a constant vector of log2s, and flip CODE between PLUS and MINUS
16536 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
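/* For example, { 4, 4, 4, 4 } becomes { 2, 2, 2, 2 } with CODE unchanged,
   while { -8, -8, -8, -8 } becomes { 3, 3, 3, 3 } with CODE flipped
   between PLUS and MINUS. */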
16537
16538 static rtx
16539 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
16540 {
16541 if (GET_CODE (value) != CONST_VECTOR)
16542 return NULL_RTX;
16543
16544 rtx_vector_builder builder;
16545 if (!builder.new_unary_operation (GET_MODE (value), value, false))
16546 return NULL_RTX;
16547
16548 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
16549 /* 1 if the result of the multiplication must be negated,
16550 0 if it mustn't, or -1 if we don't yet care. */
16551 int negate = -1;
16552 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
16553 for (unsigned int i = 0; i < encoded_nelts; ++i)
16554 {
16555 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
16556 if (!CONST_SCALAR_INT_P (elt))
16557 return NULL_RTX;
16558 rtx_mode_t val (elt, int_mode);
16559 wide_int pow2 = wi::neg (val);
16560 if (val != pow2)
16561 {
16562 /* It matters whether we negate or not. Make that choice,
16563 and make sure that it's consistent with previous elements. */
16564 if (negate == !wi::neg_p (val))
16565 return NULL_RTX;
16566 negate = wi::neg_p (val);
16567 if (!negate)
16568 pow2 = val;
16569 }
16570 /* POW2 is now the value that we want to be a power of 2. */
16571 int shift = wi::exact_log2 (pow2);
16572 if (shift < 0)
16573 return NULL_RTX;
16574 builder.quick_push (gen_int_mode (shift, int_mode));
16575 }
16576 if (negate == -1)
16577 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
16578 code = PLUS;
16579 else if (negate == 1)
16580 code = code == PLUS ? MINUS : PLUS;
16581 return builder.build ();
16582 }
16583
16584 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
16585 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
16586 operands array, in the same order as for fma_optab. Return true if
16587 the function emitted all the necessary instructions, false if the caller
16588 should generate the pattern normally with the new OPERANDS array. */
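/* For example, A + B * 4 is rewritten as A + (B << 2), and A + B * -4
   as A - (B << 2). */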
16589
16590 bool
16591 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
16592 {
16593 machine_mode mode = GET_MODE (operands[0]);
16594 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
16595 {
16596 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
16597 NULL_RTX, true, OPTAB_DIRECT);
16598 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
16599 operands[3], product, operands[0], true,
16600 OPTAB_DIRECT);
16601 return true;
16602 }
16603 operands[2] = force_reg (mode, operands[2]);
16604 return false;
16605 }
16606
16607 /* Likewise, but for a conditional pattern. */
16608
16609 bool
16610 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
16611 {
16612 machine_mode mode = GET_MODE (operands[0]);
16613 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
16614 {
16615 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
16616 NULL_RTX, true, OPTAB_DIRECT);
16617 emit_insn (gen_cond (code, mode, operands[0], operands[1],
16618 operands[4], product, operands[5]));
16619 return true;
16620 }
16621 operands[3] = force_reg (mode, operands[3]);
16622 return false;
16623 }
16624
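/* Return the mask that is implicitly applied to a shift count in MODE,
 or zero if shift counts are not known to be truncated. */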
16625 static unsigned HOST_WIDE_INT
16626 aarch64_shift_truncation_mask (machine_mode mode)
16627 {
16628 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
16629 return 0;
16630 return GET_MODE_UNIT_BITSIZE (mode) - 1;
16631 }
16632
16633 /* Select a format to encode pointers in exception handling data. */
16634 int
16635 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
16636 {
16637 int type;
16638 switch (aarch64_cmodel)
16639 {
16640 case AARCH64_CMODEL_TINY:
16641 case AARCH64_CMODEL_TINY_PIC:
16642 case AARCH64_CMODEL_SMALL:
16643 case AARCH64_CMODEL_SMALL_PIC:
16644 case AARCH64_CMODEL_SMALL_SPIC:
16645 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
16646 for everything. */
16647 type = DW_EH_PE_sdata4;
16648 break;
16649 default:
16650 /* No assumptions here. 8-byte relocs required. */
16651 type = DW_EH_PE_sdata8;
16652 break;
16653 }
16654 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
16655 }
16656
16657 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
16658
16659 static void
16660 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
16661 {
16662 if (aarch64_simd_decl_p (decl))
16663 {
16664 fprintf (stream, "\t.variant_pcs\t");
16665 assemble_name (stream, name);
16666 fprintf (stream, "\n");
16667 }
16668 }
16669
16670 /* The last .arch and .tune assembly strings that we printed. */
16671 static std::string aarch64_last_printed_arch_string;
16672 static std::string aarch64_last_printed_tune_string;
16673
16674 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
16675 by the function fndecl. */
16676
16677 void
16678 aarch64_declare_function_name (FILE *stream, const char* name,
16679 tree fndecl)
16680 {
16681 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
16682
16683 struct cl_target_option *targ_options;
16684 if (target_parts)
16685 targ_options = TREE_TARGET_OPTION (target_parts);
16686 else
16687 targ_options = TREE_TARGET_OPTION (target_option_current_node);
16688 gcc_assert (targ_options);
16689
16690 const struct processor *this_arch
16691 = aarch64_get_arch (targ_options->x_explicit_arch);
16692
16693 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
16694 std::string extension
16695 = aarch64_get_extension_string_for_isa_flags (isa_flags,
16696 this_arch->flags);
16697 /* Only update the assembler .arch string if it is distinct from the last
16698 such string we printed. */
16699 std::string to_print = this_arch->name + extension;
16700 if (to_print != aarch64_last_printed_arch_string)
16701 {
16702 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
16703 aarch64_last_printed_arch_string = to_print;
16704 }
16705
16706 /* Print the cpu name we're tuning for in the comments; it might be
16707 useful to readers of the generated asm. Do it only when it changes
16708 from function to function and verbose assembly is requested. */
16709 const struct processor *this_tune
16710 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
16711
16712 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
16713 {
16714 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
16715 this_tune->name);
16716 aarch64_last_printed_tune_string = this_tune->name;
16717 }
16718
16719 aarch64_asm_output_variant_pcs (stream, fndecl, name);
16720
16721 /* Don't forget the type directive for ELF. */
16722 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
16723 ASM_OUTPUT_LABEL (stream, name);
16724 }
16725
16726 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
16727
16728 void
16729 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
16730 {
16731 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
16732 const char *value = IDENTIFIER_POINTER (target);
16733 aarch64_asm_output_variant_pcs (stream, decl, name);
16734 ASM_OUTPUT_DEF (stream, name, value);
16735 }
16736
16737 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
16738 function symbol references. */
16739
16740 void
16741 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
16742 {
16743 default_elf_asm_output_external (stream, decl, name);
16744 aarch64_asm_output_variant_pcs (stream, decl, name);
16745 }
16746
16747 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
16748 Used to output the .cfi_b_key_frame directive when signing the current
16749 function with the B key. */
16750
16751 void
16752 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
16753 {
16754 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
16755 && aarch64_ra_sign_key == AARCH64_KEY_B)
16756 asm_fprintf (f, "\t.cfi_b_key_frame\n");
16757 }
16758
16759 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
16760
16761 static void
16762 aarch64_start_file (void)
16763 {
16764 struct cl_target_option *default_options
16765 = TREE_TARGET_OPTION (target_option_default_node);
16766
16767 const struct processor *default_arch
16768 = aarch64_get_arch (default_options->x_explicit_arch);
16769 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
16770 std::string extension
16771 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
16772 default_arch->flags);
16773
16774 aarch64_last_printed_arch_string = default_arch->name + extension;
16775 aarch64_last_printed_tune_string = "";
16776 asm_fprintf (asm_out_file, "\t.arch %s\n",
16777 aarch64_last_printed_arch_string.c_str ());
16778
16779 default_file_start ();
16780 }
16781
16782 /* Emit load exclusive. */
16783
16784 static void
16785 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
16786 rtx mem, rtx model_rtx)
16787 {
16788 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
16789 }
16790
16791 /* Emit store exclusive. */
16792
16793 static void
16794 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
16795 rtx rval, rtx mem, rtx model_rtx)
16796 {
16797 emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
16798 }
16799
16800 /* Mark the previous jump instruction as unlikely. */
16801
16802 static void
16803 aarch64_emit_unlikely_jump (rtx insn)
16804 {
16805 rtx_insn *jump = emit_jump_insn (insn);
16806 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
16807 }
16808
16809 /* Expand a compare and swap pattern. */
16810
16811 void
16812 aarch64_expand_compare_and_swap (rtx operands[])
16813 {
16814 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
16815 machine_mode mode, r_mode;
16816
16817 bval = operands[0];
16818 rval = operands[1];
16819 mem = operands[2];
16820 oldval = operands[3];
16821 newval = operands[4];
16822 is_weak = operands[5];
16823 mod_s = operands[6];
16824 mod_f = operands[7];
16825 mode = GET_MODE (mem);
16826
16827 /* Normally the succ memory model must be stronger than fail, but in the
16828 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
16829 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
16830 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
16831 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
16832 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
16833
16834 r_mode = mode;
16835 if (mode == QImode || mode == HImode)
16836 {
16837 r_mode = SImode;
16838 rval = gen_reg_rtx (r_mode);
16839 }
16840
16841 if (TARGET_LSE)
16842 {
16843 /* The CAS insn requires oldval and rval overlap, but we need to
16844 have a copy of oldval saved across the operation to tell if
16845 the operation is successful. */
16846 if (reg_overlap_mentioned_p (rval, oldval))
16847 rval = copy_to_mode_reg (r_mode, oldval);
16848 else
16849 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
16850
16851 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
16852 newval, mod_s));
16853 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16854 }
16855 else
16856 {
16857 /* The oldval predicate varies by mode. Test it and force to reg. */
16858 insn_code code = code_for_aarch64_compare_and_swap (mode);
16859 if (!insn_data[code].operand[2].predicate (oldval, mode))
16860 oldval = force_reg (mode, oldval);
16861
16862 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
16863 is_weak, mod_s, mod_f));
16864 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
16865 }
16866
16867 if (r_mode != mode)
16868 rval = gen_lowpart (mode, rval);
16869 emit_move_insn (operands[1], rval);
16870
16871 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
16872 emit_insn (gen_rtx_SET (bval, x));
16873 }
16874
16875 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
16876 sequence implementing an atomic operation. */
16877
16878 static void
16879 aarch64_emit_post_barrier (enum memmodel model)
16880 {
16881 const enum memmodel base_model = memmodel_base (model);
16882
16883 if (is_mm_sync (model)
16884 && (base_model == MEMMODEL_ACQUIRE
16885 || base_model == MEMMODEL_ACQ_REL
16886 || base_model == MEMMODEL_SEQ_CST))
16887 {
16888 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
16889 }
16890 }
16891
16892 /* Split a compare and swap pattern. */
16893
16894 void
16895 aarch64_split_compare_and_swap (rtx operands[])
16896 {
16897 rtx rval, mem, oldval, newval, scratch;
16898 machine_mode mode;
16899 bool is_weak;
16900 rtx_code_label *label1, *label2;
16901 rtx x, cond;
16902 enum memmodel model;
16903 rtx model_rtx;
16904
16905 rval = operands[0];
16906 mem = operands[1];
16907 oldval = operands[2];
16908 newval = operands[3];
16909 is_weak = (operands[4] != const0_rtx);
16910 model_rtx = operands[5];
16911 scratch = operands[7];
16912 mode = GET_MODE (mem);
16913 model = memmodel_from_int (INTVAL (model_rtx));
16914
16915 /* When OLDVAL is zero and we want the strong version we can emit a tighter
16916 loop:
16917 .label1:
16918 LD[A]XR rval, [mem]
16919 CBNZ rval, .label2
16920 ST[L]XR scratch, newval, [mem]
16921 CBNZ scratch, .label1
16922 .label2:
16923 CMP rval, 0. */
16924 bool strong_zero_p = !is_weak && oldval == const0_rtx;
16925
16926 label1 = NULL;
16927 if (!is_weak)
16928 {
16929 label1 = gen_label_rtx ();
16930 emit_label (label1);
16931 }
16932 label2 = gen_label_rtx ();
16933
16934 /* The initial load can be relaxed for a __sync operation since a final
16935 barrier will be emitted to stop code hoisting. */
16936 if (is_mm_sync (model))
16937 aarch64_emit_load_exclusive (mode, rval, mem,
16938 GEN_INT (MEMMODEL_RELAXED));
16939 else
16940 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
16941
16942 if (strong_zero_p)
16943 {
16944 if (aarch64_track_speculation)
16945 {
16946 /* Emit an explicit compare instruction, so that we can correctly
16947 track the condition codes. */
16948 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
16949 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16950 }
16951 else
16952 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
16953
16954 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16955 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16956 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16957 }
16958 else
16959 {
16960 cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16961 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16962 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16963 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16964 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16965 }
16966
16967 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
16968
16969 if (!is_weak)
16970 {
16971 if (aarch64_track_speculation)
16972 {
16973 /* Emit an explicit compare instruction, so that we can correctly
16974 track the condition codes. */
16975 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
16976 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16977 }
16978 else
16979 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
16980
16981 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16982 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
16983 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16984 }
16985 else
16986 {
16987 cond = gen_rtx_REG (CCmode, CC_REGNUM);
16988 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
16989 emit_insn (gen_rtx_SET (cond, x));
16990 }
16991
16992 emit_label (label2);
16993 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
16994 to set the condition flags. If this is not used it will be removed by
16995 later passes. */
16996 if (strong_zero_p)
16997 {
16998 cond = gen_rtx_REG (CCmode, CC_REGNUM);
16999 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
17000 emit_insn (gen_rtx_SET (cond, x));
17001 }
17002 /* Emit any final barrier needed for a __sync operation. */
17003 if (is_mm_sync (model))
17004 aarch64_emit_post_barrier (model);
17005 }
17006
17007 /* Split an atomic operation. */
17008
17009 void
17010 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
17011 rtx value, rtx model_rtx, rtx cond)
17012 {
17013 machine_mode mode = GET_MODE (mem);
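 /* Narrow modes are widened to SImode so that the arithmetic below is
 done on a full W register. */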
17014 machine_mode wmode = (mode == DImode ? DImode : SImode);
17015 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
17016 const bool is_sync = is_mm_sync (model);
17017 rtx_code_label *label;
17018 rtx x;
17019
17020 /* Split the atomic operation into a sequence. */
17021 label = gen_label_rtx ();
17022 emit_label (label);
17023
17024 if (new_out)
17025 new_out = gen_lowpart (wmode, new_out);
17026 if (old_out)
17027 old_out = gen_lowpart (wmode, old_out);
17028 else
17029 old_out = new_out;
17030 value = simplify_gen_subreg (wmode, value, mode, 0);
17031
17032 /* The initial load can be relaxed for a __sync operation since a final
17033 barrier will be emitted to stop code hoisting. */
17034 if (is_sync)
17035 aarch64_emit_load_exclusive (mode, old_out, mem,
17036 GEN_INT (MEMMODEL_RELAXED));
17037 else
17038 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
17039
17040 switch (code)
17041 {
17042 case SET:
17043 new_out = value;
17044 break;
17045
17046 case NOT:
17047 x = gen_rtx_AND (wmode, old_out, value);
17048 emit_insn (gen_rtx_SET (new_out, x));
17049 x = gen_rtx_NOT (wmode, new_out);
17050 emit_insn (gen_rtx_SET (new_out, x));
17051 break;
17052
17053 case MINUS:
17054 if (CONST_INT_P (value))
17055 {
17056 value = GEN_INT (-INTVAL (value));
17057 code = PLUS;
17058 }
17059 /* Fall through. */
17060
17061 default:
17062 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
17063 emit_insn (gen_rtx_SET (new_out, x));
17064 break;
17065 }
17066
17067 aarch64_emit_store_exclusive (mode, cond, mem,
17068 gen_lowpart (mode, new_out), model_rtx);
17069
17070 if (aarch64_track_speculation)
17071 {
17072 /* Emit an explicit compare instruction, so that we can correctly
17073 track the condition codes. */
17074 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
17075 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
17076 }
17077 else
17078 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
17079
17080 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17081 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
17082 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17083
17084 /* Emit any final barrier needed for a __sync operation. */
17085 if (is_sync)
17086 aarch64_emit_post_barrier (model);
17087 }
17088
17089 static void
17090 aarch64_init_libfuncs (void)
17091 {
17092 /* Half-precision float operations. The compiler handles all operations
17093 with NULL libfuncs by converting to SFmode. */
17094
17095 /* Conversions. */
17096 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
17097 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
17098
17099 /* Arithmetic. */
17100 set_optab_libfunc (add_optab, HFmode, NULL);
17101 set_optab_libfunc (sdiv_optab, HFmode, NULL);
17102 set_optab_libfunc (smul_optab, HFmode, NULL);
17103 set_optab_libfunc (neg_optab, HFmode, NULL);
17104 set_optab_libfunc (sub_optab, HFmode, NULL);
17105
17106 /* Comparisons. */
17107 set_optab_libfunc (eq_optab, HFmode, NULL);
17108 set_optab_libfunc (ne_optab, HFmode, NULL);
17109 set_optab_libfunc (lt_optab, HFmode, NULL);
17110 set_optab_libfunc (le_optab, HFmode, NULL);
17111 set_optab_libfunc (ge_optab, HFmode, NULL);
17112 set_optab_libfunc (gt_optab, HFmode, NULL);
17113 set_optab_libfunc (unord_optab, HFmode, NULL);
17114 }
17115
17116 /* Target hook for c_mode_for_suffix. */
17117 static machine_mode
17118 aarch64_c_mode_for_suffix (char suffix)
17119 {
17120 if (suffix == 'q')
17121 return TFmode;
17122
17123 return VOIDmode;
17124 }
17125
17126 /* We can only represent floating point constants which will fit in
17127 "quarter-precision" values. These values are characterised by
17128 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
17129 by:
17130
17131 (-1)^s * (n/16) * 2^r
17132
17133 Where:
17134 's' is the sign bit.
17135 'n' is an integer in the range 16 <= n <= 31.
17136 'r' is an integer in the range -3 <= r <= 4. */
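/* For example, 1.0 (16/16 * 2^0), 0.125 (16/16 * 2^-3) and 31.0
   (31/16 * 2^4) are representable, but 0.0 and 1/3 are not. */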
17137
17138 /* Return true iff X can be represented by a quarter-precision
17139 floating point immediate operand. Note, we cannot represent 0.0. */
17140 bool
17141 aarch64_float_const_representable_p (rtx x)
17142 {
17143 /* This represents our current view of how many bits
17144 make up the mantissa. */
17145 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
17146 int exponent;
17147 unsigned HOST_WIDE_INT mantissa, mask;
17148 REAL_VALUE_TYPE r, m;
17149 bool fail;
17150
17151 x = unwrap_const_vec_duplicate (x);
17152 if (!CONST_DOUBLE_P (x))
17153 return false;
17154
17155 if (GET_MODE (x) == VOIDmode
17156 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
17157 return false;
17158
17159 r = *CONST_DOUBLE_REAL_VALUE (x);
17160
17161 /* We cannot represent infinities, NaNs or +/-zero. We won't
17162 know if we have +zero until we analyse the mantissa, but we
17163 can reject the other invalid values. */
17164 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
17165 || REAL_VALUE_MINUS_ZERO (r))
17166 return false;
17167
17168 /* Extract exponent. */
17169 r = real_value_abs (&r);
17170 exponent = REAL_EXP (&r);
17171
17172 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
17173 highest (sign) bit, with a fixed binary point at bit point_pos.
17174 The low element of W holds the low part of the mantissa, the high
 element the high part.
17175 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
17176 bits for the mantissa, this can fail (low bits will be lost). */
17177 real_ldexp (&m, &r, point_pos - exponent);
17178 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
17179
17180 /* If the low part of the mantissa has bits set we cannot represent
17181 the value. */
17182 if (w.ulow () != 0)
17183 return false;
17184 /* We have rejected the lower HOST_WIDE_INT, so update our
17185 understanding of how many bits lie in the mantissa and
17186 look only at the high HOST_WIDE_INT. */
17187 mantissa = w.elt (1);
17188 point_pos -= HOST_BITS_PER_WIDE_INT;
17189
17190 /* We can only represent values with a mantissa of the form 1.xxxx. */
17191 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
17192 if ((mantissa & mask) != 0)
17193 return false;
17194
17195 /* Having filtered unrepresentable values, we may now remove all
17196 but the highest 5 bits. */
17197 mantissa >>= point_pos - 5;
17198
17199 /* We cannot represent the value 0.0, so reject it. This is handled
17200 elsewhere. */
17201 if (mantissa == 0)
17202 return false;
17203
17204 /* Then, as bit 4 is always set, we can mask it off, leaving
17205 the mantissa in the range [0, 15]. */
17206 mantissa &= ~(1 << 4);
17207 gcc_assert (mantissa <= 15);
17208
17209 /* GCC internally does not use IEEE754-like encoding (where normalized
17210 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
17211 Our mantissa values are shifted 4 places to the left relative to
17212 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
17213 by 5 places to correct for GCC's representation. */
17214 exponent = 5 - exponent;
17215
17216 return (exponent >= 0 && exponent <= 7);
17217 }
17218
17219 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
17220 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
17221 output MOVI/MVNI, ORR or BIC immediate. */
17222 char*
17223 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
17224 enum simd_immediate_check which)
17225 {
17226 bool is_valid;
17227 static char templ[40];
17228 const char *mnemonic;
17229 const char *shift_op;
17230 unsigned int lane_count = 0;
17231 char element_char;
17232
17233 struct simd_immediate_info info;
17234
17235 /* This will return true to show const_vector is legal for use as either
17236 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
17237 It will also update INFO to show how the immediate should be generated.
17238 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
17239 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
17240 gcc_assert (is_valid);
17241
17242 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17243 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
17244
17245 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17246 {
17247 gcc_assert (info.insn == simd_immediate_info::MOV
17248 && info.u.mov.shift == 0);
17249 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
17250 move immediate path. */
17251 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17252 info.u.mov.value = GEN_INT (0);
17253 else
17254 {
17255 const unsigned int buf_size = 20;
17256 char float_buf[buf_size] = {'\0'};
17257 real_to_decimal_for_mode (float_buf,
17258 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17259 buf_size, buf_size, 1, info.elt_mode);
17260
17261 if (lane_count == 1)
17262 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
17263 else
17264 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
17265 lane_count, element_char, float_buf);
17266 return templ;
17267 }
17268 }
17269
17270 gcc_assert (CONST_INT_P (info.u.mov.value));
17271
17272 if (which == AARCH64_CHECK_MOV)
17273 {
17274 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
17275 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
17276 ? "msl" : "lsl");
17277 if (lane_count == 1)
17278 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
17279 mnemonic, UINTVAL (info.u.mov.value));
17280 else if (info.u.mov.shift)
17281 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17282 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
17283 element_char, UINTVAL (info.u.mov.value), shift_op,
17284 info.u.mov.shift);
17285 else
17286 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17287 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
17288 element_char, UINTVAL (info.u.mov.value));
17289 }
17290 else
17291 {
17292 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
17293 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
17294 if (info.u.mov.shift)
17295 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17296 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
17297 element_char, UINTVAL (info.u.mov.value), "lsl",
17298 info.u.mov.shift);
17299 else
17300 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17301 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
17302 element_char, UINTVAL (info.u.mov.value));
17303 }
17304 return templ;
17305 }
17306
17307 char*
17308 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
17309 {
17310
17311 /* If a floating point number was passed and we desire to use it in an
17312 integer mode, do the conversion to integer. */
17313 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
17314 {
17315 unsigned HOST_WIDE_INT ival;
17316 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
17317 gcc_unreachable ();
17318 immediate = gen_int_mode (ival, mode);
17319 }
17320
17321 machine_mode vmode;
17322 /* Use a 64-bit vector mode for everything except DI/DF mode, where we use
17323 a 128-bit vector mode. */
17324 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
17325
17326 vmode = aarch64_simd_container_mode (mode, width);
17327 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
17328 return aarch64_output_simd_mov_immediate (v_op, width);
17329 }
17330
17331 /* Return the output string to use for moving immediate CONST_VECTOR
17332 into an SVE register. */
17333
17334 char *
17335 aarch64_output_sve_mov_immediate (rtx const_vector)
17336 {
17337 static char templ[40];
17338 struct simd_immediate_info info;
17339 char element_char;
17340
17341 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
17342 gcc_assert (is_valid);
17343
17344 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17345
17346 machine_mode vec_mode = GET_MODE (const_vector);
17347 if (aarch64_sve_pred_mode_p (vec_mode))
17348 {
17349 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
17350 if (info.insn == simd_immediate_info::MOV)
17351 {
17352 gcc_assert (info.u.mov.value == const0_rtx);
17353 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
17354 }
17355 else
17356 {
17357 gcc_assert (info.insn == simd_immediate_info::PTRUE);
17358 unsigned int total_bytes;
17359 if (info.u.pattern == AARCH64_SV_ALL
17360 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
17361 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
17362 total_bytes / GET_MODE_SIZE (info.elt_mode));
17363 else
17364 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
17365 svpattern_token (info.u.pattern));
17366 }
17367 return buf;
17368 }
17369
17370 if (info.insn == simd_immediate_info::INDEX)
17371 {
17372 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
17373 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
17374 element_char, INTVAL (info.u.index.base),
17375 INTVAL (info.u.index.step));
17376 return templ;
17377 }
17378
17379 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17380 {
17381 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17382 info.u.mov.value = GEN_INT (0);
17383 else
17384 {
17385 const int buf_size = 20;
17386 char float_buf[buf_size] = {};
17387 real_to_decimal_for_mode (float_buf,
17388 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17389 buf_size, buf_size, 1, info.elt_mode);
17390
17391 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
17392 element_char, float_buf);
17393 return templ;
17394 }
17395 }
17396
17397 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
17398 element_char, INTVAL (info.u.mov.value));
17399 return templ;
17400 }
17401
17402 /* Split operands into moves from op[1] + op[2] into op[0]. */
17403
17404 void
17405 aarch64_split_combinev16qi (rtx operands[3])
17406 {
17407 unsigned int dest = REGNO (operands[0]);
17408 unsigned int src1 = REGNO (operands[1]);
17409 unsigned int src2 = REGNO (operands[2]);
17410 machine_mode halfmode = GET_MODE (operands[1]);
17411 unsigned int halfregs = REG_NREGS (operands[1]);
17412 rtx destlo, desthi;
17413
17414 gcc_assert (halfmode == V16QImode);
17415
17416 if (src1 == dest && src2 == dest + halfregs)
17417 {
17418 /* No-op move. Can't split to nothing; emit something. */
17419 emit_note (NOTE_INSN_DELETED);
17420 return;
17421 }
17422
17423 /* Preserve register attributes for variable tracking. */
17424 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
17425 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
17426 GET_MODE_SIZE (halfmode));
17427
17428 /* Special case of reversed high/low parts. */
17429 if (reg_overlap_mentioned_p (operands[2], destlo)
17430 && reg_overlap_mentioned_p (operands[1], desthi))
17431 {
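 /* The three XORs below swap operands[1] and operands[2] in place,
 avoiding the need for a scratch register. */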
17432 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17433 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
17434 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17435 }
17436 else if (!reg_overlap_mentioned_p (operands[2], destlo))
17437 {
17438 /* Try to avoid unnecessary moves if part of the result
17439 is in the right place already. */
17440 if (src1 != dest)
17441 emit_move_insn (destlo, operands[1]);
17442 if (src2 != dest + halfregs)
17443 emit_move_insn (desthi, operands[2]);
17444 }
17445 else
17446 {
17447 if (src2 != dest + halfregs)
17448 emit_move_insn (desthi, operands[2]);
17449 if (src1 != dest)
17450 emit_move_insn (destlo, operands[1]);
17451 }
17452 }
17453
17454 /* vec_perm support. */
17455
17456 struct expand_vec_perm_d
17457 {
17458 rtx target, op0, op1;
17459 vec_perm_indices perm;
17460 machine_mode vmode;
17461 unsigned int vec_flags;
17462 bool one_vector_p;
17463 bool testing_p;
17464 };
17465
17466 /* Generate a variable permutation. */
17467
17468 static void
17469 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
17470 {
17471 machine_mode vmode = GET_MODE (target);
17472 bool one_vector_p = rtx_equal_p (op0, op1);
17473
17474 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
17475 gcc_checking_assert (GET_MODE (op0) == vmode);
17476 gcc_checking_assert (GET_MODE (op1) == vmode);
17477 gcc_checking_assert (GET_MODE (sel) == vmode);
17478 gcc_checking_assert (TARGET_SIMD);
17479
17480 if (one_vector_p)
17481 {
17482 if (vmode == V8QImode)
17483 {
17484 /* Expand the argument to a V16QI mode by duplicating it. */
17485 rtx pair = gen_reg_rtx (V16QImode);
17486 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
17487 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17488 }
17489 else
17490 {
17491 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
17492 }
17493 }
17494 else
17495 {
17496 rtx pair;
17497
17498 if (vmode == V8QImode)
17499 {
17500 pair = gen_reg_rtx (V16QImode);
17501 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
17502 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17503 }
17504 else
17505 {
17506 pair = gen_reg_rtx (OImode);
17507 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
17508 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
17509 }
17510 }
17511 }
17512
17513 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
17514 NELT is the number of elements in the vector. */
17515
17516 void
17517 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
17518 unsigned int nelt)
17519 {
17520 machine_mode vmode = GET_MODE (target);
17521 bool one_vector_p = rtx_equal_p (op0, op1);
17522 rtx mask;
17523
17524 /* The TBL instruction does not use a modulo index, so we must take care
17525 of that ourselves. */
17526 mask = aarch64_simd_gen_const_vector_dup (vmode,
17527 one_vector_p ? nelt - 1 : 2 * nelt - 1);
17528 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
17529
17530 /* For big-endian, we also need to reverse the index within the vector
17531 (but not which vector). */
17532 if (BYTES_BIG_ENDIAN)
17533 {
17534 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
17535 if (!one_vector_p)
17536 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
17537 sel = expand_simple_binop (vmode, XOR, sel, mask,
17538 NULL, 0, OPTAB_LIB_WIDEN);
17539 }
17540 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
17541 }
17542
17543 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
17544
17545 static void
17546 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
17547 {
17548 emit_insn (gen_rtx_SET (target,
17549 gen_rtx_UNSPEC (GET_MODE (target),
17550 gen_rtvec (2, op0, op1), code)));
17551 }
17552
17553 /* Expand an SVE vec_perm with the given operands. */
17554
17555 void
17556 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
17557 {
17558 machine_mode data_mode = GET_MODE (target);
17559 machine_mode sel_mode = GET_MODE (sel);
17560 /* Enforced by the pattern condition. */
17561 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
17562
17563 /* Note: vec_perm indices are supposed to wrap when they go beyond the
17564 size of the two value vectors, i.e. the upper bits of the indices
17565 are effectively ignored. SVE TBL instead produces 0 for any
17566 out-of-range indices, so we need to modulo all the vec_perm indices
17567 to ensure they are all in range. */
17568 rtx sel_reg = force_reg (sel_mode, sel);
17569
17570 /* Check if the sel only references the first values vector. */
17571 if (GET_CODE (sel) == CONST_VECTOR
17572 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
17573 {
17574 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
17575 return;
17576 }
17577
17578 /* Check if the two values vectors are the same. */
17579 if (rtx_equal_p (op0, op1))
17580 {
17581 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
17582 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17583 NULL, 0, OPTAB_DIRECT);
17584 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
17585 return;
17586 }
17587
17588 /* Run a TBL on each value vector and combine the results. */
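/* For example, with four elements per vector and a selector of
   { 1, 4, 6, 3 }, the first TBL gives { op0[1], 0, 0, op0[3] } and the
   second TBL, indexed by the selector minus 4 (out-of-range values
   select 0), gives { 0, op1[0], op1[2], 0 }; ORing the two produces
   the required { op0[1], op1[0], op1[2], op0[3] }.  */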
17589
17590 rtx res0 = gen_reg_rtx (data_mode);
17591 rtx res1 = gen_reg_rtx (data_mode);
17592 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
17593 if (GET_CODE (sel) != CONST_VECTOR
17594 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
17595 {
17596 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
17597 2 * nunits - 1);
17598 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17599 NULL, 0, OPTAB_DIRECT);
17600 }
17601 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
17602 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
17603 NULL, 0, OPTAB_DIRECT);
17604 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
17605 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
17606 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
17607 else
17608 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
17609 }
17610
17611 /* Recognize patterns suitable for the TRN instructions. */
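/* For example, with V4SImode inputs (indices 4-7 referring to the second
   input), the permute { 0, 4, 2, 6 } maps to TRN1 and { 1, 5, 3, 7 }
   maps to TRN2.  */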
17612 static bool
17613 aarch64_evpc_trn (struct expand_vec_perm_d *d)
17614 {
17615 HOST_WIDE_INT odd;
17616 poly_uint64 nelt = d->perm.length ();
17617 rtx out, in0, in1, x;
17618 machine_mode vmode = d->vmode;
17619
17620 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17621 return false;
17622
17623 /* Note that these are little-endian tests.
17624 We correct for big-endian later. */
17625 if (!d->perm[0].is_constant (&odd)
17626 || (odd != 0 && odd != 1)
17627 || !d->perm.series_p (0, 2, odd, 2)
17628 || !d->perm.series_p (1, 2, nelt + odd, 2))
17629 return false;
17630
17631 /* Success! */
17632 if (d->testing_p)
17633 return true;
17634
17635 in0 = d->op0;
17636 in1 = d->op1;
17637 /* We don't need a big-endian lane correction for SVE; see the comment
17638 at the head of aarch64-sve.md for details. */
17639 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17640 {
17641 x = in0, in0 = in1, in1 = x;
17642 odd = !odd;
17643 }
17644 out = d->target;
17645
17646 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17647 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
17648 return true;
17649 }
17650
17651 /* Recognize patterns suitable for the UZP instructions. */
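/* For example, with V4SImode inputs the permute { 0, 2, 4, 6 } maps to
   UZP1 and { 1, 3, 5, 7 } maps to UZP2.  */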
17652 static bool
17653 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
17654 {
17655 HOST_WIDE_INT odd;
17656 rtx out, in0, in1, x;
17657 machine_mode vmode = d->vmode;
17658
17659 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17660 return false;
17661
17662 /* Note that these are little-endian tests.
17663 We correct for big-endian later. */
17664 if (!d->perm[0].is_constant (&odd)
17665 || (odd != 0 && odd != 1)
17666 || !d->perm.series_p (0, 1, odd, 2))
17667 return false;
17668
17669 /* Success! */
17670 if (d->testing_p)
17671 return true;
17672
17673 in0 = d->op0;
17674 in1 = d->op1;
17675 /* We don't need a big-endian lane correction for SVE; see the comment
17676 at the head of aarch64-sve.md for details. */
17677 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17678 {
17679 x = in0, in0 = in1, in1 = x;
17680 odd = !odd;
17681 }
17682 out = d->target;
17683
17684 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17685 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
17686 return true;
17687 }
17688
17689 /* Recognize patterns suitable for the ZIP instructions. */
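/* For example, with V4SImode inputs the permute { 0, 4, 1, 5 } maps to
   ZIP1 and { 2, 6, 3, 7 } maps to ZIP2.  */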
17690 static bool
17691 aarch64_evpc_zip (struct expand_vec_perm_d *d)
17692 {
17693 unsigned int high;
17694 poly_uint64 nelt = d->perm.length ();
17695 rtx out, in0, in1, x;
17696 machine_mode vmode = d->vmode;
17697
17698 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17699 return false;
17700
17701 /* Note that these are little-endian tests.
17702 We correct for big-endian later. */
17703 poly_uint64 first = d->perm[0];
17704 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
17705 || !d->perm.series_p (0, 2, first, 1)
17706 || !d->perm.series_p (1, 2, first + nelt, 1))
17707 return false;
17708 high = maybe_ne (first, 0U);
17709
17710 /* Success! */
17711 if (d->testing_p)
17712 return true;
17713
17714 in0 = d->op0;
17715 in1 = d->op1;
17716 /* We don't need a big-endian lane correction for SVE; see the comment
17717 at the head of aarch64-sve.md for details. */
17718 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17719 {
17720 x = in0, in0 = in1, in1 = x;
17721 high = !high;
17722 }
17723 out = d->target;
17724
17725 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17726 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
17727 return true;
17728 }
17729
17730 /* Recognize patterns for the EXT insn. */
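/* For example, with V4SImode inputs the permute { 1, 2, 3, 4 } is a
   single EXT of the two inputs with an element offset of 1 (a byte
   offset of 4).  */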
17731
17732 static bool
17733 aarch64_evpc_ext (struct expand_vec_perm_d *d)
17734 {
17735 HOST_WIDE_INT location;
17736 rtx offset;
17737
17738 /* The first element always refers to the first vector.
17739 Check if the extracted indices are increasing by one. */
17740 if (d->vec_flags == VEC_SVE_PRED
17741 || !d->perm[0].is_constant (&location)
17742 || !d->perm.series_p (0, 1, location, 1))
17743 return false;
17744
17745 /* Success! */
17746 if (d->testing_p)
17747 return true;
17748
17749 /* The case where (location == 0) is a no-op for both big- and little-endian,
17750 and is removed by the mid-end at optimization levels -O1 and higher.
17751
17752 We don't need a big-endian lane correction for SVE; see the comment
17753 at the head of aarch64-sve.md for details. */
17754 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
17755 {
17756 /* After setup, we want the high elements of the first vector (stored
17757 at the LSB end of the register), and the low elements of the second
17758 vector (stored at the MSB end of the register). So swap. */
17759 std::swap (d->op0, d->op1);
17760 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
17761 to_constant () is safe since this is restricted to Advanced SIMD
17762 vectors. */
17763 location = d->perm.length ().to_constant () - location;
17764 }
17765
17766 offset = GEN_INT (location);
17767 emit_set_insn (d->target,
17768 gen_rtx_UNSPEC (d->vmode,
17769 gen_rtvec (3, d->op0, d->op1, offset),
17770 UNSPEC_EXT));
17771 return true;
17772 }
17773
17774 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
17775 within each 64-bit, 32-bit or 16-bit granule. */
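/* For example, a V8HImode permute of { 3, 2, 1, 0, 7, 6, 5, 4 } reverses
   the 16-bit elements within each 64-bit granule and so maps to REV64.  */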
17776
17777 static bool
17778 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
17779 {
17780 HOST_WIDE_INT diff;
17781 unsigned int i, size, unspec;
17782 machine_mode pred_mode;
17783
17784 if (d->vec_flags == VEC_SVE_PRED
17785 || !d->one_vector_p
17786 || !d->perm[0].is_constant (&diff))
17787 return false;
17788
17789 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
17790 if (size == 8)
17791 {
17792 unspec = UNSPEC_REV64;
17793 pred_mode = VNx2BImode;
17794 }
17795 else if (size == 4)
17796 {
17797 unspec = UNSPEC_REV32;
17798 pred_mode = VNx4BImode;
17799 }
17800 else if (size == 2)
17801 {
17802 unspec = UNSPEC_REV16;
17803 pred_mode = VNx8BImode;
17804 }
17805 else
17806 return false;
17807
17808 unsigned int step = diff + 1;
17809 for (i = 0; i < step; ++i)
17810 if (!d->perm.series_p (i, step, diff - i, step))
17811 return false;
17812
17813 /* Success! */
17814 if (d->testing_p)
17815 return true;
17816
17817 if (d->vec_flags == VEC_SVE_DATA)
17818 {
17819 machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
17820 rtx target = gen_reg_rtx (int_mode);
17821 if (BYTES_BIG_ENDIAN)
17822 /* The act of taking a subreg between INT_MODE and d->vmode
17823 is itself a reversing operation on big-endian targets;
17824 see the comment at the head of aarch64-sve.md for details.
17825 First reinterpret OP0 as INT_MODE without using a subreg
17826 and without changing the contents. */
17827 emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
17828 else
17829 {
17830 /* For SVE we use REV[BHW] unspecs derived from the element size
17831 of d->vmode and vector modes whose elements have SIZE bytes.
17832 This ensures that the vector modes match the predicate modes. */
17833 int unspec = aarch64_sve_rev_unspec (d->vmode);
17834 rtx pred = aarch64_ptrue_reg (pred_mode);
17835 emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
17836 gen_lowpart (int_mode, d->op0)));
17837 }
17838 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
17839 return true;
17840 }
17841 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
17842 emit_set_insn (d->target, src);
17843 return true;
17844 }
17845
17846 /* Recognize patterns for the REV insn, which reverses elements within
17847 a full vector. */
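/* For example, for an SVE vector of N elements the permute
   { N-1, N-2, ..., 1, 0 } maps to a single REV.  */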
17848
17849 static bool
17850 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
17851 {
17852 poly_uint64 nelt = d->perm.length ();
17853
17854 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
17855 return false;
17856
17857 if (!d->perm.series_p (0, 1, nelt - 1, -1))
17858 return false;
17859
17860 /* Success! */
17861 if (d->testing_p)
17862 return true;
17863
17864 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
17865 emit_set_insn (d->target, src);
17866 return true;
17867 }
17868
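/* Recognize permutes that broadcast a single element of the input vector,
   which can be implemented with a DUP of that lane.  For example, for
   V4SImode the permute { 2, 2, 2, 2 } maps to DUP Vd.4S, Vn.S[2].  */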
17869 static bool
17870 aarch64_evpc_dup (struct expand_vec_perm_d *d)
17871 {
17872 rtx out = d->target;
17873 rtx in0;
17874 HOST_WIDE_INT elt;
17875 machine_mode vmode = d->vmode;
17876 rtx lane;
17877
17878 if (d->vec_flags == VEC_SVE_PRED
17879 || d->perm.encoding ().encoded_nelts () != 1
17880 || !d->perm[0].is_constant (&elt))
17881 return false;
17882
17883 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
17884 return false;
17885
17886 /* Success! */
17887 if (d->testing_p)
17888 return true;
17889
17890 /* The generic preparation in aarch64_expand_vec_perm_const_1
17891 swaps the operand order and the permute indices if it finds
17892 d->perm[0] to be in the second operand. Thus, we can always
17893 use d->op0 and need not do any extra arithmetic to get the
17894 correct lane number. */
17895 in0 = d->op0;
17896 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
17897
17898 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
17899 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
17900 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
17901 return true;
17902 }
17903
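/* Try to implement D using an Advanced SIMD TBL instruction with a
   constant byte selector.  */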
17904 static bool
17905 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
17906 {
17907 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
17908 machine_mode vmode = d->vmode;
17909
17910 /* Make sure that the indices are constant. */
17911 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
17912 for (unsigned int i = 0; i < encoded_nelts; ++i)
17913 if (!d->perm[i].is_constant ())
17914 return false;
17915
17916 if (d->testing_p)
17917 return true;
17918
17919 /* Generic code will try constant permutation twice: once with the
17920 original mode and again with the elements lowered to QImode.
17921 So wait and don't do the selector expansion ourselves. */
17922 if (vmode != V8QImode && vmode != V16QImode)
17923 return false;
17924
17925 /* to_constant is safe since this routine is specific to Advanced SIMD
17926 vectors. */
17927 unsigned int nelt = d->perm.length ().to_constant ();
17928 for (unsigned int i = 0; i < nelt; ++i)
17929 /* If big-endian and two vectors we end up with a weird mixed-endian
17930 mode on NEON. Reverse the index within each word but not the word
17931 itself. to_constant is safe because we checked is_constant above. */
17932 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
17933 ? d->perm[i].to_constant () ^ (nelt - 1)
17934 : d->perm[i].to_constant ());
17935
17936 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
17937 sel = force_reg (vmode, sel);
17938
17939 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
17940 return true;
17941 }
17942
17943 /* Try to implement D using an SVE TBL instruction. */
17944
17945 static bool
17946 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
17947 {
17948 unsigned HOST_WIDE_INT nelt;
17949
17950 /* Permuting two variable-length vectors could overflow the
17951 index range. */
17952 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
17953 return false;
17954
17955 if (d->testing_p)
17956 return true;
17957
17958 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
17959 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
17960 if (d->one_vector_p)
17961 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
17962 else
17963 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
17964 return true;
17965 }
17966
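/* Try to expand the constant permute described by D, first using the
   single-instruction patterns above and then falling back to TBL.
   Return true on success.  */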
17967 static bool
17968 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
17969 {
17970 /* The pattern matching functions above are written to look for a small
17971 number to begin the sequence (0, 1, N/2). If we begin with an index
17972 from the second operand, we can swap the operands. */
17973 poly_int64 nelt = d->perm.length ();
17974 if (known_ge (d->perm[0], nelt))
17975 {
17976 d->perm.rotate_inputs (1);
17977 std::swap (d->op0, d->op1);
17978 }
17979
17980 if ((d->vec_flags == VEC_ADVSIMD
17981 || d->vec_flags == VEC_SVE_DATA
17982 || d->vec_flags == VEC_SVE_PRED)
17983 && known_gt (nelt, 1))
17984 {
17985 if (aarch64_evpc_rev_local (d))
17986 return true;
17987 else if (aarch64_evpc_rev_global (d))
17988 return true;
17989 else if (aarch64_evpc_ext (d))
17990 return true;
17991 else if (aarch64_evpc_dup (d))
17992 return true;
17993 else if (aarch64_evpc_zip (d))
17994 return true;
17995 else if (aarch64_evpc_uzp (d))
17996 return true;
17997 else if (aarch64_evpc_trn (d))
17998 return true;
17999 if (d->vec_flags == VEC_SVE_DATA)
18000 return aarch64_evpc_sve_tbl (d);
18001 else if (d->vec_flags == VEC_ADVSIMD)
18002 return aarch64_evpc_tbl (d);
18003 }
18004 return false;
18005 }
18006
18007 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
18008
18009 static bool
18010 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
18011 rtx op1, const vec_perm_indices &sel)
18012 {
18013 struct expand_vec_perm_d d;
18014
18015 /* Check whether the mask can be applied to a single vector. */
18016 if (sel.ninputs () == 1
18017 || (op0 && rtx_equal_p (op0, op1)))
18018 d.one_vector_p = true;
18019 else if (sel.all_from_input_p (0))
18020 {
18021 d.one_vector_p = true;
18022 op1 = op0;
18023 }
18024 else if (sel.all_from_input_p (1))
18025 {
18026 d.one_vector_p = true;
18027 op0 = op1;
18028 }
18029 else
18030 d.one_vector_p = false;
18031
18032 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
18033 sel.nelts_per_input ());
18034 d.vmode = vmode;
18035 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
18036 d.target = target;
18037 d.op0 = op0;
18038 d.op1 = op1;
18039 d.testing_p = !target;
18040
18041 if (!d.testing_p)
18042 return aarch64_expand_vec_perm_const_1 (&d);
18043
18044 rtx_insn *last = get_last_insn ();
18045 bool ret = aarch64_expand_vec_perm_const_1 (&d);
18046 gcc_assert (last == get_last_insn ());
18047
18048 return ret;
18049 }
18050
18051 /* Generate a byte permute mask for a register of mode MODE,
18052 which has NUNITS units. */
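/* For example, for V8HImode (eight 2-byte units) the mask is
   { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }, which swaps
   the two bytes within each unit.  */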
18053
18054 rtx
18055 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
18056 {
18057 /* We have to reverse each vector because we don't have
18058 a permuted load that can reverse-load according to ABI rules. */
18059 rtx mask;
18060 rtvec v = rtvec_alloc (16);
18061 unsigned int i, j;
18062 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
18063
18064 gcc_assert (BYTES_BIG_ENDIAN);
18065 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
18066
18067 for (i = 0; i < nunits; i++)
18068 for (j = 0; j < usize; j++)
18069 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
18070 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
18071 return force_reg (V16QImode, mask);
18072 }
18073
18074 /* Expand an SVE integer comparison using the SVE equivalent of:
18075
18076 (set TARGET (CODE OP0 OP1)). */
18077
18078 void
18079 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
18080 {
18081 machine_mode pred_mode = GET_MODE (target);
18082 machine_mode data_mode = GET_MODE (op0);
18083 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
18084 op0, op1);
18085 if (!rtx_equal_p (target, res))
18086 emit_move_insn (target, res);
18087 }
18088
18089 /* Return the UNSPEC_COND_* code for comparison CODE. */
18090
18091 static unsigned int
18092 aarch64_unspec_cond_code (rtx_code code)
18093 {
18094 switch (code)
18095 {
18096 case NE:
18097 return UNSPEC_COND_FCMNE;
18098 case EQ:
18099 return UNSPEC_COND_FCMEQ;
18100 case LT:
18101 return UNSPEC_COND_FCMLT;
18102 case GT:
18103 return UNSPEC_COND_FCMGT;
18104 case LE:
18105 return UNSPEC_COND_FCMLE;
18106 case GE:
18107 return UNSPEC_COND_FCMGE;
18108 case UNORDERED:
18109 return UNSPEC_COND_FCMUO;
18110 default:
18111 gcc_unreachable ();
18112 }
18113 }
18114
18115 /* Emit:
18116
18117 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
18118
18119 where <X> is the operation associated with comparison CODE.
18120 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18121
18122 static void
18123 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
18124 bool known_ptrue_p, rtx op0, rtx op1)
18125 {
18126 rtx flag = gen_int_mode (known_ptrue_p, SImode);
18127 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
18128 gen_rtvec (4, pred, flag, op0, op1),
18129 aarch64_unspec_cond_code (code));
18130 emit_set_insn (target, unspec);
18131 }
18132
18133 /* Emit the SVE equivalent of:
18134
18135 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
18136 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
18137 (set TARGET (ior:PRED_MODE TMP1 TMP2))
18138
18139 where <Xi> is the operation associated with comparison CODEi.
18140 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18141
18142 static void
18143 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
18144 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
18145 {
18146 machine_mode pred_mode = GET_MODE (pred);
18147 rtx tmp1 = gen_reg_rtx (pred_mode);
18148 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
18149 rtx tmp2 = gen_reg_rtx (pred_mode);
18150 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
18151 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
18152 }
18153
18154 /* Emit the SVE equivalent of:
18155
18156 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
18157 (set TARGET (not TMP))
18158
18159 where <X> is the operation associated with comparison CODE.
18160 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18161
18162 static void
18163 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
18164 bool known_ptrue_p, rtx op0, rtx op1)
18165 {
18166 machine_mode pred_mode = GET_MODE (pred);
18167 rtx tmp = gen_reg_rtx (pred_mode);
18168 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
18169 aarch64_emit_unop (target, one_cmpl_optab, tmp);
18170 }
18171
18172 /* Expand an SVE floating-point comparison using the SVE equivalent of:
18173
18174 (set TARGET (CODE OP0 OP1))
18175
18176 If CAN_INVERT_P is true, the caller can also handle inverted results;
18177 return true if the result is in fact inverted. */
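/* For example, with -ftrapping-math an UNLT comparison is expanded by
   first computing the predicate of ordered elements (an inverted FCMUO),
   then testing GE under that predicate, and finally inverting the result
   (or reporting the inversion to the caller via the return value).  */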
18178
18179 bool
18180 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
18181 rtx op0, rtx op1, bool can_invert_p)
18182 {
18183 machine_mode pred_mode = GET_MODE (target);
18184 machine_mode data_mode = GET_MODE (op0);
18185
18186 rtx ptrue = aarch64_ptrue_reg (pred_mode);
18187 switch (code)
18188 {
18189 case UNORDERED:
18190 /* UNORDERED has no immediate form. */
18191 op1 = force_reg (data_mode, op1);
18192 /* fall through */
18193 case LT:
18194 case LE:
18195 case GT:
18196 case GE:
18197 case EQ:
18198 case NE:
18199 {
18200 /* There is native support for the comparison. */
18201 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18202 return false;
18203 }
18204
18205 case LTGT:
18206 /* This is a trapping operation (LT or GT). */
18207 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
18208 return false;
18209
18210 case UNEQ:
18211 if (!flag_trapping_math)
18212 {
18213 /* This would trap for signaling NaNs. */
18214 op1 = force_reg (data_mode, op1);
18215 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
18216 ptrue, true, op0, op1);
18217 return false;
18218 }
18219 /* fall through */
18220 case UNLT:
18221 case UNLE:
18222 case UNGT:
18223 case UNGE:
18224 if (flag_trapping_math)
18225 {
18226 /* Work out which elements are ordered. */
18227 rtx ordered = gen_reg_rtx (pred_mode);
18228 op1 = force_reg (data_mode, op1);
18229 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
18230 ptrue, true, op0, op1);
18231
18232 /* Test the opposite condition for the ordered elements,
18233 then invert the result. */
18234 if (code == UNEQ)
18235 code = NE;
18236 else
18237 code = reverse_condition_maybe_unordered (code);
18238 if (can_invert_p)
18239 {
18240 aarch64_emit_sve_fp_cond (target, code,
18241 ordered, false, op0, op1);
18242 return true;
18243 }
18244 aarch64_emit_sve_invert_fp_cond (target, code,
18245 ordered, false, op0, op1);
18246 return false;
18247 }
18248 break;
18249
18250 case ORDERED:
18251 /* ORDERED has no immediate form. */
18252 op1 = force_reg (data_mode, op1);
18253 break;
18254
18255 default:
18256 gcc_unreachable ();
18257 }
18258
18259 /* There is native support for the inverse comparison. */
18260 code = reverse_condition_maybe_unordered (code);
18261 if (can_invert_p)
18262 {
18263 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18264 return true;
18265 }
18266 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
18267 return false;
18268 }
18269
18270 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
18271 of the data being selected and CMP_MODE is the mode of the values being
18272 compared. */
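/* For example, a floating-point vcond becomes a predicated floating-point
   compare followed by a SEL that picks between OPS[1] and OPS[2] under
   the resulting predicate.  */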
18273
18274 void
18275 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
18276 rtx *ops)
18277 {
18278 machine_mode pred_mode
18279 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
18280 GET_MODE_SIZE (cmp_mode)).require ();
18281 rtx pred = gen_reg_rtx (pred_mode);
18282 if (FLOAT_MODE_P (cmp_mode))
18283 {
18284 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
18285 ops[4], ops[5], true))
18286 std::swap (ops[1], ops[2]);
18287 }
18288 else
18289 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
18290
18291 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
18292 ops[1] = force_reg (data_mode, ops[1]);
18293 /* The "false" value can only be zero if the "true" value is a constant. */
18294 if (register_operand (ops[1], data_mode)
18295 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
18296 ops[2] = force_reg (data_mode, ops[2]);
18297
18298 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
18299 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
18300 }
18301
18302 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
18303 true. However, due to issues with register allocation it is preferable
18304 to avoid tying integer scalar and FP scalar modes. Executing integer
18305 operations in general registers is better than treating them as scalar
18306 vector operations. This reduces latency and avoids redundant int<->FP
18307 moves. So tie modes if they are either the same class, or vector modes
18308 with other vector modes, vector structs or any scalar mode. */
18309
18310 static bool
18311 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
18312 {
18313 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
18314 return true;
18315
18316 /* We specifically want to allow elements of "structure" modes to
18317 be tieable to the structure. This more general condition allows
18318 other rarer situations too. The reason we don't extend this to
18319 predicate modes is that there are no predicate structure modes
18320 nor any specific instructions for extracting part of a predicate
18321 register. */
18322 if (aarch64_vector_data_mode_p (mode1)
18323 && aarch64_vector_data_mode_p (mode2))
18324 return true;
18325
18326 /* Also allow any scalar modes with vectors. */
18327 if (aarch64_vector_mode_supported_p (mode1)
18328 || aarch64_vector_mode_supported_p (mode2))
18329 return true;
18330
18331 return false;
18332 }
18333
18334 /* Return a new RTX holding the result of moving POINTER forward by
18335 AMOUNT bytes. */
18336
18337 static rtx
18338 aarch64_move_pointer (rtx pointer, poly_int64 amount)
18339 {
18340 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
18341
18342 return adjust_automodify_address (pointer, GET_MODE (pointer),
18343 next, amount);
18344 }
18345
18346 /* Return a new RTX holding the result of moving POINTER forward by the
18347 size of the mode it points to. */
18348
18349 static rtx
18350 aarch64_progress_pointer (rtx pointer)
18351 {
18352 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
18353 }
18354
18355 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
18356 MODE bytes. */
18357
18358 static void
18359 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
18360 machine_mode mode)
18361 {
18362 rtx reg = gen_reg_rtx (mode);
18363
18364 /* "Cast" the pointers to the correct mode. */
18365 *src = adjust_address (*src, mode, 0);
18366 *dst = adjust_address (*dst, mode, 0);
18367 /* Emit the memcpy. */
18368 emit_move_insn (reg, *src);
18369 emit_move_insn (*dst, reg);
18370 /* Move the pointers forward. */
18371 *src = aarch64_progress_pointer (*src);
18372 *dst = aarch64_progress_pointer (*dst);
18373 }
18374
18375 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
18376 we succeed, otherwise return false. */
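/* For example, a 15-byte copy is expanded as two overlapping 8-byte
   copies, roughly (register numbers are purely illustrative; the
   expansion itself uses pseudo registers):

     ldr x2, [x1]
     str x2, [x0]
     ldr x3, [x1, 7]
     str x3, [x0, 7]  */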
18377
18378 bool
18379 aarch64_expand_cpymem (rtx *operands)
18380 {
18381 int n, mode_bits;
18382 rtx dst = operands[0];
18383 rtx src = operands[1];
18384 rtx base;
18385 machine_mode cur_mode = BLKmode, next_mode;
18386 bool speed_p = !optimize_function_for_size_p (cfun);
18387
18388 /* When optimizing for size, give a better estimate of the length of a
18389 memcpy call, but use the default otherwise. Moves larger than 8 bytes
18390 will always require an even number of instructions, and each operation
18391 requires both a load and a store, so divide the maximum number by 2. */
18392 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
18393
18394 /* We can't do anything smart if the amount to copy is not constant. */
18395 if (!CONST_INT_P (operands[2]))
18396 return false;
18397
18398 n = INTVAL (operands[2]);
18399
18400 /* Try to keep the number of instructions low. For all cases we will do at
18401 most two moves for the residual amount, since we'll always overlap the
18402 remainder. */
18403 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
18404 return false;
18405
18406 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
18407 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
18408
18409 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
18410 src = adjust_automodify_address (src, VOIDmode, base, 0);
18411
18412 /* Convert n to bits to make the rest of the code simpler. */
18413 n = n * BITS_PER_UNIT;
18414
18415 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
18416 larger than TImode, but we should not use them for loads/stores here. */
18417 const int copy_limit = GET_MODE_BITSIZE (TImode);
18418
18419 while (n > 0)
18420 {
18421 /* Find the largest mode in which to do the copy without over-reading
18422 or over-writing. */
18423 opt_scalar_int_mode mode_iter;
18424 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
18425 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
18426 cur_mode = mode_iter.require ();
18427
18428 gcc_assert (cur_mode != BLKmode);
18429
18430 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
18431 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
18432
18433 n -= mode_bits;
18434
18435 /* Do certain trailing copies as overlapping if it's going to be
18436 cheaper, i.e. fewer instructions. For instance, for a 15-byte copy
18437 it's more efficient to do two overlapping 8-byte copies than
18438 8 + 4 + 2 + 1. */
18439 if (n > 0 && n <= 8 * BITS_PER_UNIT)
18440 {
18441 next_mode = smallest_mode_for_size (n, MODE_INT);
18442 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
18443 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
18444 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
18445 n = n_bits;
18446 }
18447 }
18448
18449 return true;
18450 }
18451
18452 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
18453 SImode stores. Handle the case when the constant has identical
18454 bottom and top halves. This is beneficial when the two stores can be
18455 merged into an STP and we avoid synthesising potentially expensive
18456 immediates twice. Return true if such a split is possible. */
18457
18458 bool
18459 aarch64_split_dimode_const_store (rtx dst, rtx src)
18460 {
18461 rtx lo = gen_lowpart (SImode, src);
18462 rtx hi = gen_highpart_mode (SImode, DImode, src);
18463
18464 bool size_p = optimize_function_for_size_p (cfun);
18465
18466 if (!rtx_equal_p (lo, hi))
18467 return false;
18468
18469 unsigned int orig_cost
18470 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
18471 unsigned int lo_cost
18472 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
18473
18474 /* We want to transform:
18475 MOV x1, 49370
18476 MOVK x1, 0x140, lsl 16
18477 MOVK x1, 0xc0da, lsl 32
18478 MOVK x1, 0x140, lsl 48
18479 STR x1, [x0]
18480 into:
18481 MOV w1, 49370
18482 MOVK w1, 0x140, lsl 16
18483 STP w1, w1, [x0]
18484 So we want to perform this only when we save two instructions
18485 or more. When optimizing for size, however, accept any code size
18486 savings we can. */
18487 if (size_p && orig_cost <= lo_cost)
18488 return false;
18489
18490 if (!size_p
18491 && (orig_cost <= lo_cost + 1))
18492 return false;
18493
18494 rtx mem_lo = adjust_address (dst, SImode, 0);
18495 if (!aarch64_mem_pair_operand (mem_lo, SImode))
18496 return false;
18497
18498 rtx tmp_reg = gen_reg_rtx (SImode);
18499 aarch64_expand_mov_immediate (tmp_reg, lo);
18500 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
18501 /* Don't emit an explicit store pair as this may not always be profitable.
18502 Let the sched-fusion logic decide whether to merge them. */
18503 emit_move_insn (mem_lo, tmp_reg);
18504 emit_move_insn (mem_hi, tmp_reg);
18505
18506 return true;
18507 }
18508
18509 /* Generate RTL for a conditional branch with rtx comparison CODE in
18510 mode CC_MODE. The destination of the unlikely conditional branch
18511 is LABEL_REF. */
18512
18513 void
18514 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
18515 rtx label_ref)
18516 {
18517 rtx x;
18518 x = gen_rtx_fmt_ee (code, VOIDmode,
18519 gen_rtx_REG (cc_mode, CC_REGNUM),
18520 const0_rtx);
18521
18522 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18523 gen_rtx_LABEL_REF (VOIDmode, label_ref),
18524 pc_rtx);
18525 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18526 }
18527
18528 /* Generate DImode scratch registers for 128-bit (TImode) addition.
18529
18530 OP1 represents the TImode destination operand 1
18531 OP2 represents the TImode destination operand 2
18532 LOW_DEST represents the low half (DImode) of TImode operand 0
18533 LOW_IN1 represents the low half (DImode) of TImode operand 1
18534 LOW_IN2 represents the low half (DImode) of TImode operand 2
18535 HIGH_DEST represents the high half (DImode) of TImode operand 0
18536 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18537 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18538
18539 void
18540 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18541 rtx *low_in1, rtx *low_in2,
18542 rtx *high_dest, rtx *high_in1,
18543 rtx *high_in2)
18544 {
18545 *low_dest = gen_reg_rtx (DImode);
18546 *low_in1 = gen_lowpart (DImode, op1);
18547 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18548 subreg_lowpart_offset (DImode, TImode));
18549 *high_dest = gen_reg_rtx (DImode);
18550 *high_in1 = gen_highpart (DImode, op1);
18551 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18552 subreg_highpart_offset (DImode, TImode));
18553 }
18554
18555 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
18556
18557 This function differs from 'aarch64_addti_scratch_regs' in that
18558 OP1 can be an immediate constant (zero). We must call
18559 subreg_highpart_offset with DImode and TImode arguments, otherwise
18560 VOIDmode will be used for the const_int and subreg_size_highpart_offset,
18561 which does not expect a size of zero, will report an internal error.
18562
18563 OP1 represents the TImode destination operand 1
18564 OP2 represents the TImode destination operand 2
18565 LOW_DEST represents the low half (DImode) of TImode operand 0
18566 LOW_IN1 represents the low half (DImode) of TImode operand 1
18567 LOW_IN2 represents the low half (DImode) of TImode operand 2
18568 HIGH_DEST represents the high half (DImode) of TImode operand 0
18569 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18570 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18571
18572
18573 void
18574 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18575 rtx *low_in1, rtx *low_in2,
18576 rtx *high_dest, rtx *high_in1,
18577 rtx *high_in2)
18578 {
18579 *low_dest = gen_reg_rtx (DImode);
18580 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
18581 subreg_lowpart_offset (DImode, TImode));
18582
18583 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18584 subreg_lowpart_offset (DImode, TImode));
18585 *high_dest = gen_reg_rtx (DImode);
18586
18587 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
18588 subreg_highpart_offset (DImode, TImode));
18589 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18590 subreg_highpart_offset (DImode, TImode));
18591 }
18592
18593 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
18594
18595 OP0 represents the TImode destination operand 0
18596 LOW_DEST represents the low half (DImode) of TImode operand 0
18597 LOW_IN1 represents the low half (DImode) of TImode operand 1
18598 LOW_IN2 represents the low half (DImode) of TImode operand 2
18599 HIGH_DEST represents the high half (DImode) of TImode operand 0
18600 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18601 HIGH_IN2 represents the high half (DImode) of TImode operand 2
18602 UNSIGNED_P is true if the operation is being performed on unsigned
18603 values. */
18604 void
18605 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
18606 rtx low_in2, rtx high_dest, rtx high_in1,
18607 rtx high_in2, bool unsigned_p)
18608 {
18609 if (low_in2 == const0_rtx)
18610 {
18611 low_dest = low_in1;
18612 high_in2 = force_reg (DImode, high_in2);
18613 if (unsigned_p)
18614 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
18615 else
18616 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
18617 }
18618 else
18619 {
18620 if (CONST_INT_P (low_in2))
18621 {
18622 high_in2 = force_reg (DImode, high_in2);
18623 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
18624 GEN_INT (-INTVAL (low_in2))));
18625 }
18626 else
18627 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
18628
18629 if (unsigned_p)
18630 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
18631 else
18632 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
18633 }
18634
18635 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
18636 emit_move_insn (gen_highpart (DImode, op0), high_dest);
18637
18638 }
18639
18640 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
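/* AddressSanitizer computes the shadow address of a byte as
   (address >> 3) + offset, so the values below place the shadow memory
   at 1 << 29 for ILP32 and 1 << 36 for LP64.  */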
18641
18642 static unsigned HOST_WIDE_INT
18643 aarch64_asan_shadow_offset (void)
18644 {
18645 if (TARGET_ILP32)
18646 return (HOST_WIDE_INT_1 << 29);
18647 else
18648 return (HOST_WIDE_INT_1 << 36);
18649 }
18650
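/* Implement TARGET_GEN_CCMP_FIRST.  Generate the first comparison of a
   conditional-compare chain (e.g. the initial CMP or FCMP in a
   CMP + CCMP + branch sequence for a condition such as a < b && c == d).
   Return a comparison rtx of the CC register against zero, or NULL_RTX
   if the comparison cannot be handled.  */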
18651 static rtx
18652 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
18653 int code, tree treeop0, tree treeop1)
18654 {
18655 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18656 rtx op0, op1;
18657 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18658 insn_code icode;
18659 struct expand_operand ops[4];
18660
18661 start_sequence ();
18662 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18663
18664 op_mode = GET_MODE (op0);
18665 if (op_mode == VOIDmode)
18666 op_mode = GET_MODE (op1);
18667
18668 switch (op_mode)
18669 {
18670 case E_QImode:
18671 case E_HImode:
18672 case E_SImode:
18673 cmp_mode = SImode;
18674 icode = CODE_FOR_cmpsi;
18675 break;
18676
18677 case E_DImode:
18678 cmp_mode = DImode;
18679 icode = CODE_FOR_cmpdi;
18680 break;
18681
18682 case E_SFmode:
18683 cmp_mode = SFmode;
18684 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18685 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
18686 break;
18687
18688 case E_DFmode:
18689 cmp_mode = DFmode;
18690 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18691 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
18692 break;
18693
18694 default:
18695 end_sequence ();
18696 return NULL_RTX;
18697 }
18698
18699 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
18700 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
18701 if (!op0 || !op1)
18702 {
18703 end_sequence ();
18704 return NULL_RTX;
18705 }
18706 *prep_seq = get_insns ();
18707 end_sequence ();
18708
18709 create_fixed_operand (&ops[0], op0);
18710 create_fixed_operand (&ops[1], op1);
18711
18712 start_sequence ();
18713 if (!maybe_expand_insn (icode, 2, ops))
18714 {
18715 end_sequence ();
18716 return NULL_RTX;
18717 }
18718 *gen_seq = get_insns ();
18719 end_sequence ();
18720
18721 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
18722 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
18723 }
18724
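/* Implement TARGET_GEN_CCMP_NEXT.  Generate a conditional compare
   (CCMP/FCCMP) that combines the result of PREV with the comparison
   CMP_CODE of TREEOP0 and TREEOP1; BIT_CODE says whether the two
   conditions are combined with AND or IOR.  Return the new comparison
   rtx, or NULL_RTX if the comparison cannot be handled.  */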
18725 static rtx
18726 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
18727 int cmp_code, tree treeop0, tree treeop1, int bit_code)
18728 {
18729 rtx op0, op1, target;
18730 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18731 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18732 insn_code icode;
18733 struct expand_operand ops[6];
18734 int aarch64_cond;
18735
18736 push_to_sequence (*prep_seq);
18737 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18738
18739 op_mode = GET_MODE (op0);
18740 if (op_mode == VOIDmode)
18741 op_mode = GET_MODE (op1);
18742
18743 switch (op_mode)
18744 {
18745 case E_QImode:
18746 case E_HImode:
18747 case E_SImode:
18748 cmp_mode = SImode;
18749 icode = CODE_FOR_ccmpsi;
18750 break;
18751
18752 case E_DImode:
18753 cmp_mode = DImode;
18754 icode = CODE_FOR_ccmpdi;
18755 break;
18756
18757 case E_SFmode:
18758 cmp_mode = SFmode;
18759 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18760 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
18761 break;
18762
18763 case E_DFmode:
18764 cmp_mode = DFmode;
18765 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18766 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
18767 break;
18768
18769 default:
18770 end_sequence ();
18771 return NULL_RTX;
18772 }
18773
18774 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
18775 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
18776 if (!op0 || !op1)
18777 {
18778 end_sequence ();
18779 return NULL_RTX;
18780 }
18781 *prep_seq = get_insns ();
18782 end_sequence ();
18783
18784 target = gen_rtx_REG (cc_mode, CC_REGNUM);
18785 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
18786
18787 if (bit_code != AND)
18788 {
18789 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
18790 GET_MODE (XEXP (prev, 0))),
18791 VOIDmode, XEXP (prev, 0), const0_rtx);
18792 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
18793 }
18794
18795 create_fixed_operand (&ops[0], XEXP (prev, 0));
18796 create_fixed_operand (&ops[1], target);
18797 create_fixed_operand (&ops[2], op0);
18798 create_fixed_operand (&ops[3], op1);
18799 create_fixed_operand (&ops[4], prev);
18800 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
18801
18802 push_to_sequence (*gen_seq);
18803 if (!maybe_expand_insn (icode, 6, ops))
18804 {
18805 end_sequence ();
18806 return NULL_RTX;
18807 }
18808
18809 *gen_seq = get_insns ();
18810 end_sequence ();
18811
18812 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
18813 }
18814
18815 #undef TARGET_GEN_CCMP_FIRST
18816 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
18817
18818 #undef TARGET_GEN_CCMP_NEXT
18819 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
18820
18821 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if the target supports
18822 instruction fusion of some sort. */
18823
18824 static bool
18825 aarch64_macro_fusion_p (void)
18826 {
18827 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
18828 }
18829
18830
18831 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
18832 should be kept together during scheduling. */
18833
18834 static bool
18835 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
18836 {
18837 rtx set_dest;
18838 rtx prev_set = single_set (prev);
18839 rtx curr_set = single_set (curr);
18840 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
18841 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
18842
18843 if (!aarch64_macro_fusion_p ())
18844 return false;
18845
18846 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
18847 {
18848 /* We are trying to match:
18849 prev (mov) == (set (reg r0) (const_int imm16))
18850 curr (movk) == (set (zero_extract (reg r0)
18851 (const_int 16)
18852 (const_int 16))
18853 (const_int imm16_1)) */
18854
18855 set_dest = SET_DEST (curr_set);
18856
18857 if (GET_CODE (set_dest) == ZERO_EXTRACT
18858 && CONST_INT_P (SET_SRC (curr_set))
18859 && CONST_INT_P (SET_SRC (prev_set))
18860 && CONST_INT_P (XEXP (set_dest, 2))
18861 && INTVAL (XEXP (set_dest, 2)) == 16
18862 && REG_P (XEXP (set_dest, 0))
18863 && REG_P (SET_DEST (prev_set))
18864 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
18865 {
18866 return true;
18867 }
18868 }
18869
18870 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
18871 {
18872
18873 /* We're trying to match:
18874 prev (adrp) == (set (reg r1)
18875 (high (symbol_ref ("SYM"))))
18876 curr (add) == (set (reg r0)
18877 (lo_sum (reg r1)
18878 (symbol_ref ("SYM"))))
18879 Note that r0 need not necessarily be the same as r1, especially
18880 during pre-regalloc scheduling. */
18881
18882 if (satisfies_constraint_Ush (SET_SRC (prev_set))
18883 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18884 {
18885 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
18886 && REG_P (XEXP (SET_SRC (curr_set), 0))
18887 && REGNO (XEXP (SET_SRC (curr_set), 0))
18888 == REGNO (SET_DEST (prev_set))
18889 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
18890 XEXP (SET_SRC (curr_set), 1)))
18891 return true;
18892 }
18893 }
18894
18895 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
18896 {
18897
18898 /* We're trying to match:
18899 prev (movk) == (set (zero_extract (reg r0)
18900 (const_int 16)
18901 (const_int 32))
18902 (const_int imm16_1))
18903 curr (movk) == (set (zero_extract (reg r0)
18904 (const_int 16)
18905 (const_int 48))
18906 (const_int imm16_2)) */
18907
18908 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
18909 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
18910 && REG_P (XEXP (SET_DEST (prev_set), 0))
18911 && REG_P (XEXP (SET_DEST (curr_set), 0))
18912 && REGNO (XEXP (SET_DEST (prev_set), 0))
18913 == REGNO (XEXP (SET_DEST (curr_set), 0))
18914 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
18915 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
18916 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
18917 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
18918 && CONST_INT_P (SET_SRC (prev_set))
18919 && CONST_INT_P (SET_SRC (curr_set)))
18920 return true;
18921
18922 }
18923 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
18924 {
18925 /* We're trying to match:
18926 prev (adrp) == (set (reg r0)
18927 (high (symbol_ref ("SYM"))))
18928 curr (ldr) == (set (reg r1)
18929 (mem (lo_sum (reg r0)
18930 (symbol_ref ("SYM")))))
18931 or
18932 curr (ldr) == (set (reg r1)
18933 (zero_extend (mem
18934 (lo_sum (reg r0)
18935 (symbol_ref ("SYM")))))) */
18936 if (satisfies_constraint_Ush (SET_SRC (prev_set))
18937 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18938 {
18939 rtx curr_src = SET_SRC (curr_set);
18940
18941 if (GET_CODE (curr_src) == ZERO_EXTEND)
18942 curr_src = XEXP (curr_src, 0);
18943
18944 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
18945 && REG_P (XEXP (XEXP (curr_src, 0), 0))
18946 && REGNO (XEXP (XEXP (curr_src, 0), 0))
18947 == REGNO (SET_DEST (prev_set))
18948 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
18949 XEXP (SET_SRC (prev_set), 0)))
18950 return true;
18951 }
18952 }
18953
18954 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
18955 && any_condjump_p (curr))
18956 {
18957 unsigned int condreg1, condreg2;
18958 rtx cc_reg_1;
18959 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
18960 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
18961
18962 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
18963 && prev
18964 && modified_in_p (cc_reg_1, prev))
18965 {
18966 enum attr_type prev_type = get_attr_type (prev);
18967
18968 /* FIXME: this misses some instructions which are considered simple
18969 arithmetic on ThunderX. Simple shifts are missed here. */
18970 if (prev_type == TYPE_ALUS_SREG
18971 || prev_type == TYPE_ALUS_IMM
18972 || prev_type == TYPE_LOGICS_REG
18973 || prev_type == TYPE_LOGICS_IMM)
18974 return true;
18975 }
18976 }
18977
18978 if (prev_set
18979 && curr_set
18980 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
18981 && any_condjump_p (curr))
18982 {
18983 /* We're trying to match:
18984 prev (alu_insn) == (set (r0) (plus (r0) (r1/imm)))
18985 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
18986 (const_int 0))
18987 (label_ref ("SYM"))
18988 (pc)) */
18989 if (SET_DEST (curr_set) == (pc_rtx)
18990 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
18991 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
18992 && REG_P (SET_DEST (prev_set))
18993 && REGNO (SET_DEST (prev_set))
18994 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
18995 {
18996 /* Fuse ALU operations followed by a conditional branch instruction. */
18997 switch (get_attr_type (prev))
18998 {
18999 case TYPE_ALU_IMM:
19000 case TYPE_ALU_SREG:
19001 case TYPE_ADC_REG:
19002 case TYPE_ADC_IMM:
19003 case TYPE_ADCS_REG:
19004 case TYPE_ADCS_IMM:
19005 case TYPE_LOGIC_REG:
19006 case TYPE_LOGIC_IMM:
19007 case TYPE_CSEL:
19008 case TYPE_ADR:
19009 case TYPE_MOV_IMM:
19010 case TYPE_SHIFT_REG:
19011 case TYPE_SHIFT_IMM:
19012 case TYPE_BFM:
19013 case TYPE_RBIT:
19014 case TYPE_REV:
19015 case TYPE_EXTEND:
19016 return true;
19017
19018 default:;
19019 }
19020 }
19021 }
19022
19023 return false;
19024 }
19025
19026 /* Return true iff the instruction fusion described by OP is enabled. */
19027
19028 bool
19029 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
19030 {
19031 return (aarch64_tune_params.fusible_ops & op) != 0;
19032 }
19033
19034 /* If MEM is in the form of [base+offset], extract the two parts
19035 of the address and store them in BASE and OFFSET; otherwise return
19036 false after clearing BASE and OFFSET. */
19037
19038 bool
19039 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
19040 {
19041 rtx addr;
19042
19043 gcc_assert (MEM_P (mem));
19044
19045 addr = XEXP (mem, 0);
19046
19047 if (REG_P (addr))
19048 {
19049 *base = addr;
19050 *offset = const0_rtx;
19051 return true;
19052 }
19053
19054 if (GET_CODE (addr) == PLUS
19055 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
19056 {
19057 *base = XEXP (addr, 0);
19058 *offset = XEXP (addr, 1);
19059 return true;
19060 }
19061
19062 *base = NULL_RTX;
19063 *offset = NULL_RTX;
19064
19065 return false;
19066 }
19067
19068 /* Types for scheduling fusion. */
19069 enum sched_fusion_type
19070 {
19071 SCHED_FUSION_NONE = 0,
19072 SCHED_FUSION_LD_SIGN_EXTEND,
19073 SCHED_FUSION_LD_ZERO_EXTEND,
19074 SCHED_FUSION_LD,
19075 SCHED_FUSION_ST,
19076 SCHED_FUSION_NUM
19077 };
19078
19079 /* If INSN is a load or store whose address is in the form [base+offset],
19080 extract the two parts and store them in BASE and OFFSET. Return the
19081 scheduling fusion type of INSN. */
19082
19083 static enum sched_fusion_type
19084 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
19085 {
19086 rtx x, dest, src;
19087 enum sched_fusion_type fusion = SCHED_FUSION_LD;
19088
19089 gcc_assert (INSN_P (insn));
19090 x = PATTERN (insn);
19091 if (GET_CODE (x) != SET)
19092 return SCHED_FUSION_NONE;
19093
19094 src = SET_SRC (x);
19095 dest = SET_DEST (x);
19096
19097 machine_mode dest_mode = GET_MODE (dest);
19098
19099 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
19100 return SCHED_FUSION_NONE;
19101
19102 if (GET_CODE (src) == SIGN_EXTEND)
19103 {
19104 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
19105 src = XEXP (src, 0);
19106 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
19107 return SCHED_FUSION_NONE;
19108 }
19109 else if (GET_CODE (src) == ZERO_EXTEND)
19110 {
19111 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
19112 src = XEXP (src, 0);
19113 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
19114 return SCHED_FUSION_NONE;
19115 }
19116
19117 if (GET_CODE (src) == MEM && REG_P (dest))
19118 extract_base_offset_in_addr (src, base, offset);
19119 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
19120 {
19121 fusion = SCHED_FUSION_ST;
19122 extract_base_offset_in_addr (dest, base, offset);
19123 }
19124 else
19125 return SCHED_FUSION_NONE;
19126
19127 if (*base == NULL_RTX || *offset == NULL_RTX)
19128 fusion = SCHED_FUSION_NONE;
19129
19130 return fusion;
19131 }
19132
19133 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
19134
19135 Currently we only support fusing ldr and str instructions, so FUSION_PRI
19136 and PRI are only calculated for these instructions. For other instructions,
19137 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
19138 types of instruction fusion can be added by returning different priorities.
19139
19140 It's important that irrelevant instructions get the largest FUSION_PRI. */
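/* For example, two SImode loads from [x1, 4] and [x1, 8] get the same
   FUSION_PRI (same fusion type and base register) but different PRI
   values, so the scheduler keeps them together and orders them by
   offset, making them candidates for an LDP.  */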
19141
19142 static void
19143 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
19144 int *fusion_pri, int *pri)
19145 {
19146 int tmp, off_val;
19147 rtx base, offset;
19148 enum sched_fusion_type fusion;
19149
19150 gcc_assert (INSN_P (insn));
19151
19152 tmp = max_pri - 1;
19153 fusion = fusion_load_store (insn, &base, &offset);
19154 if (fusion == SCHED_FUSION_NONE)
19155 {
19156 *pri = tmp;
19157 *fusion_pri = tmp;
19158 return;
19159 }
19160
19161 /* Set FUSION_PRI according to fusion type and base register. */
19162 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
19163
19164 /* Calculate PRI. */
19165 tmp /= 2;
19166
19167 /* INSN with smaller offset goes first. */
19168 off_val = (int)(INTVAL (offset));
19169 if (off_val >= 0)
19170 tmp -= (off_val & 0xfffff);
19171 else
19172 tmp += ((- off_val) & 0xfffff);
19173
19174 *pri = tmp;
19175 return;
19176 }
19177
19178 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
19179 Adjust priority of sha1h instructions so they are scheduled before
19180 other SHA1 instructions. */
19181
19182 static int
19183 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
19184 {
19185 rtx x = PATTERN (insn);
19186
19187 if (GET_CODE (x) == SET)
19188 {
19189 x = SET_SRC (x);
19190
19191 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
19192 return priority + 10;
19193 }
19194
19195 return priority;
19196 }
19197
19198 /* Given OPERANDS of consecutive load/store, check if we can merge
19199 them into ldp/stp. LOAD is true if they are load instructions.
19200 MODE is the mode of memory operands. */
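/* For example, the consecutive loads

     ldr w0, [x2]
     ldr w1, [x2, 4]

   can be merged into a single ldp w0, w1, [x2].  */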
19201
19202 bool
19203 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
19204 machine_mode mode)
19205 {
19206 HOST_WIDE_INT offval_1, offval_2, msize;
19207 enum reg_class rclass_1, rclass_2;
19208 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
19209
19210 if (load)
19211 {
19212 mem_1 = operands[1];
19213 mem_2 = operands[3];
19214 reg_1 = operands[0];
19215 reg_2 = operands[2];
19216 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
19217 if (REGNO (reg_1) == REGNO (reg_2))
19218 return false;
19219 }
19220 else
19221 {
19222 mem_1 = operands[0];
19223 mem_2 = operands[2];
19224 reg_1 = operands[1];
19225 reg_2 = operands[3];
19226 }
19227
19228 /* The mems cannot be volatile. */
19229 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
19230 return false;
19231
19232 /* If we have SImode and slow unaligned ldp,
19233 check that the alignment is at least 8 bytes. */
19234 if (mode == SImode
19235 && (aarch64_tune_params.extra_tuning_flags
19236 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19237 && !optimize_size
19238 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
19239 return false;
19240
19241 /* Check if the addresses are in the form of [base+offset]. */
19242 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19243 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
19244 return false;
19245 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19246 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
19247 return false;
19248
19249 /* Check if the bases are the same. */
19250 if (!rtx_equal_p (base_1, base_2))
19251 return false;
19252
19253 /* The operands must be of the same size. */
19254 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
19255 GET_MODE_SIZE (GET_MODE (mem_2))));
19256
19257 offval_1 = INTVAL (offset_1);
19258 offval_2 = INTVAL (offset_2);
19259 /* We should only be trying this for fixed-sized modes. There is no
19260 SVE LDP/STP instruction. */
19261 msize = GET_MODE_SIZE (mode).to_constant ();
19262 /* Check if the offsets are consecutive. */
19263 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
19264 return false;
19265
19266 /* Check if the addresses are clobbered by the load. */
19267 if (load)
19268 {
19269 if (reg_mentioned_p (reg_1, mem_1))
19270 return false;
19271
19272 /* In increasing order, the last load can clobber the address. */
19273 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
19274 return false;
19275 }
19276
19277 /* One of the memory accesses must be a mempair operand.
19278 If it is not the first one, they need to be swapped by the
19279 peephole. */
19280 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
19281 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
19282 return false;
19283
19284 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
19285 rclass_1 = FP_REGS;
19286 else
19287 rclass_1 = GENERAL_REGS;
19288
19289 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
19290 rclass_2 = FP_REGS;
19291 else
19292 rclass_2 = GENERAL_REGS;
19293
19294 /* Check if the registers are of the same class. */
19295 if (rclass_1 != rclass_2)
19296 return false;
19297
19298 return true;
19299 }
19300
19301 /* Given OPERANDS of consecutive load/store that can be merged,
19302 swap them if they are not in ascending order. */
19303 void
19304 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
19305 {
19306 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
19307 HOST_WIDE_INT offval_1, offval_2;
19308
19309 if (load)
19310 {
19311 mem_1 = operands[1];
19312 mem_2 = operands[3];
19313 }
19314 else
19315 {
19316 mem_1 = operands[0];
19317 mem_2 = operands[2];
19318 }
19319
19320 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19321 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19322
19323 offval_1 = INTVAL (offset_1);
19324 offval_2 = INTVAL (offset_2);
19325
19326 if (offval_1 > offval_2)
19327 {
19328 /* Irrespective of whether this is a load or a store,
19329 we do the same swap. */
19330 std::swap (operands[0], operands[2]);
19331 std::swap (operands[1], operands[3]);
19332 }
19333 }
19334
19335 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
19336 comparison between the two. */
19337 int
19338 aarch64_host_wide_int_compare (const void *x, const void *y)
19339 {
19340 return wi::cmps (* ((const HOST_WIDE_INT *) x),
19341 * ((const HOST_WIDE_INT *) y));
19342 }
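
/* Minimal usage sketch, mirroring the qsort call further down in
   aarch64_operands_adjust_ok_for_ldpstp:

     HOST_WIDE_INT offs[4] = { 12, 4, 8, 0 };
     qsort (offs, 4, sizeof (HOST_WIDE_INT), aarch64_host_wide_int_compare);

   after which OFFS holds { 0, 4, 8, 12 }.  */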
19343
19344 /* Taking X and Y to be pairs of RTX, where each pair consists of a MEM
19345 rtx and a REG rtx, extract the offset from each MEM address and
19346 compare the two offsets.
19347
19348 Return:
19349
19350 1 iff offset (X) > offset (Y)
19351 0 iff offset (X) == offset (Y)
19352 -1 iff offset (X) < offset (Y) */
19353 int
19354 aarch64_ldrstr_offset_compare (const void *x, const void *y)
19355 {
19356 const rtx * operands_1 = (const rtx *) x;
19357 const rtx * operands_2 = (const rtx *) y;
19358 rtx mem_1, mem_2, base, offset_1, offset_2;
19359
19360 if (MEM_P (operands_1[0]))
19361 mem_1 = operands_1[0];
19362 else
19363 mem_1 = operands_1[1];
19364
19365 if (MEM_P (operands_2[0]))
19366 mem_2 = operands_2[0];
19367 else
19368 mem_2 = operands_2[1];
19369
19370 /* Extract the offsets. */
19371 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19372 extract_base_offset_in_addr (mem_2, &base, &offset_2);
19373
19374 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
19375
19376 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
19377 }
19378
19379 /* Given OPERANDS of consecutive load/store, check if we can merge
19380 them into ldp/stp by adjusting the offset. LOAD is true if they
19381 are load instructions. MODE is the mode of memory operands.
19382
19383 Given the following consecutive stores:
19384
19385 str w1, [xb, 0x100]
19386 str w1, [xb, 0x104]
19387 str w1, [xb, 0x108]
19388 str w1, [xb, 0x10c]
19389
19390 Though the offsets are out of the range supported by stp, we can
19391 still pair them after adjusting the offset, like:
19392
19393 add scratch, xb, 0x100
19394 stp w1, w1, [scratch]
19395 stp w1, w1, [scratch, 0x8]
19396
19397 The peephole patterns detecting this opportunity should guarantee
19398 the scratch register is available. */
19399
19400 bool
19401 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
19402 scalar_mode mode)
19403 {
19404 const int num_insns = 4;
19405 enum reg_class rclass;
19406 HOST_WIDE_INT offvals[num_insns], msize;
19407 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
19408
19409 if (load)
19410 {
19411 for (int i = 0; i < num_insns; i++)
19412 {
19413 reg[i] = operands[2 * i];
19414 mem[i] = operands[2 * i + 1];
19415
19416 gcc_assert (REG_P (reg[i]));
19417 }
19418
19419 /* Do not attempt to merge the loads if the loads clobber each other. */
19420 for (int i = 0; i < 8; i += 2)
19421 for (int j = i + 2; j < 8; j += 2)
19422 if (reg_overlap_mentioned_p (operands[i], operands[j]))
19423 return false;
19424 }
19425 else
19426 for (int i = 0; i < num_insns; i++)
19427 {
19428 mem[i] = operands[2 * i];
19429 reg[i] = operands[2 * i + 1];
19430 }
19431
19432 /* Skip if memory operand is by itself valid for ldp/stp. */
19433 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
19434 return false;
19435
19436 for (int i = 0; i < num_insns; i++)
19437 {
19438 /* The mems cannot be volatile. */
19439 if (MEM_VOLATILE_P (mem[i]))
19440 return false;
19441
19442 /* Check if the addresses are in the form of [base+offset]. */
19443 extract_base_offset_in_addr (mem[i], base + i, offset + i);
19444 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
19445 return false;
19446 }
19447
19448 /* Check if the registers are of the same class. */
19449 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
19450 ? FP_REGS : GENERAL_REGS;
19451
19452 for (int i = 1; i < num_insns; i++)
19453 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
19454 {
19455 if (rclass != FP_REGS)
19456 return false;
19457 }
19458 else
19459 {
19460 if (rclass != GENERAL_REGS)
19461 return false;
19462 }
19463
19464 /* Only the last register in the order in which they occur
19465 may be clobbered by the load. */
19466 if (rclass == GENERAL_REGS && load)
19467 for (int i = 0; i < num_insns - 1; i++)
19468 if (reg_mentioned_p (reg[i], mem[i]))
19469 return false;
19470
19471 /* Check if the bases are the same. */
19472 for (int i = 0; i < num_insns - 1; i++)
19473 if (!rtx_equal_p (base[i], base[i + 1]))
19474 return false;
19475
19476 for (int i = 0; i < num_insns; i++)
19477 offvals[i] = INTVAL (offset[i]);
19478
19479 msize = GET_MODE_SIZE (mode);
19480
19481 /* Check if the offsets can be put in the right order to do an ldp/stp. */
19482 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
19483 aarch64_host_wide_int_compare);
19484
19485 if (!(offvals[1] == offvals[0] + msize
19486 && offvals[3] == offvals[2] + msize))
19487 return false;
19488
19489 /* Check that the offsets are within range of each other. The ldp/stp
19490 instructions have 7-bit immediate offsets, so use 0x80. */
19491 if (offvals[2] - offvals[0] >= msize * 0x80)
19492 return false;
19493
19494 /* The offsets must be aligned with respect to each other. */
19495 if (offvals[0] % msize != offvals[2] % msize)
19496 return false;
19497
19498 /* If we have SImode and slow unaligned ldp,
19499 check that the alignment is at least 8 bytes. */
19500 if (mode == SImode
19501 && (aarch64_tune_params.extra_tuning_flags
19502 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19503 && !optimize_size
19504 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
19505 return false;
19506
19507 return true;
19508 }
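
/* Worked example for the checks above, using the four SImode stores from
   the comment before this function (offsets 0x100, 0x104, 0x108, 0x10c
   from base xb): [xb, 0x100] is just outside the signed 7-bit scaled
   range for SImode (maximum +252), so the early bail-out does not
   trigger; the sorted offsets satisfy offvals[1] == offvals[0] + 4 and
   offvals[3] == offvals[2] + 4; the spread offvals[2] - offvals[0] == 8
   is below 4 * 0x80; and 0x100 and 0x108 agree modulo the access size.
   The function therefore returns true.  */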
19509
19510 /* Given OPERANDS of consecutive load/store, this function pairs them
19511 into LDP/STP after adjusting the offset. It depends on the fact
19512 that the operands can be sorted so the offsets are correct for STP.
19513 MODE is the mode of memory operands. CODE is the rtl operator
19514 which should be applied to all memory operands; it is SIGN_EXTEND,
19515 ZERO_EXTEND or UNKNOWN. */
19516
19517 bool
19518 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
19519 scalar_mode mode, RTX_CODE code)
19520 {
19521 rtx base, offset_1, offset_3, t1, t2;
19522 rtx mem_1, mem_2, mem_3, mem_4;
19523 rtx temp_operands[8];
19524 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
19525 stp_off_upper_limit, stp_off_lower_limit, msize;
19526
19527 /* We make changes on a copy as we may still bail out. */
19528 for (int i = 0; i < 8; i ++)
19529 temp_operands[i] = operands[i];
19530
19531 /* Sort the operands. */
19532 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
19533
19534 /* Copy the memory operands so that if we have to bail for some
19535 reason the original addresses are unchanged. */
19536 if (load)
19537 {
19538 mem_1 = copy_rtx (temp_operands[1]);
19539 mem_2 = copy_rtx (temp_operands[3]);
19540 mem_3 = copy_rtx (temp_operands[5]);
19541 mem_4 = copy_rtx (temp_operands[7]);
19542 }
19543 else
19544 {
19545 mem_1 = copy_rtx (temp_operands[0]);
19546 mem_2 = copy_rtx (temp_operands[2]);
19547 mem_3 = copy_rtx (temp_operands[4]);
19548 mem_4 = copy_rtx (temp_operands[6]);
19549 gcc_assert (code == UNKNOWN);
19550 }
19551
19552 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19553 extract_base_offset_in_addr (mem_3, &base, &offset_3);
19554 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
19555 && offset_3 != NULL_RTX);
19556
19557 /* Adjust the offset so it fits in an LDP/STP instruction. */
19558 msize = GET_MODE_SIZE (mode);
19559 stp_off_upper_limit = msize * (0x40 - 1);
19560 stp_off_lower_limit = - msize * 0x40;
19561
19562 off_val_1 = INTVAL (offset_1);
19563 off_val_3 = INTVAL (offset_3);
19564
19565 /* The base offset is optimally halfway between the two STP/LDP offsets. */
19566 if (msize <= 4)
19567 base_off = (off_val_1 + off_val_3) / 2;
19568 else
19569 /* However, due to issues with negative LDP/STP offset generation for
19570 larger modes (DF, DI and vector modes), we must not use negative
19571 addresses beyond what 9 signed unadjusted bits can store. Using
19572 the first offset as the base provides the most range in this case. */
19573 base_off = off_val_1;
19574
19575 /* Adjust the base so that it is aligned with the addresses but still
19576 optimal. */
19577 if (base_off % msize != off_val_1 % msize)
19578 /* Fix the offset, bearing in mind we want to make it bigger not
19579 smaller. */
19580 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19581 else if (msize <= 4)
19582 /* The negative range of LDP/STP is one larger than the positive range. */
19583 base_off += msize;
19584
19585 /* Check if base offset is too big or too small. We can attempt to resolve
19586 this issue by setting it to the maximum value and seeing if the offsets
19587 still fit. */
19588 if (base_off >= 0x1000)
19589 {
19590 base_off = 0x1000 - 1;
19591 /* We must still make sure that the base offset is aligned with respect
19592 to the address. But it may not be made any bigger. */
19593 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19594 }
19595
19596 /* Likewise for the case where the base is too small. */
19597 if (base_off <= -0x1000)
19598 {
19599 base_off = -0x1000 + 1;
19600 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19601 }
19602
19603 /* Offset of the first STP/LDP. */
19604 new_off_1 = off_val_1 - base_off;
19605
19606 /* Offset of the second STP/LDP. */
19607 new_off_3 = off_val_3 - base_off;
19608
19609 /* The offsets must be within the range of the LDP/STP instructions. */
19610 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
19611 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
19612 return false;
19613
19614 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
19615 new_off_1), true);
19616 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
19617 new_off_1 + msize), true);
19618 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
19619 new_off_3), true);
19620 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
19621 new_off_3 + msize), true);
19622
19623 if (!aarch64_mem_pair_operand (mem_1, mode)
19624 || !aarch64_mem_pair_operand (mem_3, mode))
19625 return false;
19626
19627 if (code == ZERO_EXTEND)
19628 {
19629 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
19630 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
19631 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
19632 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
19633 }
19634 else if (code == SIGN_EXTEND)
19635 {
19636 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
19637 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
19638 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
19639 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
19640 }
19641
19642 if (load)
19643 {
19644 operands[0] = temp_operands[0];
19645 operands[1] = mem_1;
19646 operands[2] = temp_operands[2];
19647 operands[3] = mem_2;
19648 operands[4] = temp_operands[4];
19649 operands[5] = mem_3;
19650 operands[6] = temp_operands[6];
19651 operands[7] = mem_4;
19652 }
19653 else
19654 {
19655 operands[0] = mem_1;
19656 operands[1] = temp_operands[1];
19657 operands[2] = mem_2;
19658 operands[3] = temp_operands[3];
19659 operands[4] = mem_3;
19660 operands[5] = temp_operands[5];
19661 operands[6] = mem_4;
19662 operands[7] = temp_operands[7];
19663 }
19664
19665 /* Emit adjusting instruction. */
19666 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
19667 /* Emit ldp/stp instructions. */
19668 t1 = gen_rtx_SET (operands[0], operands[1]);
19669 t2 = gen_rtx_SET (operands[2], operands[3]);
19670 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19671 t1 = gen_rtx_SET (operands[4], operands[5]);
19672 t2 = gen_rtx_SET (operands[6], operands[7]);
19673 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19674 return true;
19675 }
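
/* Worked example of the offset adjustment above, continuing the SImode
   stores at 0x100, 0x104, 0x108 and 0x10c (scratch register name is
   arbitrary): off_val_1 = 0x100, off_val_3 = 0x108 and msize = 4, so
   base_off starts at (0x100 + 0x108) / 2 = 0x104, is already correctly
   aligned and is then biased by msize to 0x108.  This gives
   new_off_1 = -8 and new_off_3 = 0, both within [-256, 252], so the
   emitted sequence is roughly

     add x9, xb, 0x108
     stp w1, w1, [x9, -8]
     stp w1, w1, [x9]

   i.e. the scratch base is biased slightly higher than the xb + 0x100
   shown in the introductory comment, exploiting the extra negative
   range of the LDP/STP immediate.  */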
19676
19677 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
19678 it isn't worth branching around empty masked ops (including masked
19679 stores). */
19680
19681 static bool
19682 aarch64_empty_mask_is_expensive (unsigned)
19683 {
19684 return false;
19685 }
19686
19687 /* Return true if a pseudo register should be created and used to hold
19688 the GOT address for PIC code. */
19689
19690 bool
19691 aarch64_use_pseudo_pic_reg (void)
19692 {
19693 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
19694 }
19695
19696 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
19697
19698 static int
19699 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
19700 {
19701 switch (XINT (x, 1))
19702 {
19703 case UNSPEC_GOTSMALLPIC:
19704 case UNSPEC_GOTSMALLPIC28K:
19705 case UNSPEC_GOTTINYPIC:
19706 return 0;
19707 default:
19708 break;
19709 }
19710
19711 return default_unspec_may_trap_p (x, flags);
19712 }
19713
19714
19715 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
19716 return the log2 of that value. Otherwise return -1. */
19717
19718 int
19719 aarch64_fpconst_pow_of_2 (rtx x)
19720 {
19721 const REAL_VALUE_TYPE *r;
19722
19723 if (!CONST_DOUBLE_P (x))
19724 return -1;
19725
19726 r = CONST_DOUBLE_REAL_VALUE (x);
19727
19728 if (REAL_VALUE_NEGATIVE (*r)
19729 || REAL_VALUE_ISNAN (*r)
19730 || REAL_VALUE_ISINF (*r)
19731 || !real_isinteger (r, DFmode))
19732 return -1;
19733
19734 return exact_log2 (real_to_integer (r));
19735 }
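
/* For illustration: a CONST_DOUBLE of 8.0 yields 3, 1.0 yields 0,
   6.0 yields -1 (an integer but not a power of 2), 0.75 yields -1
   (not an integer) and -4.0 yields -1 (negative).  */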
19736
19737 /* If X is a vector of equal CONST_DOUBLE values and that value is
19738 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
19739
19740 int
19741 aarch64_vec_fpconst_pow_of_2 (rtx x)
19742 {
19743 int nelts;
19744 if (GET_CODE (x) != CONST_VECTOR
19745 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
19746 return -1;
19747
19748 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
19749 return -1;
19750
19751 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
19752 if (firstval <= 0)
19753 return -1;
19754
19755 for (int i = 1; i < nelts; i++)
19756 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
19757 return -1;
19758
19759 return firstval;
19760 }
19761
19762 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
19763 to float.
19764
19765 __fp16 always promotes through this hook.
19766 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
19767 through the generic excess precision logic rather than here. */
19768
19769 static tree
19770 aarch64_promoted_type (const_tree t)
19771 {
19772 if (SCALAR_FLOAT_TYPE_P (t)
19773 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
19774 return float_type_node;
19775
19776 return NULL_TREE;
19777 }
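
/* Source-level illustration (not compiled here): with this hook in
   effect, arithmetic on the ACLE __fp16 type such as

     __fp16 a, b;
     __fp16 add_fp16 (void) { return a + b; }

   converts A and B to float, adds them in single precision and narrows
   the result back to __fp16, while _Float16 is instead handled by the
   excess-precision logic below.  */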
19778
19779 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
19780
19781 static bool
19782 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
19783 optimization_type opt_type)
19784 {
19785 switch (op)
19786 {
19787 case rsqrt_optab:
19788 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
19789
19790 default:
19791 return true;
19792 }
19793 }
19794
19795 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
19796
19797 static unsigned int
19798 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
19799 int *offset)
19800 {
19801 /* Polynomial invariant 1 == (VG / 2) - 1. */
19802 gcc_assert (i == 1);
19803 *factor = 2;
19804 *offset = 1;
19805 return AARCH64_DWARF_VG;
19806 }
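
/* For example, the poly_int64 16 + 16x (the byte size of a VNx16QI
   vector) is expressed in DWARF as 16 + 16 * (VG / 2 - 1).  For a
   256-bit vector length VG is 4, so the expression evaluates to
   32 bytes, as expected.  */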
19807
19808 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
19809 if MODE is HFmode, and punt to the generic implementation otherwise. */
19810
19811 static bool
19812 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
19813 {
19814 return (mode == HFmode
19815 ? true
19816 : default_libgcc_floating_mode_supported_p (mode));
19817 }
19818
19819 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
19820 if MODE is HFmode, and punt to the generic implementation otherwise. */
19821
19822 static bool
19823 aarch64_scalar_mode_supported_p (scalar_mode mode)
19824 {
19825 return (mode == HFmode
19826 ? true
19827 : default_scalar_mode_supported_p (mode));
19828 }
19829
19830 /* Set the value of FLT_EVAL_METHOD.
19831 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
19832
19833 0: evaluate all operations and constants, whose semantic type has at
19834 most the range and precision of type float, to the range and
19835 precision of float; evaluate all other operations and constants to
19836 the range and precision of the semantic type;
19837
19838 N, where _FloatN is a supported interchange floating type:
19839 evaluate all operations and constants, whose semantic type has at
19840 most the range and precision of _FloatN type, to the range and
19841 precision of the _FloatN type; evaluate all other operations and
19842 constants to the range and precision of the semantic type;
19843
19844 If we have the ARMv8.2-A extensions then we support _Float16 in native
19845 precision, so we should set this to 16. Otherwise, we support the type,
19846 but want to evaluate expressions in float precision, so set this to
19847 0. */
19848
19849 static enum flt_eval_method
19850 aarch64_excess_precision (enum excess_precision_type type)
19851 {
19852 switch (type)
19853 {
19854 case EXCESS_PRECISION_TYPE_FAST:
19855 case EXCESS_PRECISION_TYPE_STANDARD:
19856 /* We can calculate either in 16-bit range and precision or
19857 32-bit range and precision. Make that decision based on whether
19858 we have native support for the ARMv8.2-A 16-bit floating-point
19859 instructions or not. */
19860 return (TARGET_FP_F16INST
19861 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
19862 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
19863 case EXCESS_PRECISION_TYPE_IMPLICIT:
19864 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
19865 default:
19866 gcc_unreachable ();
19867 }
19868 return FLT_EVAL_METHOD_UNPREDICTABLE;
19869 }
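
/* Source-level illustration (not compiled here): given

     _Float16 a, b, c;
     _Float16 fma16 (void) { return a * b + c; }

   a target with the ARMv8.2-A FP16 instructions reports
   FLT_EVAL_METHOD 16 and may evaluate the whole expression in half
   precision, whereas a target without them reports 0, widens A, B and
   C to float, computes in single precision and narrows the result.  */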
19870
19871 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
19872 scheduled for speculative execution. Reject the long-running division
19873 and square-root instructions. */
19874
19875 static bool
19876 aarch64_sched_can_speculate_insn (rtx_insn *insn)
19877 {
19878 switch (get_attr_type (insn))
19879 {
19880 case TYPE_SDIV:
19881 case TYPE_UDIV:
19882 case TYPE_FDIVS:
19883 case TYPE_FDIVD:
19884 case TYPE_FSQRTS:
19885 case TYPE_FSQRTD:
19886 case TYPE_NEON_FP_SQRT_S:
19887 case TYPE_NEON_FP_SQRT_D:
19888 case TYPE_NEON_FP_SQRT_S_Q:
19889 case TYPE_NEON_FP_SQRT_D_Q:
19890 case TYPE_NEON_FP_DIV_S:
19891 case TYPE_NEON_FP_DIV_D:
19892 case TYPE_NEON_FP_DIV_S_Q:
19893 case TYPE_NEON_FP_DIV_D_Q:
19894 return false;
19895 default:
19896 return true;
19897 }
19898 }
19899
19900 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
19901
19902 static int
19903 aarch64_compute_pressure_classes (reg_class *classes)
19904 {
19905 int i = 0;
19906 classes[i++] = GENERAL_REGS;
19907 classes[i++] = FP_REGS;
19908 /* PR_REGS isn't a useful pressure class because many predicate pseudo
19909 registers need to go in PR_LO_REGS at some point during their
19910 lifetime. Splitting it into two halves has the effect of making
19911 all predicates count against PR_LO_REGS, so that we try whenever
19912 possible to restrict the number of live predicates to 8. This
19913 greatly reduces the amount of spilling in certain loops. */
19914 classes[i++] = PR_LO_REGS;
19915 classes[i++] = PR_HI_REGS;
19916 return i;
19917 }
19918
19919 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
19920
19921 static bool
19922 aarch64_can_change_mode_class (machine_mode from,
19923 machine_mode to, reg_class_t)
19924 {
19925 if (BYTES_BIG_ENDIAN)
19926 {
19927 bool from_sve_p = aarch64_sve_data_mode_p (from);
19928 bool to_sve_p = aarch64_sve_data_mode_p (to);
19929
19930 /* Don't allow changes between SVE data modes and non-SVE modes.
19931 See the comment at the head of aarch64-sve.md for details. */
19932 if (from_sve_p != to_sve_p)
19933 return false;
19934
19935 /* Don't allow changes in element size: lane 0 of the new vector
19936 would not then be lane 0 of the old vector. See the comment
19937 above aarch64_maybe_expand_sve_subreg_move for a more detailed
19938 description.
19939
19940 In the worst case, this forces a register to be spilled in
19941 one mode and reloaded in the other, which handles the
19942 endianness correctly. */
19943 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
19944 return false;
19945 }
19946 return true;
19947 }
19948
19949 /* Implement TARGET_EARLY_REMAT_MODES. */
19950
19951 static void
19952 aarch64_select_early_remat_modes (sbitmap modes)
19953 {
19954 /* SVE values are not normally live across a call, so it should be
19955 worth doing early rematerialization even in VL-specific mode. */
19956 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
19957 {
19958 machine_mode mode = (machine_mode) i;
19959 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
19960 if (vec_flags & VEC_ANY_SVE)
19961 bitmap_set_bit (modes, i);
19962 }
19963 }
19964
19965 /* Override the default target speculation_safe_value. */
19966 static rtx
19967 aarch64_speculation_safe_value (machine_mode mode,
19968 rtx result, rtx val, rtx failval)
19969 {
19970 /* Maybe we should warn if falling back to hard barriers. They are
19971 likely to be noticeably more expensive than the alternative below. */
19972 if (!aarch64_track_speculation)
19973 return default_speculation_safe_value (mode, result, val, failval);
19974
19975 if (!REG_P (val))
19976 val = copy_to_mode_reg (mode, val);
19977
19978 if (!aarch64_reg_or_zero (failval, mode))
19979 failval = copy_to_mode_reg (mode, failval);
19980
19981 emit_insn (gen_despeculate_copy (mode, result, val, failval));
19982 return result;
19983 }
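
/* Source-level illustration of the builtin this hook expands (the
   function and array names are arbitrary, not compiled here):

     int
     load_element (int *array, unsigned int idx, unsigned int len)
     {
       if (idx < len)
         return array[__builtin_speculation_safe_value (idx, 0)];
       return 0;
     }

   With -mtrack-speculation the index is forced to the failval 0 when
   the bounds check is executed under misspeculation, using the
   despeculate_copy sequence above; otherwise the generic hard
   speculation barrier is emitted instead.  */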
19984
19985 /* Implement TARGET_ESTIMATED_POLY_VALUE.
19986 Look into the tuning structure for an estimate.
19987 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
19988 Advanced SIMD 128 bits. */
19989
19990 static HOST_WIDE_INT
19991 aarch64_estimated_poly_value (poly_int64 val)
19992 {
19993 enum aarch64_sve_vector_bits_enum width_source
19994 = aarch64_tune_params.sve_width;
19995
19996 /* If we still don't have an estimate, use the default. */
19997 if (width_source == SVE_SCALABLE)
19998 return default_estimated_poly_value (val);
19999
20000 HOST_WIDE_INT over_128 = width_source - 128;
20001 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
20002 }
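
/* For example, if the tuning structure sets sve_width to SVE_256, then
   over_128 is 128 and a poly_int64 such as 4 + 4x is estimated as
   4 + 4 * 128 / 128 = 8; with SVE_SCALABLE the generic default
   estimate is used instead.  */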
20003
20004
20005 /* Return true for types that could be supported as SIMD return or
20006 argument types. */
20007
20008 static bool
20009 supported_simd_type (tree t)
20010 {
20011 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
20012 {
20013 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
20014 return s == 1 || s == 2 || s == 4 || s == 8;
20015 }
20016 return false;
20017 }
20018
20019 /* Return true for types that currently are supported as SIMD return
20020 or argument types. */
20021
20022 static bool
20023 currently_supported_simd_type (tree t, tree b)
20024 {
20025 if (COMPLEX_FLOAT_TYPE_P (t))
20026 return false;
20027
20028 if (TYPE_SIZE (t) != TYPE_SIZE (b))
20029 return false;
20030
20031 return supported_simd_type (t);
20032 }
20033
20034 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
20035
20036 static int
20037 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
20038 struct cgraph_simd_clone *clonei,
20039 tree base_type, int num)
20040 {
20041 tree t, ret_type, arg_type;
20042 unsigned int elt_bits, vec_bits, count;
20043
20044 if (!TARGET_SIMD)
20045 return 0;
20046
20047 if (clonei->simdlen
20048 && (clonei->simdlen < 2
20049 || clonei->simdlen > 1024
20050 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
20051 {
20052 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20053 "unsupported simdlen %d", clonei->simdlen);
20054 return 0;
20055 }
20056
20057 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
20058 if (TREE_CODE (ret_type) != VOID_TYPE
20059 && !currently_supported_simd_type (ret_type, base_type))
20060 {
20061 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
20062 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20063 "GCC does not currently support mixed size types "
20064 "for %<simd%> functions");
20065 else if (supported_simd_type (ret_type))
20066 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20067 "GCC does not currently support return type %qT "
20068 "for %<simd%> functions", ret_type);
20069 else
20070 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20071 "unsupported return type %qT for %<simd%> functions",
20072 ret_type);
20073 return 0;
20074 }
20075
20076 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
20077 {
20078 arg_type = TREE_TYPE (t);
20079
20080 if (!currently_supported_simd_type (arg_type, base_type))
20081 {
20082 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
20083 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20084 "GCC does not currently support mixed size types "
20085 "for %<simd%> functions");
20086 else
20087 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20088 "GCC does not currently support argument type %qT "
20089 "for %<simd%> functions", arg_type);
20090 return 0;
20091 }
20092 }
20093
20094 clonei->vecsize_mangle = 'n';
20095 clonei->mask_mode = VOIDmode;
20096 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
20097 if (clonei->simdlen == 0)
20098 {
20099 count = 2;
20100 vec_bits = (num == 0 ? 64 : 128);
20101 clonei->simdlen = vec_bits / elt_bits;
20102 }
20103 else
20104 {
20105 count = 1;
20106 vec_bits = clonei->simdlen * elt_bits;
20107 if (vec_bits != 64 && vec_bits != 128)
20108 {
20109 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20110 "GCC does not currently support simdlen %d for type %qT",
20111 clonei->simdlen, base_type);
20112 return 0;
20113 }
20114 }
20115 clonei->vecsize_int = vec_bits;
20116 clonei->vecsize_float = vec_bits;
20117 return count;
20118 }
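
/* Source-level illustration (not compiled here): for

     #pragma omp declare simd
     float scale (float x) { return 0.5f * x; }

   the base type is float (32 bits) and no simdlen is given, so the
   hook returns two clones with the Advanced SIMD 'n' mangling: one
   with simdlen 2 (64-bit vectors) and one with simdlen 4 (128-bit
   vectors).  An explicit simdlen whose vector size is neither 64 nor
   128 bits is warned about and rejected.  */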
20119
20120 /* Implement TARGET_SIMD_CLONE_ADJUST. */
20121
20122 static void
20123 aarch64_simd_clone_adjust (struct cgraph_node *node)
20124 {
20125 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
20126 use the correct ABI. */
20127
20128 tree t = TREE_TYPE (node->decl);
20129 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
20130 TYPE_ATTRIBUTES (t));
20131 }
20132
20133 /* Implement TARGET_SIMD_CLONE_USABLE. */
20134
20135 static int
20136 aarch64_simd_clone_usable (struct cgraph_node *node)
20137 {
20138 switch (node->simdclone->vecsize_mangle)
20139 {
20140 case 'n':
20141 if (!TARGET_SIMD)
20142 return -1;
20143 return 0;
20144 default:
20145 gcc_unreachable ();
20146 }
20147 }
20148
20149 /* Implement TARGET_COMP_TYPE_ATTRIBUTES. */
20150
20151 static int
20152 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
20153 {
20154 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
20155 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
20156 return 0;
20157 return 1;
20158 }
20159
20160 /* Implement TARGET_GET_MULTILIB_ABI_NAME. */
20161
20162 static const char *
20163 aarch64_get_multilib_abi_name (void)
20164 {
20165 if (TARGET_BIG_END)
20166 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
20167 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
20168 }
20169
20170 /* Implement TARGET_STACK_PROTECT_GUARD. In the case of a
20171 global variable based guard, use the default; otherwise
20172 return a null tree. */
20173 static tree
20174 aarch64_stack_protect_guard (void)
20175 {
20176 if (aarch64_stack_protector_guard == SSP_GLOBAL)
20177 return default_stack_protect_guard ();
20178
20179 return NULL_TREE;
20180 }
20181
20182 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
20183 section at the end if needed. */
20184 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
20185 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
20186 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
20187 void
20188 aarch64_file_end_indicate_exec_stack ()
20189 {
20190 file_end_indicate_exec_stack ();
20191
20192 unsigned feature_1_and = 0;
20193 if (aarch64_bti_enabled ())
20194 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
20195
20196 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
20197 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
20198
20199 if (feature_1_and)
20200 {
20201 /* Generate .note.gnu.property section. */
20202 switch_to_section (get_section (".note.gnu.property",
20203 SECTION_NOTYPE, NULL));
20204
20205 /* PT_NOTE header: namesz, descsz, type.
20206 namesz = 4 ("GNU\0")
20207 descsz = 16 (Size of the program property array)
20208 [(12 + padding) * Number of array elements]
20209 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
20210 assemble_align (POINTER_SIZE);
20211 assemble_integer (GEN_INT (4), 4, 32, 1);
20212 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
20213 assemble_integer (GEN_INT (5), 4, 32, 1);
20214
20215 /* PT_NOTE name. */
20216 assemble_string ("GNU", 4);
20217
20218 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
20219 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
20220 datasz = 4
20221 data = feature_1_and. */
20222 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
20223 assemble_integer (GEN_INT (4), 4, 32, 1);
20224 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
20225
20226 /* Pad the size of the note to the required alignment. */
20227 assemble_align (POINTER_SIZE);
20228 }
20229 }
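
/* Illustrative layout of the note emitted above when both BTI and
   PAC-RET are enabled, for the LP64 ABI (a sketch of the data, not
   verbatim assembler output):

     namesz    = 4            "GNU" plus terminating NUL
     descsz    = 16           one 12-byte property padded to 8 bytes
     type      = 5            NT_GNU_PROPERTY_TYPE_0
     name      = "GNU\0"
     pr_type   = 0xc0000000   GNU_PROPERTY_AARCH64_FEATURE_1_AND
     pr_datasz = 4
     pr_data   = 0x3          BTI (bit 0) | PAC (bit 1)  */
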
20230 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
20231 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
20232 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
20233
20234 /* Target-specific selftests. */
20235
20236 #if CHECKING_P
20237
20238 namespace selftest {
20239
20240 /* Selftest for the RTL loader.
20241 Verify that the RTL loader copes with a dump from
20242 print_rtx_function. This is essentially just a test that class
20243 function_reader can handle a real dump, but it also verifies
20244 that lookup_reg_by_dump_name correctly handles hard regs.
20245 The presence of hard reg names in the dump means that the test is
20246 target-specific, hence it is in this file. */
20247
20248 static void
20249 aarch64_test_loading_full_dump ()
20250 {
20251 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
20252
20253 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
20254
20255 rtx_insn *insn_1 = get_insn_by_uid (1);
20256 ASSERT_EQ (NOTE, GET_CODE (insn_1));
20257
20258 rtx_insn *insn_15 = get_insn_by_uid (15);
20259 ASSERT_EQ (INSN, GET_CODE (insn_15));
20260 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
20261
20262 /* Verify crtl->return_rtx. */
20263 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
20264 ASSERT_EQ (0, REGNO (crtl->return_rtx));
20265 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
20266 }
20267
20268 /* Run all target-specific selftests. */
20269
20270 static void
20271 aarch64_run_selftests (void)
20272 {
20273 aarch64_test_loading_full_dump ();
20274 }
20275
20276 } // namespace selftest
20277
20278 #endif /* #if CHECKING_P */
20279
20280 #undef TARGET_STACK_PROTECT_GUARD
20281 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
20282
20283 #undef TARGET_ADDRESS_COST
20284 #define TARGET_ADDRESS_COST aarch64_address_cost
20285
20286 /* This hook determines whether unnamed bitfields affect the alignment
20287 of the containing structure. The hook returns true if the structure
20288 should inherit the alignment requirements of an unnamed bitfield's
20289 type. */
20290 #undef TARGET_ALIGN_ANON_BITFIELD
20291 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
20292
20293 #undef TARGET_ASM_ALIGNED_DI_OP
20294 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
20295
20296 #undef TARGET_ASM_ALIGNED_HI_OP
20297 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
20298
20299 #undef TARGET_ASM_ALIGNED_SI_OP
20300 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
20301
20302 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
20303 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
20304 hook_bool_const_tree_hwi_hwi_const_tree_true
20305
20306 #undef TARGET_ASM_FILE_START
20307 #define TARGET_ASM_FILE_START aarch64_start_file
20308
20309 #undef TARGET_ASM_OUTPUT_MI_THUNK
20310 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
20311
20312 #undef TARGET_ASM_SELECT_RTX_SECTION
20313 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
20314
20315 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
20316 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
20317
20318 #undef TARGET_BUILD_BUILTIN_VA_LIST
20319 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
20320
20321 #undef TARGET_CALLEE_COPIES
20322 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
20323
20324 #undef TARGET_CAN_ELIMINATE
20325 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
20326
20327 #undef TARGET_CAN_INLINE_P
20328 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
20329
20330 #undef TARGET_CANNOT_FORCE_CONST_MEM
20331 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
20332
20333 #undef TARGET_CASE_VALUES_THRESHOLD
20334 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
20335
20336 #undef TARGET_CONDITIONAL_REGISTER_USAGE
20337 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
20338
20339 /* Only the least significant bit is used for initialization guard
20340 variables. */
20341 #undef TARGET_CXX_GUARD_MASK_BIT
20342 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
20343
20344 #undef TARGET_C_MODE_FOR_SUFFIX
20345 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
20346
20347 #ifdef TARGET_BIG_ENDIAN_DEFAULT
20348 #undef TARGET_DEFAULT_TARGET_FLAGS
20349 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
20350 #endif
20351
20352 #undef TARGET_CLASS_MAX_NREGS
20353 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
20354
20355 #undef TARGET_BUILTIN_DECL
20356 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
20357
20358 #undef TARGET_BUILTIN_RECIPROCAL
20359 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
20360
20361 #undef TARGET_C_EXCESS_PRECISION
20362 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
20363
20364 #undef TARGET_EXPAND_BUILTIN
20365 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
20366
20367 #undef TARGET_EXPAND_BUILTIN_VA_START
20368 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
20369
20370 #undef TARGET_FOLD_BUILTIN
20371 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
20372
20373 #undef TARGET_FUNCTION_ARG
20374 #define TARGET_FUNCTION_ARG aarch64_function_arg
20375
20376 #undef TARGET_FUNCTION_ARG_ADVANCE
20377 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
20378
20379 #undef TARGET_FUNCTION_ARG_BOUNDARY
20380 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
20381
20382 #undef TARGET_FUNCTION_ARG_PADDING
20383 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
20384
20385 #undef TARGET_GET_RAW_RESULT_MODE
20386 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
20387 #undef TARGET_GET_RAW_ARG_MODE
20388 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
20389
20390 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
20391 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
20392
20393 #undef TARGET_FUNCTION_VALUE
20394 #define TARGET_FUNCTION_VALUE aarch64_function_value
20395
20396 #undef TARGET_FUNCTION_VALUE_REGNO_P
20397 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
20398
20399 #undef TARGET_GIMPLE_FOLD_BUILTIN
20400 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
20401
20402 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
20403 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
20404
20405 #undef TARGET_INIT_BUILTINS
20406 #define TARGET_INIT_BUILTINS aarch64_init_builtins
20407
20408 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
20409 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
20410 aarch64_ira_change_pseudo_allocno_class
20411
20412 #undef TARGET_LEGITIMATE_ADDRESS_P
20413 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
20414
20415 #undef TARGET_LEGITIMATE_CONSTANT_P
20416 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
20417
20418 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
20419 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
20420 aarch64_legitimize_address_displacement
20421
20422 #undef TARGET_LIBGCC_CMP_RETURN_MODE
20423 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
20424
20425 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
20426 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
20427 aarch64_libgcc_floating_mode_supported_p
20428
20429 #undef TARGET_MANGLE_TYPE
20430 #define TARGET_MANGLE_TYPE aarch64_mangle_type
20431
20432 #undef TARGET_MEMORY_MOVE_COST
20433 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
20434
20435 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
20436 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
20437
20438 #undef TARGET_MUST_PASS_IN_STACK
20439 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
20440
20441 /* This target hook should return true if accesses to volatile bitfields
20442 should use the narrowest mode possible. It should return false if these
20443 accesses should use the bitfield container type. */
20444 #undef TARGET_NARROW_VOLATILE_BITFIELD
20445 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
20446
20447 #undef TARGET_OPTION_OVERRIDE
20448 #define TARGET_OPTION_OVERRIDE aarch64_override_options
20449
20450 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
20451 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
20452 aarch64_override_options_after_change
20453
20454 #undef TARGET_OPTION_SAVE
20455 #define TARGET_OPTION_SAVE aarch64_option_save
20456
20457 #undef TARGET_OPTION_RESTORE
20458 #define TARGET_OPTION_RESTORE aarch64_option_restore
20459
20460 #undef TARGET_OPTION_PRINT
20461 #define TARGET_OPTION_PRINT aarch64_option_print
20462
20463 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
20464 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
20465
20466 #undef TARGET_SET_CURRENT_FUNCTION
20467 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
20468
20469 #undef TARGET_PASS_BY_REFERENCE
20470 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
20471
20472 #undef TARGET_PREFERRED_RELOAD_CLASS
20473 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
20474
20475 #undef TARGET_SCHED_REASSOCIATION_WIDTH
20476 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
20477
20478 #undef TARGET_PROMOTED_TYPE
20479 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
20480
20481 #undef TARGET_SECONDARY_RELOAD
20482 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
20483
20484 #undef TARGET_SHIFT_TRUNCATION_MASK
20485 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
20486
20487 #undef TARGET_SETUP_INCOMING_VARARGS
20488 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
20489
20490 #undef TARGET_STRUCT_VALUE_RTX
20491 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
20492
20493 #undef TARGET_REGISTER_MOVE_COST
20494 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
20495
20496 #undef TARGET_RETURN_IN_MEMORY
20497 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
20498
20499 #undef TARGET_RETURN_IN_MSB
20500 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
20501
20502 #undef TARGET_RTX_COSTS
20503 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
20504
20505 #undef TARGET_SCALAR_MODE_SUPPORTED_P
20506 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
20507
20508 #undef TARGET_SCHED_ISSUE_RATE
20509 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
20510
20511 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
20512 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
20513 aarch64_sched_first_cycle_multipass_dfa_lookahead
20514
20515 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
20516 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
20517 aarch64_first_cycle_multipass_dfa_lookahead_guard
20518
20519 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
20520 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
20521 aarch64_get_separate_components
20522
20523 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
20524 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
20525 aarch64_components_for_bb
20526
20527 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
20528 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
20529 aarch64_disqualify_components
20530
20531 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
20532 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
20533 aarch64_emit_prologue_components
20534
20535 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
20536 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
20537 aarch64_emit_epilogue_components
20538
20539 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
20540 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
20541 aarch64_set_handled_components
20542
20543 #undef TARGET_TRAMPOLINE_INIT
20544 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
20545
20546 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
20547 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
20548
20549 #undef TARGET_VECTOR_MODE_SUPPORTED_P
20550 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
20551
20552 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
20553 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
20554 aarch64_builtin_support_vector_misalignment
20555
20556 #undef TARGET_ARRAY_MODE
20557 #define TARGET_ARRAY_MODE aarch64_array_mode
20558
20559 #undef TARGET_ARRAY_MODE_SUPPORTED_P
20560 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
20561
20562 #undef TARGET_VECTORIZE_ADD_STMT_COST
20563 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
20564
20565 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
20566 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
20567 aarch64_builtin_vectorization_cost
20568
20569 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
20570 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
20571
20572 #undef TARGET_VECTORIZE_BUILTINS
20573 #define TARGET_VECTORIZE_BUILTINS
20574
20575 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
20576 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
20577 aarch64_builtin_vectorized_function
20578
20579 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
20580 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
20581 aarch64_autovectorize_vector_sizes
20582
20583 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
20584 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
20585 aarch64_atomic_assign_expand_fenv
20586
20587 /* Section anchor support. */
20588
20589 #undef TARGET_MIN_ANCHOR_OFFSET
20590 #define TARGET_MIN_ANCHOR_OFFSET -256
20591
20592 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
20593 byte offset; we can do much more for larger data types, but have no way
20594 to determine the size of the access. We assume accesses are aligned. */
20595 #undef TARGET_MAX_ANCHOR_OFFSET
20596 #define TARGET_MAX_ANCHOR_OFFSET 4095
20597
20598 #undef TARGET_VECTOR_ALIGNMENT
20599 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
20600
20601 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
20602 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
20603 aarch64_vectorize_preferred_vector_alignment
20604 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
20605 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
20606 aarch64_simd_vector_alignment_reachable
20607
20608 /* vec_perm support. */
20609
20610 #undef TARGET_VECTORIZE_VEC_PERM_CONST
20611 #define TARGET_VECTORIZE_VEC_PERM_CONST \
20612 aarch64_vectorize_vec_perm_const
20613
20614 #undef TARGET_VECTORIZE_GET_MASK_MODE
20615 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
20616 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
20617 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
20618 aarch64_empty_mask_is_expensive
20619 #undef TARGET_PREFERRED_ELSE_VALUE
20620 #define TARGET_PREFERRED_ELSE_VALUE \
20621 aarch64_preferred_else_value
20622
20623 #undef TARGET_INIT_LIBFUNCS
20624 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
20625
20626 #undef TARGET_FIXED_CONDITION_CODE_REGS
20627 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
20628
20629 #undef TARGET_FLAGS_REGNUM
20630 #define TARGET_FLAGS_REGNUM CC_REGNUM
20631
20632 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
20633 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
20634
20635 #undef TARGET_ASAN_SHADOW_OFFSET
20636 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
20637
20638 #undef TARGET_LEGITIMIZE_ADDRESS
20639 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
20640
20641 #undef TARGET_SCHED_CAN_SPECULATE_INSN
20642 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
20643
20644 #undef TARGET_CAN_USE_DOLOOP_P
20645 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
20646
20647 #undef TARGET_SCHED_ADJUST_PRIORITY
20648 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
20649
20650 #undef TARGET_SCHED_MACRO_FUSION_P
20651 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
20652
20653 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
20654 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
20655
20656 #undef TARGET_SCHED_FUSION_PRIORITY
20657 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
20658
20659 #undef TARGET_UNSPEC_MAY_TRAP_P
20660 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
20661
20662 #undef TARGET_USE_PSEUDO_PIC_REG
20663 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
20664
20665 #undef TARGET_PRINT_OPERAND
20666 #define TARGET_PRINT_OPERAND aarch64_print_operand
20667
20668 #undef TARGET_PRINT_OPERAND_ADDRESS
20669 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
20670
20671 #undef TARGET_OPTAB_SUPPORTED_P
20672 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
20673
20674 #undef TARGET_OMIT_STRUCT_RETURN_REG
20675 #define TARGET_OMIT_STRUCT_RETURN_REG true
20676
20677 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
20678 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
20679 aarch64_dwarf_poly_indeterminate_value
20680
20681 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
20682 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
20683 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
20684
20685 #undef TARGET_HARD_REGNO_NREGS
20686 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
20687 #undef TARGET_HARD_REGNO_MODE_OK
20688 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
20689
20690 #undef TARGET_MODES_TIEABLE_P
20691 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
20692
20693 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
20694 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
20695 aarch64_hard_regno_call_part_clobbered
20696
20697 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
20698 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
20699 aarch64_remove_extra_call_preserved_regs
20700
20701 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
20702 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
20703 aarch64_return_call_with_max_clobbers
20704
20705 #undef TARGET_CONSTANT_ALIGNMENT
20706 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
20707
20708 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
20709 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
20710 aarch64_stack_clash_protection_alloca_probe_range
20711
20712 #undef TARGET_COMPUTE_PRESSURE_CLASSES
20713 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
20714
20715 #undef TARGET_CAN_CHANGE_MODE_CLASS
20716 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
20717
20718 #undef TARGET_SELECT_EARLY_REMAT_MODES
20719 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
20720
20721 #undef TARGET_SPECULATION_SAFE_VALUE
20722 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
20723
20724 #undef TARGET_ESTIMATED_POLY_VALUE
20725 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
20726
20727 #undef TARGET_ATTRIBUTE_TABLE
20728 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
20729
20730 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
20731 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
20732 aarch64_simd_clone_compute_vecsize_and_simdlen
20733
20734 #undef TARGET_SIMD_CLONE_ADJUST
20735 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
20736
20737 #undef TARGET_SIMD_CLONE_USABLE
20738 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
20739
20740 #undef TARGET_COMP_TYPE_ATTRIBUTES
20741 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
20742
20743 #undef TARGET_GET_MULTILIB_ABI_NAME
20744 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
20745
20746 #if CHECKING_P
20747 #undef TARGET_RUN_TARGET_SELFTESTS
20748 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
20749 #endif /* #if CHECKING_P */
20750
20751 #undef TARGET_ASM_POST_CFI_STARTPROC
20752 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
20753
20754 struct gcc_target targetm = TARGET_INITIALIZER;
20755
20756 #include "gt-aarch64.h"