1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76 #include "expmed.h"
77
78 /* This file should be included last. */
79 #include "target-def.h"
80
81 /* Defined for convenience. */
82 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
83
84 /* Information about a legitimate vector immediate operand. */
85 struct simd_immediate_info
86 {
87 enum insn_type { MOV, MVN, INDEX, PTRUE };
88 enum modifier_type { LSL, MSL };
89
90 simd_immediate_info () {}
91 simd_immediate_info (scalar_float_mode, rtx);
92 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
93 insn_type = MOV, modifier_type = LSL,
94 unsigned int = 0);
95 simd_immediate_info (scalar_mode, rtx, rtx);
96 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
97
98 /* The mode of the elements. */
99 scalar_mode elt_mode;
100
101 /* The instruction to use to move the immediate into a vector. */
102 insn_type insn;
103
104 union
105 {
106 /* For MOV and MVN. */
107 struct
108 {
109 /* The value of each element. */
110 rtx value;
111
112 /* The kind of shift modifier to use, and the number of bits to shift.
113 This is (LSL, 0) if no shift is needed. */
114 modifier_type modifier;
115 unsigned int shift;
116 } mov;
117
118 /* For INDEX. */
119 struct
120 {
121 /* The value of the first element and the step to be added for each
122 subsequent element. */
123 rtx base, step;
124 } index;
125
126 /* For PTRUE. */
127 aarch64_svpattern pattern;
128 } u;
129 };
130
131 /* Construct a floating-point immediate in which each element has mode
132 ELT_MODE_IN and value VALUE_IN. */
133 inline simd_immediate_info
134 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
135 : elt_mode (elt_mode_in), insn (MOV)
136 {
137 u.mov.value = value_in;
138 u.mov.modifier = LSL;
139 u.mov.shift = 0;
140 }
141
142 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
143 and value VALUE_IN. The other parameters are as for the structure
144 fields. */
145 inline simd_immediate_info
146 ::simd_immediate_info (scalar_int_mode elt_mode_in,
147 unsigned HOST_WIDE_INT value_in,
148 insn_type insn_in, modifier_type modifier_in,
149 unsigned int shift_in)
150 : elt_mode (elt_mode_in), insn (insn_in)
151 {
152 u.mov.value = gen_int_mode (value_in, elt_mode_in);
153 u.mov.modifier = modifier_in;
154 u.mov.shift = shift_in;
155 }
156
157 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
158 and where element I is equal to BASE_IN + I * STEP_IN. */
159 inline simd_immediate_info
160 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
161 : elt_mode (elt_mode_in), insn (INDEX)
162 {
163 u.index.base = base_in;
164 u.index.step = step_in;
165 }
166
167 /* Construct a predicate that controls elements of mode ELT_MODE_IN
168 and has PTRUE pattern PATTERN_IN. */
169 inline simd_immediate_info
170 ::simd_immediate_info (scalar_int_mode elt_mode_in,
171 aarch64_svpattern pattern_in)
172 : elt_mode (elt_mode_in), insn (PTRUE)
173 {
174 u.pattern = pattern_in;
175 }
176
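/* Illustrative sketch (not part of this file): how the constructors above
   might be used to describe immediates.  The variable names base_rtx and
   step_rtx below are hypothetical placeholders, not identifiers defined in
   this file.

     // A vector of 32-bit elements, each equal to 0x55 << 8, representable
     // as a MOV-style immediate with an LSL #8 modifier:
     simd_immediate_info info1 (SImode, 0x55, simd_immediate_info::MOV,
                                simd_immediate_info::LSL, 8);

     // An INDEX-style immediate whose elements are BASE, BASE + STEP, ...:
     simd_immediate_info info2 (DImode, base_rtx, step_rtx);

   The immediate-validation code later in this file fills in one of these
   forms and then inspects the INSN field and the matching member of U.  */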
177 /* The current code model. */
178 enum aarch64_code_model aarch64_cmodel;
179
180 /* The number of 64-bit elements in an SVE vector. */
181 poly_uint16 aarch64_sve_vg;
182
183 #ifdef HAVE_AS_TLS
184 #undef TARGET_HAVE_TLS
185 #define TARGET_HAVE_TLS 1
186 #endif
187
188 static bool aarch64_composite_type_p (const_tree, machine_mode);
189 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
190 const_tree,
191 machine_mode *, int *,
192 bool *);
193 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
194 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
195 static void aarch64_override_options_after_change (void);
196 static bool aarch64_vector_mode_supported_p (machine_mode);
197 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
198 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
199 const_tree type,
200 int misalignment,
201 bool is_packed);
202 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
203 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
204 aarch64_addr_query_type);
205 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
206
207 /* Major revision number of the ARM Architecture implemented by the target. */
208 unsigned aarch64_architecture_version;
209
210 /* The processor for which instructions should be scheduled. */
211 enum aarch64_processor aarch64_tune = cortexa53;
212
213 /* Mask to specify which instruction scheduling options should be used. */
214 uint64_t aarch64_tune_flags = 0;
215
216 /* Global flag for PC relative loads. */
217 bool aarch64_pcrelative_literal_loads;
218
219 /* Global flag for whether frame pointer is enabled. */
220 bool aarch64_use_frame_pointer;
221
222 #define BRANCH_PROTECT_STR_MAX 255
223 char *accepted_branch_protection_string = NULL;
224
225 static enum aarch64_parse_opt_result
226 aarch64_parse_branch_protection (const char*, char**);
227
228 /* Support for command line parsing of boolean flags in the tuning
229 structures. */
230 struct aarch64_flag_desc
231 {
232 const char* name;
233 unsigned int flag;
234 };
235
236 #define AARCH64_FUSION_PAIR(name, internal_name) \
237 { name, AARCH64_FUSE_##internal_name },
238 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
239 {
240 { "none", AARCH64_FUSE_NOTHING },
241 #include "aarch64-fusion-pairs.def"
242 { "all", AARCH64_FUSE_ALL },
243 { NULL, AARCH64_FUSE_NOTHING }
244 };
245
246 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
247 { name, AARCH64_EXTRA_TUNE_##internal_name },
248 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
249 {
250 { "none", AARCH64_EXTRA_TUNE_NONE },
251 #include "aarch64-tuning-flags.def"
252 { "all", AARCH64_EXTRA_TUNE_ALL },
253 { NULL, AARCH64_EXTRA_TUNE_NONE }
254 };
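/* Illustrative sketch (not part of the build): the tables above are plain
   name/flag pairs terminated by a NULL entry, so a user-supplied token can
   be resolved with a linear scan.  lookup_tuning_flag below is a
   hypothetical helper, not a GCC function; the real option parsing later in
   this file works along the same lines.

     static unsigned int
     lookup_tuning_flag (const char *token)
     {
       for (const aarch64_flag_desc *d = aarch64_tuning_flags; d->name; d++)
         if (strcmp (token, d->name) == 0)
           return d->flag;
       return AARCH64_EXTRA_TUNE_NONE;
     }

   "none" and "all" need no special casing because they are ordinary
   entries in the same table.  */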
255
256 /* Tuning parameters. */
257
258 static const struct cpu_addrcost_table generic_addrcost_table =
259 {
260 {
261 1, /* hi */
262 0, /* si */
263 0, /* di */
264 1, /* ti */
265 },
266 0, /* pre_modify */
267 0, /* post_modify */
268 0, /* register_offset */
269 0, /* register_sextend */
270 0, /* register_zextend */
271 0 /* imm_offset */
272 };
273
274 static const struct cpu_addrcost_table exynosm1_addrcost_table =
275 {
276 {
277 0, /* hi */
278 0, /* si */
279 0, /* di */
280 2, /* ti */
281 },
282 0, /* pre_modify */
283 0, /* post_modify */
284 1, /* register_offset */
285 1, /* register_sextend */
286 2, /* register_zextend */
287 0, /* imm_offset */
288 };
289
290 static const struct cpu_addrcost_table xgene1_addrcost_table =
291 {
292 {
293 1, /* hi */
294 0, /* si */
295 0, /* di */
296 1, /* ti */
297 },
298 1, /* pre_modify */
299 1, /* post_modify */
300 0, /* register_offset */
301 1, /* register_sextend */
302 1, /* register_zextend */
303 0, /* imm_offset */
304 };
305
306 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
307 {
308 {
309 1, /* hi */
310 1, /* si */
311 1, /* di */
312 2, /* ti */
313 },
314 0, /* pre_modify */
315 0, /* post_modify */
316 2, /* register_offset */
317 3, /* register_sextend */
318 3, /* register_zextend */
319 0, /* imm_offset */
320 };
321
322 static const struct cpu_addrcost_table tsv110_addrcost_table =
323 {
324 {
325 1, /* hi */
326 0, /* si */
327 0, /* di */
328 1, /* ti */
329 },
330 0, /* pre_modify */
331 0, /* post_modify */
332 0, /* register_offset */
333 1, /* register_sextend */
334 1, /* register_zextend */
335 0, /* imm_offset */
336 };
337
338 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
339 {
340 {
341 1, /* hi */
342 1, /* si */
343 1, /* di */
344 2, /* ti */
345 },
346 1, /* pre_modify */
347 1, /* post_modify */
348 3, /* register_offset */
349 3, /* register_sextend */
350 3, /* register_zextend */
351 2, /* imm_offset */
352 };
353
354 static const struct cpu_regmove_cost generic_regmove_cost =
355 {
356 1, /* GP2GP */
357 /* Avoid the use of slow int<->fp moves for spilling by setting
358 their cost higher than memmov_cost. */
359 5, /* GP2FP */
360 5, /* FP2GP */
361 2 /* FP2FP */
362 };
363
364 static const struct cpu_regmove_cost cortexa57_regmove_cost =
365 {
366 1, /* GP2GP */
367 /* Avoid the use of slow int<->fp moves for spilling by setting
368 their cost higher than memmov_cost. */
369 5, /* GP2FP */
370 5, /* FP2GP */
371 2 /* FP2FP */
372 };
373
374 static const struct cpu_regmove_cost cortexa53_regmove_cost =
375 {
376 1, /* GP2GP */
377 /* Avoid the use of slow int<->fp moves for spilling by setting
378 their cost higher than memmov_cost. */
379 5, /* GP2FP */
380 5, /* FP2GP */
381 2 /* FP2FP */
382 };
383
384 static const struct cpu_regmove_cost exynosm1_regmove_cost =
385 {
386 1, /* GP2GP */
387 /* Avoid the use of slow int<->fp moves for spilling by setting
388 their cost higher than memmov_cost (actual, 4 and 9). */
389 9, /* GP2FP */
390 9, /* FP2GP */
391 1 /* FP2FP */
392 };
393
394 static const struct cpu_regmove_cost thunderx_regmove_cost =
395 {
396 2, /* GP2GP */
397 2, /* GP2FP */
398 6, /* FP2GP */
399 4 /* FP2FP */
400 };
401
402 static const struct cpu_regmove_cost xgene1_regmove_cost =
403 {
404 1, /* GP2GP */
405 /* Avoid the use of slow int<->fp moves for spilling by setting
406 their cost higher than memmov_cost. */
407 8, /* GP2FP */
408 8, /* FP2GP */
409 2 /* FP2FP */
410 };
411
412 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
413 {
414 2, /* GP2GP */
415 /* Avoid the use of int<->fp moves for spilling. */
416 6, /* GP2FP */
417 6, /* FP2GP */
418 4 /* FP2FP */
419 };
420
421 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
422 {
423 1, /* GP2GP */
424 /* Avoid the use of int<->fp moves for spilling. */
425 8, /* GP2FP */
426 8, /* FP2GP */
427 4 /* FP2FP */
428 };
429
430 static const struct cpu_regmove_cost tsv110_regmove_cost =
431 {
432 1, /* GP2GP */
433 /* Avoid the use of slow int<->fp moves for spilling by setting
434 their cost higher than memmov_cost. */
435 2, /* GP2FP */
436 3, /* FP2GP */
437 2 /* FP2FP */
438 };
439
440 /* Generic costs for vector insn classes. */
441 static const struct cpu_vector_cost generic_vector_cost =
442 {
443 1, /* scalar_int_stmt_cost */
444 1, /* scalar_fp_stmt_cost */
445 1, /* scalar_load_cost */
446 1, /* scalar_store_cost */
447 1, /* vec_int_stmt_cost */
448 1, /* vec_fp_stmt_cost */
449 2, /* vec_permute_cost */
450 1, /* vec_to_scalar_cost */
451 1, /* scalar_to_vec_cost */
452 1, /* vec_align_load_cost */
453 1, /* vec_unalign_load_cost */
454 1, /* vec_unalign_store_cost */
455 1, /* vec_store_cost */
456 3, /* cond_taken_branch_cost */
457 1 /* cond_not_taken_branch_cost */
458 };
459
460 /* QDF24XX costs for vector insn classes. */
461 static const struct cpu_vector_cost qdf24xx_vector_cost =
462 {
463 1, /* scalar_int_stmt_cost */
464 1, /* scalar_fp_stmt_cost */
465 1, /* scalar_load_cost */
466 1, /* scalar_store_cost */
467 1, /* vec_int_stmt_cost */
468 3, /* vec_fp_stmt_cost */
469 2, /* vec_permute_cost */
470 1, /* vec_to_scalar_cost */
471 1, /* scalar_to_vec_cost */
472 1, /* vec_align_load_cost */
473 1, /* vec_unalign_load_cost */
474 1, /* vec_unalign_store_cost */
475 1, /* vec_store_cost */
476 3, /* cond_taken_branch_cost */
477 1 /* cond_not_taken_branch_cost */
478 };
479
480 /* ThunderX costs for vector insn classes. */
481 static const struct cpu_vector_cost thunderx_vector_cost =
482 {
483 1, /* scalar_int_stmt_cost */
484 1, /* scalar_fp_stmt_cost */
485 3, /* scalar_load_cost */
486 1, /* scalar_store_cost */
487 4, /* vec_int_stmt_cost */
488 1, /* vec_fp_stmt_cost */
489 4, /* vec_permute_cost */
490 2, /* vec_to_scalar_cost */
491 2, /* scalar_to_vec_cost */
492 3, /* vec_align_load_cost */
493 5, /* vec_unalign_load_cost */
494 5, /* vec_unalign_store_cost */
495 1, /* vec_store_cost */
496 3, /* cond_taken_branch_cost */
497 3 /* cond_not_taken_branch_cost */
498 };
499
500 static const struct cpu_vector_cost tsv110_vector_cost =
501 {
502 1, /* scalar_int_stmt_cost */
503 1, /* scalar_fp_stmt_cost */
504 5, /* scalar_load_cost */
505 1, /* scalar_store_cost */
506 2, /* vec_int_stmt_cost */
507 2, /* vec_fp_stmt_cost */
508 2, /* vec_permute_cost */
509 3, /* vec_to_scalar_cost */
510 2, /* scalar_to_vec_cost */
511 5, /* vec_align_load_cost */
512 5, /* vec_unalign_load_cost */
513 1, /* vec_unalign_store_cost */
514 1, /* vec_store_cost */
515 1, /* cond_taken_branch_cost */
516 1 /* cond_not_taken_branch_cost */
517 };
518
519 /* Generic costs for vector insn classes. */
520 static const struct cpu_vector_cost cortexa57_vector_cost =
521 {
522 1, /* scalar_int_stmt_cost */
523 1, /* scalar_fp_stmt_cost */
524 4, /* scalar_load_cost */
525 1, /* scalar_store_cost */
526 2, /* vec_int_stmt_cost */
527 2, /* vec_fp_stmt_cost */
528 3, /* vec_permute_cost */
529 8, /* vec_to_scalar_cost */
530 8, /* scalar_to_vec_cost */
531 4, /* vec_align_load_cost */
532 4, /* vec_unalign_load_cost */
533 1, /* vec_unalign_store_cost */
534 1, /* vec_store_cost */
535 1, /* cond_taken_branch_cost */
536 1 /* cond_not_taken_branch_cost */
537 };
538
539 static const struct cpu_vector_cost exynosm1_vector_cost =
540 {
541 1, /* scalar_int_stmt_cost */
542 1, /* scalar_fp_stmt_cost */
543 5, /* scalar_load_cost */
544 1, /* scalar_store_cost */
545 3, /* vec_int_stmt_cost */
546 3, /* vec_fp_stmt_cost */
547 3, /* vec_permute_cost */
548 3, /* vec_to_scalar_cost */
549 3, /* scalar_to_vec_cost */
550 5, /* vec_align_load_cost */
551 5, /* vec_unalign_load_cost */
552 1, /* vec_unalign_store_cost */
553 1, /* vec_store_cost */
554 1, /* cond_taken_branch_cost */
555 1 /* cond_not_taken_branch_cost */
556 };
557
558 /* Generic costs for vector insn classes. */
559 static const struct cpu_vector_cost xgene1_vector_cost =
560 {
561 1, /* scalar_int_stmt_cost */
562 1, /* scalar_fp_stmt_cost */
563 5, /* scalar_load_cost */
564 1, /* scalar_store_cost */
565 2, /* vec_int_stmt_cost */
566 2, /* vec_fp_stmt_cost */
567 2, /* vec_permute_cost */
568 4, /* vec_to_scalar_cost */
569 4, /* scalar_to_vec_cost */
570 10, /* vec_align_load_cost */
571 10, /* vec_unalign_load_cost */
572 2, /* vec_unalign_store_cost */
573 2, /* vec_store_cost */
574 2, /* cond_taken_branch_cost */
575 1 /* cond_not_taken_branch_cost */
576 };
577
578 /* Costs for vector insn classes for Vulcan. */
579 static const struct cpu_vector_cost thunderx2t99_vector_cost =
580 {
581 1, /* scalar_int_stmt_cost */
582 6, /* scalar_fp_stmt_cost */
583 4, /* scalar_load_cost */
584 1, /* scalar_store_cost */
585 5, /* vec_int_stmt_cost */
586 6, /* vec_fp_stmt_cost */
587 3, /* vec_permute_cost */
588 6, /* vec_to_scalar_cost */
589 5, /* scalar_to_vec_cost */
590 8, /* vec_align_load_cost */
591 8, /* vec_unalign_load_cost */
592 4, /* vec_unalign_store_cost */
593 4, /* vec_store_cost */
594 2, /* cond_taken_branch_cost */
595 1 /* cond_not_taken_branch_cost */
596 };
597
598 /* Generic costs for branch instructions. */
599 static const struct cpu_branch_cost generic_branch_cost =
600 {
601 1, /* Predictable. */
602 3 /* Unpredictable. */
603 };
604
605 /* Generic approximation modes. */
606 static const cpu_approx_modes generic_approx_modes =
607 {
608 AARCH64_APPROX_NONE, /* division */
609 AARCH64_APPROX_NONE, /* sqrt */
610 AARCH64_APPROX_NONE /* recip_sqrt */
611 };
612
613 /* Approximation modes for Exynos M1. */
614 static const cpu_approx_modes exynosm1_approx_modes =
615 {
616 AARCH64_APPROX_NONE, /* division */
617 AARCH64_APPROX_ALL, /* sqrt */
618 AARCH64_APPROX_ALL /* recip_sqrt */
619 };
620
621 /* Approximation modes for X-Gene 1. */
622 static const cpu_approx_modes xgene1_approx_modes =
623 {
624 AARCH64_APPROX_NONE, /* division */
625 AARCH64_APPROX_NONE, /* sqrt */
626 AARCH64_APPROX_ALL /* recip_sqrt */
627 };
628
629 /* Generic prefetch settings (which disable prefetch). */
630 static const cpu_prefetch_tune generic_prefetch_tune =
631 {
632 0, /* num_slots */
633 -1, /* l1_cache_size */
634 -1, /* l1_cache_line_size */
635 -1, /* l2_cache_size */
636 true, /* prefetch_dynamic_strides */
637 -1, /* minimum_stride */
638 -1 /* default_opt_level */
639 };
640
641 static const cpu_prefetch_tune exynosm1_prefetch_tune =
642 {
643 0, /* num_slots */
644 -1, /* l1_cache_size */
645 64, /* l1_cache_line_size */
646 -1, /* l2_cache_size */
647 true, /* prefetch_dynamic_strides */
648 -1, /* minimum_stride */
649 -1 /* default_opt_level */
650 };
651
652 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
653 {
654 4, /* num_slots */
655 32, /* l1_cache_size */
656 64, /* l1_cache_line_size */
657 512, /* l2_cache_size */
658 false, /* prefetch_dynamic_strides */
659 2048, /* minimum_stride */
660 3 /* default_opt_level */
661 };
662
663 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
664 {
665 8, /* num_slots */
666 32, /* l1_cache_size */
667 128, /* l1_cache_line_size */
668 16*1024, /* l2_cache_size */
669 true, /* prefetch_dynamic_strides */
670 -1, /* minimum_stride */
671 3 /* default_opt_level */
672 };
673
674 static const cpu_prefetch_tune thunderx_prefetch_tune =
675 {
676 8, /* num_slots */
677 32, /* l1_cache_size */
678 128, /* l1_cache_line_size */
679 -1, /* l2_cache_size */
680 true, /* prefetch_dynamic_strides */
681 -1, /* minimum_stride */
682 -1 /* default_opt_level */
683 };
684
685 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
686 {
687 8, /* num_slots */
688 32, /* l1_cache_size */
689 64, /* l1_cache_line_size */
690 256, /* l2_cache_size */
691 true, /* prefetch_dynamic_strides */
692 -1, /* minimum_stride */
693 -1 /* default_opt_level */
694 };
695
696 static const cpu_prefetch_tune tsv110_prefetch_tune =
697 {
698 0, /* num_slots */
699 64, /* l1_cache_size */
700 64, /* l1_cache_line_size */
701 512, /* l2_cache_size */
702 true, /* prefetch_dynamic_strides */
703 -1, /* minimum_stride */
704 -1 /* default_opt_level */
705 };
706
707 static const cpu_prefetch_tune xgene1_prefetch_tune =
708 {
709 8, /* num_slots */
710 32, /* l1_cache_size */
711 64, /* l1_cache_line_size */
712 256, /* l2_cache_size */
713 true, /* prefetch_dynamic_strides */
714 -1, /* minimum_stride */
715 -1 /* default_opt_level */
716 };
717
718 static const struct tune_params generic_tunings =
719 {
720 &cortexa57_extra_costs,
721 &generic_addrcost_table,
722 &generic_regmove_cost,
723 &generic_vector_cost,
724 &generic_branch_cost,
725 &generic_approx_modes,
726 SVE_NOT_IMPLEMENTED, /* sve_width */
727 4, /* memmov_cost */
728 2, /* issue_rate */
729 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
730 "16:12", /* function_align. */
731 "4", /* jump_align. */
732 "8", /* loop_align. */
733 2, /* int_reassoc_width. */
734 4, /* fp_reassoc_width. */
735 1, /* vec_reassoc_width. */
736 2, /* min_div_recip_mul_sf. */
737 2, /* min_div_recip_mul_df. */
738 0, /* max_case_values. */
739 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
740 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
741 &generic_prefetch_tune
742 };
743
744 static const struct tune_params cortexa35_tunings =
745 {
746 &cortexa53_extra_costs,
747 &generic_addrcost_table,
748 &cortexa53_regmove_cost,
749 &generic_vector_cost,
750 &generic_branch_cost,
751 &generic_approx_modes,
752 SVE_NOT_IMPLEMENTED, /* sve_width */
753 4, /* memmov_cost */
754 1, /* issue_rate */
755 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
756 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
757 "16", /* function_align. */
758 "4", /* jump_align. */
759 "8", /* loop_align. */
760 2, /* int_reassoc_width. */
761 4, /* fp_reassoc_width. */
762 1, /* vec_reassoc_width. */
763 2, /* min_div_recip_mul_sf. */
764 2, /* min_div_recip_mul_df. */
765 0, /* max_case_values. */
766 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
767 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
768 &generic_prefetch_tune
769 };
770
771 static const struct tune_params cortexa53_tunings =
772 {
773 &cortexa53_extra_costs,
774 &generic_addrcost_table,
775 &cortexa53_regmove_cost,
776 &generic_vector_cost,
777 &generic_branch_cost,
778 &generic_approx_modes,
779 SVE_NOT_IMPLEMENTED, /* sve_width */
780 4, /* memmov_cost */
781 2, /* issue_rate */
782 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
783 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
784 "16", /* function_align. */
785 "4", /* jump_align. */
786 "8", /* loop_align. */
787 2, /* int_reassoc_width. */
788 4, /* fp_reassoc_width. */
789 1, /* vec_reassoc_width. */
790 2, /* min_div_recip_mul_sf. */
791 2, /* min_div_recip_mul_df. */
792 0, /* max_case_values. */
793 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
794 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
795 &generic_prefetch_tune
796 };
797
798 static const struct tune_params cortexa57_tunings =
799 {
800 &cortexa57_extra_costs,
801 &generic_addrcost_table,
802 &cortexa57_regmove_cost,
803 &cortexa57_vector_cost,
804 &generic_branch_cost,
805 &generic_approx_modes,
806 SVE_NOT_IMPLEMENTED, /* sve_width */
807 4, /* memmov_cost */
808 3, /* issue_rate */
809 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
810 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
811 "16", /* function_align. */
812 "4", /* jump_align. */
813 "8", /* loop_align. */
814 2, /* int_reassoc_width. */
815 4, /* fp_reassoc_width. */
816 1, /* vec_reassoc_width. */
817 2, /* min_div_recip_mul_sf. */
818 2, /* min_div_recip_mul_df. */
819 0, /* max_case_values. */
820 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
821 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
822 &generic_prefetch_tune
823 };
824
825 static const struct tune_params cortexa72_tunings =
826 {
827 &cortexa57_extra_costs,
828 &generic_addrcost_table,
829 &cortexa57_regmove_cost,
830 &cortexa57_vector_cost,
831 &generic_branch_cost,
832 &generic_approx_modes,
833 SVE_NOT_IMPLEMENTED, /* sve_width */
834 4, /* memmov_cost */
835 3, /* issue_rate */
836 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
837 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
838 "16", /* function_align. */
839 "4", /* jump_align. */
840 "8", /* loop_align. */
841 2, /* int_reassoc_width. */
842 4, /* fp_reassoc_width. */
843 1, /* vec_reassoc_width. */
844 2, /* min_div_recip_mul_sf. */
845 2, /* min_div_recip_mul_df. */
846 0, /* max_case_values. */
847 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
848 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
849 &generic_prefetch_tune
850 };
851
852 static const struct tune_params cortexa73_tunings =
853 {
854 &cortexa57_extra_costs,
855 &generic_addrcost_table,
856 &cortexa57_regmove_cost,
857 &cortexa57_vector_cost,
858 &generic_branch_cost,
859 &generic_approx_modes,
860 SVE_NOT_IMPLEMENTED, /* sve_width */
861 4, /* memmov_cost. */
862 2, /* issue_rate. */
863 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
864 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
865 "16", /* function_align. */
866 "4", /* jump_align. */
867 "8", /* loop_align. */
868 2, /* int_reassoc_width. */
869 4, /* fp_reassoc_width. */
870 1, /* vec_reassoc_width. */
871 2, /* min_div_recip_mul_sf. */
872 2, /* min_div_recip_mul_df. */
873 0, /* max_case_values. */
874 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
875 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
876 &generic_prefetch_tune
877 };
878
879
880
881 static const struct tune_params exynosm1_tunings =
882 {
883 &exynosm1_extra_costs,
884 &exynosm1_addrcost_table,
885 &exynosm1_regmove_cost,
886 &exynosm1_vector_cost,
887 &generic_branch_cost,
888 &exynosm1_approx_modes,
889 SVE_NOT_IMPLEMENTED, /* sve_width */
890 4, /* memmov_cost */
891 3, /* issue_rate */
892 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
893 "4", /* function_align. */
894 "4", /* jump_align. */
895 "4", /* loop_align. */
896 2, /* int_reassoc_width. */
897 4, /* fp_reassoc_width. */
898 1, /* vec_reassoc_width. */
899 2, /* min_div_recip_mul_sf. */
900 2, /* min_div_recip_mul_df. */
901 48, /* max_case_values. */
902 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
903 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
904 &exynosm1_prefetch_tune
905 };
906
907 static const struct tune_params thunderxt88_tunings =
908 {
909 &thunderx_extra_costs,
910 &generic_addrcost_table,
911 &thunderx_regmove_cost,
912 &thunderx_vector_cost,
913 &generic_branch_cost,
914 &generic_approx_modes,
915 SVE_NOT_IMPLEMENTED, /* sve_width */
916 6, /* memmov_cost */
917 2, /* issue_rate */
918 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
919 "8", /* function_align. */
920 "8", /* jump_align. */
921 "8", /* loop_align. */
922 2, /* int_reassoc_width. */
923 4, /* fp_reassoc_width. */
924 1, /* vec_reassoc_width. */
925 2, /* min_div_recip_mul_sf. */
926 2, /* min_div_recip_mul_df. */
927 0, /* max_case_values. */
928 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
929 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
930 &thunderxt88_prefetch_tune
931 };
932
933 static const struct tune_params thunderx_tunings =
934 {
935 &thunderx_extra_costs,
936 &generic_addrcost_table,
937 &thunderx_regmove_cost,
938 &thunderx_vector_cost,
939 &generic_branch_cost,
940 &generic_approx_modes,
941 SVE_NOT_IMPLEMENTED, /* sve_width */
942 6, /* memmov_cost */
943 2, /* issue_rate */
944 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
945 "8", /* function_align. */
946 "8", /* jump_align. */
947 "8", /* loop_align. */
948 2, /* int_reassoc_width. */
949 4, /* fp_reassoc_width. */
950 1, /* vec_reassoc_width. */
951 2, /* min_div_recip_mul_sf. */
952 2, /* min_div_recip_mul_df. */
953 0, /* max_case_values. */
954 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
955 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
956 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
957 &thunderx_prefetch_tune
958 };
959
960 static const struct tune_params tsv110_tunings =
961 {
962 &tsv110_extra_costs,
963 &tsv110_addrcost_table,
964 &tsv110_regmove_cost,
965 &tsv110_vector_cost,
966 &generic_branch_cost,
967 &generic_approx_modes,
968 SVE_NOT_IMPLEMENTED, /* sve_width */
969 4, /* memmov_cost */
970 4, /* issue_rate */
971 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
972 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
973 "16", /* function_align. */
974 "4", /* jump_align. */
975 "8", /* loop_align. */
976 2, /* int_reassoc_width. */
977 4, /* fp_reassoc_width. */
978 1, /* vec_reassoc_width. */
979 2, /* min_div_recip_mul_sf. */
980 2, /* min_div_recip_mul_df. */
981 0, /* max_case_values. */
982 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
983 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
984 &tsv110_prefetch_tune
985 };
986
987 static const struct tune_params xgene1_tunings =
988 {
989 &xgene1_extra_costs,
990 &xgene1_addrcost_table,
991 &xgene1_regmove_cost,
992 &xgene1_vector_cost,
993 &generic_branch_cost,
994 &xgene1_approx_modes,
995 SVE_NOT_IMPLEMENTED, /* sve_width */
996 6, /* memmov_cost */
997 4, /* issue_rate */
998 AARCH64_FUSE_NOTHING, /* fusible_ops */
999 "16", /* function_align. */
1000 "16", /* jump_align. */
1001 "16", /* loop_align. */
1002 2, /* int_reassoc_width. */
1003 4, /* fp_reassoc_width. */
1004 1, /* vec_reassoc_width. */
1005 2, /* min_div_recip_mul_sf. */
1006 2, /* min_div_recip_mul_df. */
1007 17, /* max_case_values. */
1008 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1009 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1010 &xgene1_prefetch_tune
1011 };
1012
1013 static const struct tune_params emag_tunings =
1014 {
1015 &xgene1_extra_costs,
1016 &xgene1_addrcost_table,
1017 &xgene1_regmove_cost,
1018 &xgene1_vector_cost,
1019 &generic_branch_cost,
1020 &xgene1_approx_modes,
1021 SVE_NOT_IMPLEMENTED,
1022 6, /* memmov_cost */
1023 4, /* issue_rate */
1024 AARCH64_FUSE_NOTHING, /* fusible_ops */
1025 "16", /* function_align. */
1026 "16", /* jump_align. */
1027 "16", /* loop_align. */
1028 2, /* int_reassoc_width. */
1029 4, /* fp_reassoc_width. */
1030 1, /* vec_reassoc_width. */
1031 2, /* min_div_recip_mul_sf. */
1032 2, /* min_div_recip_mul_df. */
1033 17, /* max_case_values. */
1034 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1035 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1036 &xgene1_prefetch_tune
1037 };
1038
1039 static const struct tune_params qdf24xx_tunings =
1040 {
1041 &qdf24xx_extra_costs,
1042 &qdf24xx_addrcost_table,
1043 &qdf24xx_regmove_cost,
1044 &qdf24xx_vector_cost,
1045 &generic_branch_cost,
1046 &generic_approx_modes,
1047 SVE_NOT_IMPLEMENTED, /* sve_width */
1048 4, /* memmov_cost */
1049 4, /* issue_rate */
1050 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1051 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1052 "16", /* function_align. */
1053 "8", /* jump_align. */
1054 "16", /* loop_align. */
1055 2, /* int_reassoc_width. */
1056 4, /* fp_reassoc_width. */
1057 1, /* vec_reassoc_width. */
1058 2, /* min_div_recip_mul_sf. */
1059 2, /* min_div_recip_mul_df. */
1060 0, /* max_case_values. */
1061 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1062 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1063 &qdf24xx_prefetch_tune
1064 };
1065
1066 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1067 for now. */
1068 static const struct tune_params saphira_tunings =
1069 {
1070 &generic_extra_costs,
1071 &generic_addrcost_table,
1072 &generic_regmove_cost,
1073 &generic_vector_cost,
1074 &generic_branch_cost,
1075 &generic_approx_modes,
1076 SVE_NOT_IMPLEMENTED, /* sve_width */
1077 4, /* memmov_cost */
1078 4, /* issue_rate */
1079 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1080 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1081 "16", /* function_align. */
1082 "8", /* jump_align. */
1083 "16", /* loop_align. */
1084 2, /* int_reassoc_width. */
1085 4, /* fp_reassoc_width. */
1086 1, /* vec_reassoc_width. */
1087 2, /* min_div_recip_mul_sf. */
1088 2, /* min_div_recip_mul_df. */
1089 0, /* max_case_values. */
1090 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1091 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1092 &generic_prefetch_tune
1093 };
1094
1095 static const struct tune_params thunderx2t99_tunings =
1096 {
1097 &thunderx2t99_extra_costs,
1098 &thunderx2t99_addrcost_table,
1099 &thunderx2t99_regmove_cost,
1100 &thunderx2t99_vector_cost,
1101 &generic_branch_cost,
1102 &generic_approx_modes,
1103 SVE_NOT_IMPLEMENTED, /* sve_width */
1104 4, /* memmov_cost. */
1105 4, /* issue_rate. */
1106 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1107 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1108 "16", /* function_align. */
1109 "8", /* jump_align. */
1110 "16", /* loop_align. */
1111 3, /* int_reassoc_width. */
1112 2, /* fp_reassoc_width. */
1113 2, /* vec_reassoc_width. */
1114 2, /* min_div_recip_mul_sf. */
1115 2, /* min_div_recip_mul_df. */
1116 0, /* max_case_values. */
1117 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1118 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1119 &thunderx2t99_prefetch_tune
1120 };
1121
1122 static const struct tune_params neoversen1_tunings =
1123 {
1124 &cortexa57_extra_costs,
1125 &generic_addrcost_table,
1126 &generic_regmove_cost,
1127 &cortexa57_vector_cost,
1128 &generic_branch_cost,
1129 &generic_approx_modes,
1130 SVE_NOT_IMPLEMENTED, /* sve_width */
1131 4, /* memmov_cost */
1132 3, /* issue_rate */
1133 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1134 "32:16", /* function_align. */
1135 "32:16", /* jump_align. */
1136 "32:16", /* loop_align. */
1137 2, /* int_reassoc_width. */
1138 4, /* fp_reassoc_width. */
1139 2, /* vec_reassoc_width. */
1140 2, /* min_div_recip_mul_sf. */
1141 2, /* min_div_recip_mul_df. */
1142 0, /* max_case_values. */
1143 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1144 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1145 &generic_prefetch_tune
1146 };
1147
1148 /* Support for fine-grained override of the tuning structures. */
1149 struct aarch64_tuning_override_function
1150 {
1151 const char* name;
1152 void (*parse_override)(const char*, struct tune_params*);
1153 };
1154
1155 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1156 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1157 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1158
1159 static const struct aarch64_tuning_override_function
1160 aarch64_tuning_override_functions[] =
1161 {
1162 { "fuse", aarch64_parse_fuse_string },
1163 { "tune", aarch64_parse_tune_string },
1164 { "sve_width", aarch64_parse_sve_width_string },
1165 { NULL, NULL }
1166 };
1167
1168 /* A processor implementing AArch64. */
1169 struct processor
1170 {
1171 const char *const name;
1172 enum aarch64_processor ident;
1173 enum aarch64_processor sched_core;
1174 enum aarch64_arch arch;
1175 unsigned architecture_version;
1176 const uint64_t flags;
1177 const struct tune_params *const tune;
1178 };
1179
1180 /* Architectures implementing AArch64. */
1181 static const struct processor all_architectures[] =
1182 {
1183 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1184 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1185 #include "aarch64-arches.def"
1186 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1187 };
1188
1189 /* Processor cores implementing AArch64. */
1190 static const struct processor all_cores[] =
1191 {
1192 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1193 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1194 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1195 FLAGS, &COSTS##_tunings},
1196 #include "aarch64-cores.def"
1197 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1198 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1199 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1200 };
1201
1202
1203 /* Target specification. These are populated by the -march, -mtune, -mcpu
1204 handling code or by target attributes. */
1205 static const struct processor *selected_arch;
1206 static const struct processor *selected_cpu;
1207 static const struct processor *selected_tune;
1208
1209 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1210
1211 /* The current tuning set. */
1212 struct tune_params aarch64_tune_params = generic_tunings;
1213
1214 /* Table of machine attributes. */
1215 static const struct attribute_spec aarch64_attribute_table[] =
1216 {
1217 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1218 affects_type_identity, handler, exclude } */
1219 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL },
1220 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1221 };
1222
1223 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1224
1225 /* An ISA extension in the co-processor and main instruction set space. */
1226 struct aarch64_option_extension
1227 {
1228 const char *const name;
1229 const unsigned long flags_on;
1230 const unsigned long flags_off;
1231 };
1232
1233 typedef enum aarch64_cond_code
1234 {
1235 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1236 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1237 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1238 }
1239 aarch64_cc;
1240
1241 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
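/* The enum above lists the condition codes in complementary pairs
   (EQ/NE, CS/CC, MI/PL, ...), so flipping the low bit inverts a code.
   For example (illustrative only):

     AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) == AARCH64_NE
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT  */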
1242
1243 struct aarch64_branch_protect_type
1244 {
1245 /* The type's name that the user passes to the branch-protection option
1246 string. */
1247 const char* name;
1248 /* Function to handle the protection type and set global variables.
1249 First argument is the string token corresponding to this type and the
1250 second argument is the next token in the option string.
1251 Return values:
1252 * AARCH64_PARSE_OK: Handling was successful.
1253 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1254 should print an error.
1255 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1256 own error. */
1257 enum aarch64_parse_opt_result (*handler)(char*, char*);
1258 /* A list of types that can follow this type in the option string. */
1259 const aarch64_branch_protect_type* subtypes;
1260 unsigned int num_subtypes;
1261 };
1262
1263 static enum aarch64_parse_opt_result
1264 aarch64_handle_no_branch_protection (char* str, char* rest)
1265 {
1266 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1267 aarch64_enable_bti = 0;
1268 if (rest)
1269 {
1270 error ("unexpected %<%s%> after %<%s%>", rest, str);
1271 return AARCH64_PARSE_INVALID_FEATURE;
1272 }
1273 return AARCH64_PARSE_OK;
1274 }
1275
1276 static enum aarch64_parse_opt_result
1277 aarch64_handle_standard_branch_protection (char* str, char* rest)
1278 {
1279 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1280 aarch64_ra_sign_key = AARCH64_KEY_A;
1281 aarch64_enable_bti = 1;
1282 if (rest)
1283 {
1284 error ("unexpected %<%s%> after %<%s%>", rest, str);
1285 return AARCH64_PARSE_INVALID_FEATURE;
1286 }
1287 return AARCH64_PARSE_OK;
1288 }
1289
1290 static enum aarch64_parse_opt_result
1291 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1292 char* rest ATTRIBUTE_UNUSED)
1293 {
1294 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1295 aarch64_ra_sign_key = AARCH64_KEY_A;
1296 return AARCH64_PARSE_OK;
1297 }
1298
1299 static enum aarch64_parse_opt_result
1300 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1301 char* rest ATTRIBUTE_UNUSED)
1302 {
1303 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1304 return AARCH64_PARSE_OK;
1305 }
1306
1307 static enum aarch64_parse_opt_result
1308 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1309 char* rest ATTRIBUTE_UNUSED)
1310 {
1311 aarch64_ra_sign_key = AARCH64_KEY_B;
1312 return AARCH64_PARSE_OK;
1313 }
1314
1315 static enum aarch64_parse_opt_result
1316 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1317 char* rest ATTRIBUTE_UNUSED)
1318 {
1319 aarch64_enable_bti = 1;
1320 return AARCH64_PARSE_OK;
1321 }
1322
1323 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1324 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1325 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1326 { NULL, NULL, NULL, 0 }
1327 };
1328
1329 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1330 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1331 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1332 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1333 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1334 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1335 { NULL, NULL, NULL, 0 }
1336 };
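/* Illustrative example (not part of the build): a -mbranch-protection
   string is parsed token by token against these tables.  For instance,
   "pac-ret+leaf+bti" would be handled roughly as:

     "pac-ret" -> aarch64_handle_pac_ret_protection  (sign non-leaf returns
                                                      with key A)
     "leaf"    -> aarch64_handle_pac_ret_leaf        (extend signing to all
                                                      functions)
     "bti"     -> aarch64_handle_bti_protection      (enable BTI)

   while "standard" enables return-address signing and BTI in one step and
   "none" clears everything.  The walk itself is driven by the parsing code
   declared above (aarch64_parse_branch_protection).  */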
1337
1338 /* The condition codes of the processor, and the inverse function. */
1339 static const char * const aarch64_condition_codes[] =
1340 {
1341 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1342 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1343 };
1344
1345 /* The preferred condition codes for SVE conditions. */
1346 static const char *const aarch64_sve_condition_codes[] =
1347 {
1348 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1349 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1350 };
1351
1352 /* Return the assembly token for svpattern value VALUE. */
1353
1354 static const char *
1355 svpattern_token (enum aarch64_svpattern pattern)
1356 {
1357 switch (pattern)
1358 {
1359 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1360 AARCH64_FOR_SVPATTERN (CASE)
1361 #undef CASE
1362 case AARCH64_NUM_SVPATTERNS:
1363 break;
1364 }
1365 gcc_unreachable ();
1366 }
1367
1368 /* Generate code to enable conditional branches in functions over 1 MiB. */
1369 const char *
1370 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1371 const char * branch_format)
1372 {
1373 rtx_code_label * tmp_label = gen_label_rtx ();
1374 char label_buf[256];
1375 char buffer[128];
1376 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1377 CODE_LABEL_NUMBER (tmp_label));
1378 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1379 rtx dest_label = operands[pos_label];
1380 operands[pos_label] = tmp_label;
1381
1382 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1383 output_asm_insn (buffer, operands);
1384
1385 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1386 operands[pos_label] = dest_label;
1387 output_asm_insn (buffer, operands);
1388 return "";
1389 }
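/* For example (illustrative only), a conditional branch such as
   "tbz x0, #3, <dest>" whose destination is out of range is emitted via
   this function as the pair:

        tbnz    x0, #3, .LtbN           <- BRANCH_FORMAT (already inverted
                                           by the caller) plus a fresh
                                           local label
        b       <dest>                  <- unconditional branch, +/-128 MiB
     .LtbN:

   so the long-range unconditional branch is executed exactly when the
   original conditional branch would have been taken.  */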
1390
1391 void
1392 aarch64_err_no_fpadvsimd (machine_mode mode)
1393 {
1394 if (TARGET_GENERAL_REGS_ONLY)
1395 if (FLOAT_MODE_P (mode))
1396 error ("%qs is incompatible with the use of floating-point types",
1397 "-mgeneral-regs-only");
1398 else
1399 error ("%qs is incompatible with the use of vector types",
1400 "-mgeneral-regs-only");
1401 else
1402 if (FLOAT_MODE_P (mode))
1403 error ("%qs feature modifier is incompatible with the use of"
1404 " floating-point types", "+nofp");
1405 else
1406 error ("%qs feature modifier is incompatible with the use of"
1407 " vector types", "+nofp");
1408 }
1409
1410 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1411 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1412 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1413 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1414 and GENERAL_REGS is lower than the memory cost (in this case the best class
1415 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1416 cost results in bad allocations with many redundant int<->FP moves which
1417 are expensive on various cores.
1418 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1419 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1420 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1421 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1422 The result of this is that it is no longer inefficient to have a higher
1423 memory move cost than the register move cost.
1424 */
1425
1426 static reg_class_t
1427 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1428 reg_class_t best_class)
1429 {
1430 machine_mode mode;
1431
1432 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1433 || !reg_class_subset_p (FP_REGS, allocno_class))
1434 return allocno_class;
1435
1436 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1437 || !reg_class_subset_p (FP_REGS, best_class))
1438 return best_class;
1439
1440 mode = PSEUDO_REGNO_MODE (regno);
1441 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1442 }
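/* A worked example (illustrative only): for a pseudo of mode DFmode whose
   allocno class and best class are both POINTER_AND_FP_REGS, both subset
   checks above fall through and the mode decides, so the function returns
   FP_REGS; for an SImode pseudo in the same situation it returns
   GENERAL_REGS.  */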
1443
1444 static unsigned int
1445 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1446 {
1447 if (GET_MODE_UNIT_SIZE (mode) == 4)
1448 return aarch64_tune_params.min_div_recip_mul_sf;
1449 return aarch64_tune_params.min_div_recip_mul_df;
1450 }
1451
1452 /* Return the reassociation width of treeop OPC with mode MODE. */
1453 static int
1454 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1455 {
1456 if (VECTOR_MODE_P (mode))
1457 return aarch64_tune_params.vec_reassoc_width;
1458 if (INTEGRAL_MODE_P (mode))
1459 return aarch64_tune_params.int_reassoc_width;
1460 /* Avoid reassociating floating point addition so we emit more FMAs. */
1461 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1462 return aarch64_tune_params.fp_reassoc_width;
1463 return 1;
1464 }
1465
1466 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1467 unsigned
1468 aarch64_dbx_register_number (unsigned regno)
1469 {
1470 if (GP_REGNUM_P (regno))
1471 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1472 else if (regno == SP_REGNUM)
1473 return AARCH64_DWARF_SP;
1474 else if (FP_REGNUM_P (regno))
1475 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1476 else if (PR_REGNUM_P (regno))
1477 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1478 else if (regno == VG_REGNUM)
1479 return AARCH64_DWARF_VG;
1480
1481 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1482 equivalent DWARF register. */
1483 return DWARF_FRAME_REGISTERS;
1484 }
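/* For example (illustrative only):

     aarch64_dbx_register_number (R0_REGNUM + 5)  -> AARCH64_DWARF_R0 + 5
     aarch64_dbx_register_number (SP_REGNUM)      -> AARCH64_DWARF_SP
     aarch64_dbx_register_number (V0_REGNUM + 3)  -> AARCH64_DWARF_V0 + 3
     aarch64_dbx_register_number (CC_REGNUM)      -> DWARF_FRAME_REGISTERS
                                                     (no DWARF equivalent)  */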
1485
1486 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1487 integer, otherwise return X unmodified. */
1488 static rtx
1489 aarch64_bit_representation (rtx x)
1490 {
1491 if (CONST_DOUBLE_P (x))
1492 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
1493 return x;
1494 }
1495
1496 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1497 static bool
1498 aarch64_advsimd_struct_mode_p (machine_mode mode)
1499 {
1500 return (TARGET_SIMD
1501 && (mode == OImode || mode == CImode || mode == XImode));
1502 }
1503
1504 /* Return true if MODE is an SVE predicate mode. */
1505 static bool
1506 aarch64_sve_pred_mode_p (machine_mode mode)
1507 {
1508 return (TARGET_SVE
1509 && (mode == VNx16BImode
1510 || mode == VNx8BImode
1511 || mode == VNx4BImode
1512 || mode == VNx2BImode));
1513 }
1514
1515 /* Three mutually-exclusive flags describing a vector or predicate type. */
1516 const unsigned int VEC_ADVSIMD = 1;
1517 const unsigned int VEC_SVE_DATA = 2;
1518 const unsigned int VEC_SVE_PRED = 4;
1519 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1520 a structure of 2, 3 or 4 vectors. */
1521 const unsigned int VEC_STRUCT = 8;
1522 /* Useful combinations of the above. */
1523 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1524 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1525
1526 /* Return a set of flags describing the vector properties of mode MODE.
1527 Ignore modes that are not supported by the current target. */
1528 static unsigned int
1529 aarch64_classify_vector_mode (machine_mode mode)
1530 {
1531 if (aarch64_advsimd_struct_mode_p (mode))
1532 return VEC_ADVSIMD | VEC_STRUCT;
1533
1534 if (aarch64_sve_pred_mode_p (mode))
1535 return VEC_SVE_PRED;
1536
1537 /* Make the decision based on the mode's enum value rather than its
1538 properties, so that we keep the correct classification regardless
1539 of -msve-vector-bits. */
1540 switch (mode)
1541 {
1542 /* Single SVE vectors. */
1543 case E_VNx16QImode:
1544 case E_VNx8HImode:
1545 case E_VNx4SImode:
1546 case E_VNx2DImode:
1547 case E_VNx8HFmode:
1548 case E_VNx4SFmode:
1549 case E_VNx2DFmode:
1550 return TARGET_SVE ? VEC_SVE_DATA : 0;
1551
1552 /* x2 SVE vectors. */
1553 case E_VNx32QImode:
1554 case E_VNx16HImode:
1555 case E_VNx8SImode:
1556 case E_VNx4DImode:
1557 case E_VNx16HFmode:
1558 case E_VNx8SFmode:
1559 case E_VNx4DFmode:
1560 /* x3 SVE vectors. */
1561 case E_VNx48QImode:
1562 case E_VNx24HImode:
1563 case E_VNx12SImode:
1564 case E_VNx6DImode:
1565 case E_VNx24HFmode:
1566 case E_VNx12SFmode:
1567 case E_VNx6DFmode:
1568 /* x4 SVE vectors. */
1569 case E_VNx64QImode:
1570 case E_VNx32HImode:
1571 case E_VNx16SImode:
1572 case E_VNx8DImode:
1573 case E_VNx32HFmode:
1574 case E_VNx16SFmode:
1575 case E_VNx8DFmode:
1576 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1577
1578 /* 64-bit Advanced SIMD vectors. */
1579 case E_V8QImode:
1580 case E_V4HImode:
1581 case E_V2SImode:
1582 /* ...E_V1DImode doesn't exist. */
1583 case E_V4HFmode:
1584 case E_V2SFmode:
1585 case E_V1DFmode:
1586 /* 128-bit Advanced SIMD vectors. */
1587 case E_V16QImode:
1588 case E_V8HImode:
1589 case E_V4SImode:
1590 case E_V2DImode:
1591 case E_V8HFmode:
1592 case E_V4SFmode:
1593 case E_V2DFmode:
1594 return TARGET_SIMD ? VEC_ADVSIMD : 0;
1595
1596 default:
1597 return 0;
1598 }
1599 }
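/* Some illustrative results of the classification above (assuming the
   corresponding target features are enabled):

     E_V2SImode    -> VEC_ADVSIMD                (64-bit Advanced SIMD)
     E_V4SImode    -> VEC_ADVSIMD                (128-bit Advanced SIMD)
     E_OImode      -> VEC_ADVSIMD | VEC_STRUCT   (pair of 128-bit vectors)
     E_VNx4SImode  -> VEC_SVE_DATA               (single SVE vector)
     E_VNx8SImode  -> VEC_SVE_DATA | VEC_STRUCT  (x2 SVE structure)
     E_VNx4BImode  -> VEC_SVE_PRED               (SVE predicate)
     E_SImode      -> 0                          (not a vector mode)  */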
1600
1601 /* Return true if MODE is any of the data vector modes, including
1602 structure modes. */
1603 static bool
1604 aarch64_vector_data_mode_p (machine_mode mode)
1605 {
1606 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1607 }
1608
1609 /* Return true if MODE is any form of SVE mode, including predicates,
1610 vectors and structures. */
1611 bool
1612 aarch64_sve_mode_p (machine_mode mode)
1613 {
1614 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
1615 }
1616
1617 /* Return true if MODE is an SVE data vector mode; either a single vector
1618 or a structure of vectors. */
1619 static bool
1620 aarch64_sve_data_mode_p (machine_mode mode)
1621 {
1622 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1623 }
1624
1625 /* Implement target hook TARGET_ARRAY_MODE. */
1626 static opt_machine_mode
1627 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1628 {
1629 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1630 && IN_RANGE (nelems, 2, 4))
1631 return mode_for_vector (GET_MODE_INNER (mode),
1632 GET_MODE_NUNITS (mode) * nelems);
1633
1634 return opt_machine_mode ();
1635 }
1636
1637 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1638 static bool
1639 aarch64_array_mode_supported_p (machine_mode mode,
1640 unsigned HOST_WIDE_INT nelems)
1641 {
1642 if (TARGET_SIMD
1643 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1644 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1645 && (nelems >= 2 && nelems <= 4))
1646 return true;
1647
1648 return false;
1649 }
1650
1651 /* Return the SVE predicate mode to use for elements that have
1652 ELEM_NBYTES bytes, if such a mode exists. */
1653
1654 opt_machine_mode
1655 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1656 {
1657 if (TARGET_SVE)
1658 {
1659 if (elem_nbytes == 1)
1660 return VNx16BImode;
1661 if (elem_nbytes == 2)
1662 return VNx8BImode;
1663 if (elem_nbytes == 4)
1664 return VNx4BImode;
1665 if (elem_nbytes == 8)
1666 return VNx2BImode;
1667 }
1668 return opt_machine_mode ();
1669 }
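/* For example (illustrative only), with SVE enabled:

     aarch64_sve_pred_mode (1).require () == VNx16BImode
     aarch64_sve_pred_mode (2).require () == VNx8BImode
     aarch64_sve_pred_mode (4).require () == VNx4BImode
     aarch64_sve_pred_mode (8).require () == VNx2BImode

   aarch64_get_mask_mode below uses this mapping to choose the mask mode
   for a full SVE data vector.  */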
1670
1671 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1672
1673 static opt_machine_mode
1674 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1675 {
1676 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1677 {
1678 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1679 machine_mode pred_mode;
1680 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1681 return pred_mode;
1682 }
1683
1684 return default_get_mask_mode (nunits, nbytes);
1685 }
1686
1687 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1688
1689 static opt_machine_mode
1690 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
1691 {
1692 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
1693 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
1694 machine_mode mode;
1695 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1696 if (inner_mode == GET_MODE_INNER (mode)
1697 && known_eq (nunits, GET_MODE_NUNITS (mode))
1698 && aarch64_sve_data_mode_p (mode))
1699 return mode;
1700 return opt_machine_mode ();
1701 }
1702
1703 /* Return the integer element mode associated with SVE mode MODE. */
1704
1705 static scalar_int_mode
1706 aarch64_sve_element_int_mode (machine_mode mode)
1707 {
1708 unsigned int elt_bits = vector_element_size (BITS_PER_SVE_VECTOR,
1709 GET_MODE_NUNITS (mode));
1710 return int_mode_for_size (elt_bits, 0).require ();
1711 }
1712
1713 /* Return the integer vector mode associated with SVE mode MODE.
1714 Unlike mode_for_int_vector, this can handle the case in which
1715 MODE is a predicate (and thus has a different total size). */
1716
1717 static machine_mode
1718 aarch64_sve_int_mode (machine_mode mode)
1719 {
1720 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
1721 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
1722 }
1723
1724 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1725 prefer to use the first arithmetic operand as the else value if
1726 the else value doesn't matter, since that exactly matches the SVE
1727 destructive merging form. For ternary operations we could either
1728 pick the first operand and use FMAD-like instructions or the last
1729 operand and use FMLA-like instructions; the latter seems more
1730 natural. */
1731
1732 static tree
1733 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1734 {
1735 return nops == 3 ? ops[2] : ops[0];
1736 }
1737
1738 /* Implement TARGET_HARD_REGNO_NREGS. */
1739
1740 static unsigned int
1741 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1742 {
1743 /* ??? Logically we should only need to provide a value when
1744 HARD_REGNO_MODE_OK says that the combination is valid,
1745 but at the moment we need to handle all modes. Just ignore
1746 any runtime parts for registers that can't store them. */
1747 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1748 switch (aarch64_regno_regclass (regno))
1749 {
1750 case FP_REGS:
1751 case FP_LO_REGS:
1752 case FP_LO8_REGS:
1753 if (aarch64_sve_data_mode_p (mode))
1754 return exact_div (GET_MODE_SIZE (mode),
1755 BYTES_PER_SVE_VECTOR).to_constant ();
1756 return CEIL (lowest_size, UNITS_PER_VREG);
1757 case PR_REGS:
1758 case PR_LO_REGS:
1759 case PR_HI_REGS:
1760 return 1;
1761 default:
1762 return CEIL (lowest_size, UNITS_PER_WORD);
1763 }
1764 gcc_unreachable ();
1765 }
1766
1767 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1768
1769 static bool
1770 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1771 {
1772 if (GET_MODE_CLASS (mode) == MODE_CC)
1773 return regno == CC_REGNUM;
1774
1775 if (regno == VG_REGNUM)
1776 /* This must have the same size as _Unwind_Word. */
1777 return mode == DImode;
1778
1779 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1780 if (vec_flags & VEC_SVE_PRED)
1781 return PR_REGNUM_P (regno);
1782
1783 if (PR_REGNUM_P (regno))
1784 return 0;
1785
1786 if (regno == SP_REGNUM)
1787 /* The purpose of comparing with ptr_mode is to support the
1788 global register variable associated with the stack pointer
1789 register via the syntax of asm ("wsp") in ILP32. */
1790 return mode == Pmode || mode == ptr_mode;
1791
1792 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1793 return mode == Pmode;
1794
1795 if (GP_REGNUM_P (regno))
1796 {
1797 if (known_le (GET_MODE_SIZE (mode), 8))
1798 return true;
1799 else if (known_le (GET_MODE_SIZE (mode), 16))
1800 return (regno & 1) == 0;
1801 }
1802 else if (FP_REGNUM_P (regno))
1803 {
1804 if (vec_flags & VEC_STRUCT)
1805 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1806 else
1807 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1808 }
1809
1810 return false;
1811 }
1812
1813 /* Return true if this is a definition of a vectorized simd function. */
1814
1815 static bool
1816 aarch64_simd_decl_p (tree fndecl)
1817 {
1818 tree fntype;
1819
1820 if (fndecl == NULL)
1821 return false;
1822 fntype = TREE_TYPE (fndecl);
1823 if (fntype == NULL)
1824 return false;
1825
1826 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1827 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1828 return true;
1829
1830 return false;
1831 }
1832
1833 /* Return the mode a register save/restore should use. DImode for integer
1834 registers, DFmode for FP registers in non-SIMD functions (they only save
1835 the bottom half of a 128-bit register), or TFmode for FP registers in
1836 SIMD functions. */
1837
1838 static machine_mode
1839 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1840 {
1841 return GP_REGNUM_P (regno)
1842 ? E_DImode
1843 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1844 }
1845
1846 /* Return true if the instruction is a call to a SIMD function, false
1847 if it is not a SIMD function or if we do not know anything about
1848 the function. */
1849
1850 static bool
1851 aarch64_simd_call_p (rtx_insn *insn)
1852 {
1853 rtx symbol;
1854 rtx call;
1855 tree fndecl;
1856
1857 gcc_assert (CALL_P (insn));
1858 call = get_call_rtx_from (insn);
1859 symbol = XEXP (XEXP (call, 0), 0);
1860 if (GET_CODE (symbol) != SYMBOL_REF)
1861 return false;
1862 fndecl = SYMBOL_REF_DECL (symbol);
1863 if (!fndecl)
1864 return false;
1865
1866 return aarch64_simd_decl_p (fndecl);
1867 }
1868
1869 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1870 a function that uses the SIMD ABI, take advantage of the extra
1871 call-preserved registers that the ABI provides. */
1872
1873 void
1874 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1875 HARD_REG_SET *return_set)
1876 {
1877 if (aarch64_simd_call_p (insn))
1878 {
1879 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1880 if (FP_SIMD_SAVED_REGNUM_P (regno))
1881 CLEAR_HARD_REG_BIT (*return_set, regno);
1882 }
1883 }
1884
1885 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1886 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1887 clobbers the top 64 bits when restoring the bottom 64 bits. */
1888
1889 static bool
1890 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
1891 machine_mode mode)
1892 {
1893 bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
1894 return FP_REGNUM_P (regno)
1895 && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
1896 }
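
/* For example, a 16-byte V4SFmode value held in an FP register is partially
   clobbered by an ordinary call (16 > 8) and so must be saved around it, but
   it survives a call to a function that uses the aarch64_vector_pcs ABI
   (16 > 16 is false).  An SVE vector value can be wider than 16 bytes, so it
   is treated as partially clobbered even across SIMD-ABI calls.  */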
1897
1898 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1899
1900 rtx_insn *
1901 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
1902 {
1903 gcc_assert (CALL_P (call_1) && CALL_P (call_2));
1904
1905 if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
1906 return call_1;
1907 else
1908 return call_2;
1909 }
1910
1911 /* Implement REGMODE_NATURAL_SIZE. */
1912 poly_uint64
1913 aarch64_regmode_natural_size (machine_mode mode)
1914 {
1915 /* The natural size for SVE data modes is one SVE data vector,
1916 and similarly for predicates. We can't independently modify
1917 anything smaller than that. */
1918 /* ??? For now, only do this for variable-width SVE registers.
1919 Doing it for constant-sized registers breaks lower-subreg.c. */
1920 /* ??? And once that's fixed, we should probably have similar
1921 code for Advanced SIMD. */
1922 if (!aarch64_sve_vg.is_constant ())
1923 {
1924 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1925 if (vec_flags & VEC_SVE_PRED)
1926 return BYTES_PER_SVE_PRED;
1927 if (vec_flags & VEC_SVE_DATA)
1928 return BYTES_PER_SVE_VECTOR;
1929 }
1930 return UNITS_PER_WORD;
1931 }
1932
1933 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1934 machine_mode
1935 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1936 machine_mode mode)
1937 {
1938 /* The predicate mode determines which bits are significant and
1939 which are "don't care". Decreasing the number of lanes would
1940 lose data while increasing the number of lanes would make bits
1941 unnecessarily significant. */
1942 if (PR_REGNUM_P (regno))
1943 return mode;
1944 if (known_ge (GET_MODE_SIZE (mode), 4))
1945 return mode;
1946 else
1947 return SImode;
1948 }
1949
1950 /* Return true if I's bits are consecutive ones from the MSB. */
1951 bool
1952 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1953 {
1954 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1955 }
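
/* For example, 0xffffffffffff8000 gives -i == 0x8000, whose exact_log2 is
   15, so the function returns true; 0xff00ff0000000000 gives a -i that is
   not a power of two, so it returns false.  The all-ones value also returns
   true (-i == 1), while zero returns false (exact_log2 (0) == -1).  */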
1956
1957 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1958 that strcpy from constants will be faster. */
1959
1960 static HOST_WIDE_INT
1961 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1962 {
1963 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1964 return MAX (align, BITS_PER_WORD);
1965 return align;
1966 }
1967
1968 /* Return true if calls to DECL should be treated as
1969 long-calls (i.e. called via a register). */
1970 static bool
1971 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1972 {
1973 return false;
1974 }
1975
1976 /* Return true if calls to symbol-ref SYM should be treated as
1977 long-calls (i.e. called via a register). */
1978 bool
1979 aarch64_is_long_call_p (rtx sym)
1980 {
1981 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1982 }
1983
1984 /* Return true if calls to symbol-ref SYM should not go through
1985 plt stubs. */
1986
1987 bool
1988 aarch64_is_noplt_call_p (rtx sym)
1989 {
1990 const_tree decl = SYMBOL_REF_DECL (sym);
1991
1992 if (flag_pic
1993 && decl
1994 && (!flag_plt
1995 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1996 && !targetm.binds_local_p (decl))
1997 return true;
1998
1999 return false;
2000 }
2001
2002 /* Return true if the offsets to a zero/sign-extract operation
2003 represent an expression that matches an extend operation. The
2004 operands represent the parameters from
2005
2006 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
2007 bool
2008 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
2009 rtx extract_imm)
2010 {
2011 HOST_WIDE_INT mult_val, extract_val;
2012
2013 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
2014 return false;
2015
2016 mult_val = INTVAL (mult_imm);
2017 extract_val = INTVAL (extract_imm);
2018
2019 if (extract_val > 8
2020 && extract_val < GET_MODE_BITSIZE (mode)
2021 && exact_log2 (extract_val & ~7) > 0
2022 && (extract_val & 7) <= 4
2023 && mult_val == (1 << (extract_val & 7)))
2024 return true;
2025
2026 return false;
2027 }
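
/* For example, with MODE == DImode, MULT_IMM == 4 and EXTRACT_IMM == 34 the
   function returns true: the extract takes 34 bits starting at bit 0 of
   (reg * 4), which is equivalent to extending a 32-bit value and shifting
   it left by 2 -- the form used for extended-register operands with
   LSL #2.  */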
2028
2029 /* Emit an insn that's a simple single-set. Both the operands must be
2030 known to be valid. */
2031 inline static rtx_insn *
2032 emit_set_insn (rtx x, rtx y)
2033 {
2034 return emit_insn (gen_rtx_SET (x, y));
2035 }
2036
2037 /* X and Y are two things to compare using CODE. Emit the compare insn and
2038 return the rtx for register 0 in the proper mode. */
2039 rtx
2040 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2041 {
2042 machine_mode cmp_mode = GET_MODE (x);
2043 machine_mode cc_mode;
2044 rtx cc_reg;
2045
2046 if (cmp_mode == TImode)
2047 {
2048 gcc_assert (code == NE);
2049
2050 cc_mode = CCmode;
2051 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2052
2053 rtx x_lo = operand_subword (x, 0, 0, TImode);
2054 rtx y_lo = operand_subword (y, 0, 0, TImode);
2055 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
2056
2057 rtx x_hi = operand_subword (x, 1, 0, TImode);
2058 rtx y_hi = operand_subword (y, 1, 0, TImode);
2059 emit_insn (gen_ccmpdi (cc_reg, cc_reg, x_hi, y_hi,
2060 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
2061 GEN_INT (AARCH64_EQ)));
2062 }
2063 else
2064 {
2065 cc_mode = SELECT_CC_MODE (code, x, y);
2066 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2067 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
2068 }
2069 return cc_reg;
2070 }
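
/* For the TImode case above, the emitted sequence is roughly (with x0:x1
   and x2:x3 holding the two 128-bit values; register numbers are
   illustrative):

   cmp  x0, x2
   ccmp x1, x3, #0, eq

   so that EQ/NE tests on CC_REGNUM reflect equality/inequality of the full
   128-bit values.  */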
2071
2072 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2073
2074 static rtx
2075 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2076 machine_mode y_mode)
2077 {
2078 if (y_mode == E_QImode || y_mode == E_HImode)
2079 {
2080 if (CONST_INT_P (y))
2081 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2082 else
2083 {
2084 rtx t, cc_reg;
2085 machine_mode cc_mode;
2086
2087 t = gen_rtx_ZERO_EXTEND (SImode, y);
2088 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2089 cc_mode = CC_SWPmode;
2090 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2091 emit_set_insn (cc_reg, t);
2092 return cc_reg;
2093 }
2094 }
2095
2096 if (!aarch64_plus_operand (y, y_mode))
2097 y = force_reg (y_mode, y);
2098
2099 return aarch64_gen_compare_reg (code, x, y);
2100 }
2101
2102 /* Build the SYMBOL_REF for __tls_get_addr. */
2103
2104 static GTY(()) rtx tls_get_addr_libfunc;
2105
2106 rtx
2107 aarch64_tls_get_addr (void)
2108 {
2109 if (!tls_get_addr_libfunc)
2110 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2111 return tls_get_addr_libfunc;
2112 }
2113
2114 /* Return the TLS model to use for ADDR. */
2115
2116 static enum tls_model
2117 tls_symbolic_operand_type (rtx addr)
2118 {
2119 enum tls_model tls_kind = TLS_MODEL_NONE;
2120 if (GET_CODE (addr) == CONST)
2121 {
2122 poly_int64 addend;
2123 rtx sym = strip_offset (addr, &addend);
2124 if (GET_CODE (sym) == SYMBOL_REF)
2125 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2126 }
2127 else if (GET_CODE (addr) == SYMBOL_REF)
2128 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2129
2130 return tls_kind;
2131 }
2132
2133 /* We allow LO_SUMs among our legitimate addresses, so that combine
2134 can take care of combining addresses where necessary, but for
2135 generation purposes we generate the address
2136 as:
2137 RTL Absolute
2138 tmp = hi (symbol_ref); adrp x1, foo
2139 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2140 nop
2141
2142 PIC TLS
2143 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2144 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2145 bl __tls_get_addr
2146 nop
2147
2148 Load TLS symbol, depending on TLS mechanism and TLS access model.
2149
2150 Global Dynamic - Traditional TLS:
2151 adrp tmp, :tlsgd:imm
2152 add dest, tmp, #:tlsgd_lo12:imm
2153 bl __tls_get_addr
2154
2155 Global Dynamic - TLS Descriptors:
2156 adrp dest, :tlsdesc:imm
2157 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2158 add dest, dest, #:tlsdesc_lo12:imm
2159 blr tmp
2160 mrs tp, tpidr_el0
2161 add dest, dest, tp
2162
2163 Initial Exec:
2164 mrs tp, tpidr_el0
2165 adrp tmp, :gottprel:imm
2166 ldr dest, [tmp, #:gottprel_lo12:imm]
2167 add dest, dest, tp
2168
2169 Local Exec:
2170 mrs tp, tpidr_el0
2171 add t0, tp, #:tprel_hi12:imm, lsl #12
2172 add t0, t0, #:tprel_lo12_nc:imm
2173 */
2174
2175 static void
2176 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2177 enum aarch64_symbol_type type)
2178 {
2179 switch (type)
2180 {
2181 case SYMBOL_SMALL_ABSOLUTE:
2182 {
2183 /* In ILP32, the mode of dest can be either SImode or DImode. */
2184 rtx tmp_reg = dest;
2185 machine_mode mode = GET_MODE (dest);
2186
2187 gcc_assert (mode == Pmode || mode == ptr_mode);
2188
2189 if (can_create_pseudo_p ())
2190 tmp_reg = gen_reg_rtx (mode);
2191
2192 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2193 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2194 return;
2195 }
2196
2197 case SYMBOL_TINY_ABSOLUTE:
2198 emit_insn (gen_rtx_SET (dest, imm));
2199 return;
2200
2201 case SYMBOL_SMALL_GOT_28K:
2202 {
2203 machine_mode mode = GET_MODE (dest);
2204 rtx gp_rtx = pic_offset_table_rtx;
2205 rtx insn;
2206 rtx mem;
2207
2208 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2209 here before rtl expansion. Tree IVOPT will generate an rtl pattern
2210 to decide rtx costs, in which case pic_offset_table_rtx is not
2211 initialized. In that case there is no need to generate the first
2212 adrp instruction, as the final cost for global variable access is
2213 one instruction. */
2214 if (gp_rtx != NULL)
2215 {
2216 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
2217 use the page base as the GOT base, the first page may be wasted;
2218 in the worst case there is only 28K of space for the GOT).
2219
2220 The generated instruction sequence for accessing a global variable
2221 is:
2222
2223 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2224
2225 Only one instruction is needed, but we must initialize
2226 pic_offset_table_rtx properly. We generate an initialization insn
2227 for every global access and allow CSE to remove all redundant ones.
2228
2229 The final instruction sequence will look like the following
2230 for multiple global variable accesses.
2231
2232 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2233
2234 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2235 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2236 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2237 ... */
2238
2239 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2240 crtl->uses_pic_offset_table = 1;
2241 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2242
2243 if (mode != GET_MODE (gp_rtx))
2244 gp_rtx = gen_lowpart (mode, gp_rtx);
2245
2246 }
2247
2248 if (mode == ptr_mode)
2249 {
2250 if (mode == DImode)
2251 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2252 else
2253 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2254
2255 mem = XVECEXP (SET_SRC (insn), 0, 0);
2256 }
2257 else
2258 {
2259 gcc_assert (mode == Pmode);
2260
2261 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2262 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2263 }
2264
2265 /* The operand is expected to be a MEM. Whenever the related insn
2266 pattern changes, the above code which calculates MEM should be
2267 updated. */
2268 gcc_assert (GET_CODE (mem) == MEM);
2269 MEM_READONLY_P (mem) = 1;
2270 MEM_NOTRAP_P (mem) = 1;
2271 emit_insn (insn);
2272 return;
2273 }
2274
2275 case SYMBOL_SMALL_GOT_4G:
2276 {
2277 /* In ILP32, the mode of dest can be either SImode or DImode,
2278 while the got entry is always of SImode size. The mode of
2279 dest depends on how dest is used: if dest is assigned to a
2280 pointer (e.g. stored in memory), it has SImode; it may have
2281 DImode if dest is dereferenced to access the memory.
2282 This is why we have to handle three different ldr_got_small
2283 patterns here (two patterns for ILP32). */
2284
2285 rtx insn;
2286 rtx mem;
2287 rtx tmp_reg = dest;
2288 machine_mode mode = GET_MODE (dest);
2289
2290 if (can_create_pseudo_p ())
2291 tmp_reg = gen_reg_rtx (mode);
2292
2293 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2294 if (mode == ptr_mode)
2295 {
2296 if (mode == DImode)
2297 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2298 else
2299 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2300
2301 mem = XVECEXP (SET_SRC (insn), 0, 0);
2302 }
2303 else
2304 {
2305 gcc_assert (mode == Pmode);
2306
2307 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2308 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2309 }
2310
2311 gcc_assert (GET_CODE (mem) == MEM);
2312 MEM_READONLY_P (mem) = 1;
2313 MEM_NOTRAP_P (mem) = 1;
2314 emit_insn (insn);
2315 return;
2316 }
2317
2318 case SYMBOL_SMALL_TLSGD:
2319 {
2320 rtx_insn *insns;
2321 machine_mode mode = GET_MODE (dest);
2322 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2323
2324 start_sequence ();
2325 if (TARGET_ILP32)
2326 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2327 else
2328 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2329 insns = get_insns ();
2330 end_sequence ();
2331
2332 RTL_CONST_CALL_P (insns) = 1;
2333 emit_libcall_block (insns, dest, result, imm);
2334 return;
2335 }
2336
2337 case SYMBOL_SMALL_TLSDESC:
2338 {
2339 machine_mode mode = GET_MODE (dest);
2340 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2341 rtx tp;
2342
2343 gcc_assert (mode == Pmode || mode == ptr_mode);
2344
2345 /* In ILP32, the got entry is always of SImode size. Unlike
2346 small GOT, the dest is fixed at reg 0. */
2347 if (TARGET_ILP32)
2348 emit_insn (gen_tlsdesc_small_si (imm));
2349 else
2350 emit_insn (gen_tlsdesc_small_di (imm));
2351 tp = aarch64_load_tp (NULL);
2352
2353 if (mode != Pmode)
2354 tp = gen_lowpart (mode, tp);
2355
2356 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2357 if (REG_P (dest))
2358 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2359 return;
2360 }
2361
2362 case SYMBOL_SMALL_TLSIE:
2363 {
2364 /* In ILP32, the mode of dest can be either SImode or DImode,
2365 while the got entry is always of SImode size. The mode of
2366 dest depends on how dest is used: if dest is assigned to a
2367 pointer (e.g. stored in memory), it has SImode; it may have
2368 DImode if dest is dereferenced to access the memory.
2369 This is why we have to handle three different tlsie_small
2370 patterns here (two patterns for ILP32). */
2371 machine_mode mode = GET_MODE (dest);
2372 rtx tmp_reg = gen_reg_rtx (mode);
2373 rtx tp = aarch64_load_tp (NULL);
2374
2375 if (mode == ptr_mode)
2376 {
2377 if (mode == DImode)
2378 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2379 else
2380 {
2381 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2382 tp = gen_lowpart (mode, tp);
2383 }
2384 }
2385 else
2386 {
2387 gcc_assert (mode == Pmode);
2388 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2389 }
2390
2391 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2392 if (REG_P (dest))
2393 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2394 return;
2395 }
2396
2397 case SYMBOL_TLSLE12:
2398 case SYMBOL_TLSLE24:
2399 case SYMBOL_TLSLE32:
2400 case SYMBOL_TLSLE48:
2401 {
2402 machine_mode mode = GET_MODE (dest);
2403 rtx tp = aarch64_load_tp (NULL);
2404
2405 if (mode != Pmode)
2406 tp = gen_lowpart (mode, tp);
2407
2408 switch (type)
2409 {
2410 case SYMBOL_TLSLE12:
2411 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2412 (dest, tp, imm));
2413 break;
2414 case SYMBOL_TLSLE24:
2415 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2416 (dest, tp, imm));
2417 break;
2418 case SYMBOL_TLSLE32:
2419 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2420 (dest, imm));
2421 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2422 (dest, dest, tp));
2423 break;
2424 case SYMBOL_TLSLE48:
2425 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2426 (dest, imm));
2427 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2428 (dest, dest, tp));
2429 break;
2430 default:
2431 gcc_unreachable ();
2432 }
2433
2434 if (REG_P (dest))
2435 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2436 return;
2437 }
2438
2439 case SYMBOL_TINY_GOT:
2440 emit_insn (gen_ldr_got_tiny (dest, imm));
2441 return;
2442
2443 case SYMBOL_TINY_TLSIE:
2444 {
2445 machine_mode mode = GET_MODE (dest);
2446 rtx tp = aarch64_load_tp (NULL);
2447
2448 if (mode == ptr_mode)
2449 {
2450 if (mode == DImode)
2451 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2452 else
2453 {
2454 tp = gen_lowpart (mode, tp);
2455 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2456 }
2457 }
2458 else
2459 {
2460 gcc_assert (mode == Pmode);
2461 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2462 }
2463
2464 if (REG_P (dest))
2465 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2466 return;
2467 }
2468
2469 default:
2470 gcc_unreachable ();
2471 }
2472 }
2473
2474 /* Emit a move from SRC to DEST. Assume that the move expanders can
2475 handle all moves if !can_create_pseudo_p (). The distinction is
2476 important because, unlike emit_move_insn, the move expanders know
2477 how to force Pmode objects into the constant pool even when the
2478 constant pool address is not itself legitimate. */
2479 static rtx
2480 aarch64_emit_move (rtx dest, rtx src)
2481 {
2482 return (can_create_pseudo_p ()
2483 ? emit_move_insn (dest, src)
2484 : emit_move_insn_1 (dest, src));
2485 }
2486
2487 /* Apply UNOPTAB to OP and store the result in DEST. */
2488
2489 static void
2490 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2491 {
2492 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2493 if (dest != tmp)
2494 emit_move_insn (dest, tmp);
2495 }
2496
2497 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2498
2499 static void
2500 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2501 {
2502 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2503 OPTAB_DIRECT);
2504 if (dest != tmp)
2505 emit_move_insn (dest, tmp);
2506 }
2507
2508 /* Split a 128-bit move operation into two 64-bit move operations,
2509 taking care to handle partial overlap of register to register
2510 copies. Special cases are needed when moving between GP regs and
2511 FP regs. SRC can be a register, constant or memory; DST a register
2512 or memory. If either operand is memory it must not have any side
2513 effects. */
2514 void
2515 aarch64_split_128bit_move (rtx dst, rtx src)
2516 {
2517 rtx dst_lo, dst_hi;
2518 rtx src_lo, src_hi;
2519
2520 machine_mode mode = GET_MODE (dst);
2521
2522 gcc_assert (mode == TImode || mode == TFmode);
2523 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2524 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2525
2526 if (REG_P (dst) && REG_P (src))
2527 {
2528 int src_regno = REGNO (src);
2529 int dst_regno = REGNO (dst);
2530
2531 /* Handle FP <-> GP regs. */
2532 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2533 {
2534 src_lo = gen_lowpart (word_mode, src);
2535 src_hi = gen_highpart (word_mode, src);
2536
2537 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2538 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2539 return;
2540 }
2541 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2542 {
2543 dst_lo = gen_lowpart (word_mode, dst);
2544 dst_hi = gen_highpart (word_mode, dst);
2545
2546 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2547 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2548 return;
2549 }
2550 }
2551
2552 dst_lo = gen_lowpart (word_mode, dst);
2553 dst_hi = gen_highpart (word_mode, dst);
2554 src_lo = gen_lowpart (word_mode, src);
2555 src_hi = gen_highpart_mode (word_mode, mode, src);
2556
2557 /* At most one pairing may overlap. */
2558 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2559 {
2560 aarch64_emit_move (dst_hi, src_hi);
2561 aarch64_emit_move (dst_lo, src_lo);
2562 }
2563 else
2564 {
2565 aarch64_emit_move (dst_lo, src_lo);
2566 aarch64_emit_move (dst_hi, src_hi);
2567 }
2568 }
2569
2570 bool
2571 aarch64_split_128bit_move_p (rtx dst, rtx src)
2572 {
2573 return (! REG_P (src)
2574 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2575 }
2576
2577 /* Split a complex SIMD combine. */
2578
2579 void
2580 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2581 {
2582 machine_mode src_mode = GET_MODE (src1);
2583 machine_mode dst_mode = GET_MODE (dst);
2584
2585 gcc_assert (VECTOR_MODE_P (dst_mode));
2586 gcc_assert (register_operand (dst, dst_mode)
2587 && register_operand (src1, src_mode)
2588 && register_operand (src2, src_mode));
2589
2590 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2591 return;
2592 }
2593
2594 /* Split a complex SIMD move. */
2595
2596 void
2597 aarch64_split_simd_move (rtx dst, rtx src)
2598 {
2599 machine_mode src_mode = GET_MODE (src);
2600 machine_mode dst_mode = GET_MODE (dst);
2601
2602 gcc_assert (VECTOR_MODE_P (dst_mode));
2603
2604 if (REG_P (dst) && REG_P (src))
2605 {
2606 gcc_assert (VECTOR_MODE_P (src_mode));
2607 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2608 }
2609 }
2610
2611 bool
2612 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2613 machine_mode ymode, rtx y)
2614 {
2615 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2616 gcc_assert (r != NULL);
2617 return rtx_equal_p (x, r);
2618 }
2619
2620 /* Return TARGET if it is nonnull and a register of mode MODE.
2621 Otherwise, return a fresh register of mode MODE if we can,
2622 or TARGET reinterpreted as MODE if we can't. */
2623
2624 static rtx
2625 aarch64_target_reg (rtx target, machine_mode mode)
2626 {
2627 if (target && REG_P (target) && GET_MODE (target) == mode)
2628 return target;
2629 if (!can_create_pseudo_p ())
2630 {
2631 gcc_assert (target);
2632 return gen_lowpart (mode, target);
2633 }
2634 return gen_reg_rtx (mode);
2635 }
2636
2637 /* Return a register that contains the constant in BUILDER, given that
2638 the constant is a legitimate move operand. Use TARGET as the register
2639 if it is nonnull and convenient. */
2640
2641 static rtx
2642 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
2643 {
2644 rtx src = builder.build ();
2645 target = aarch64_target_reg (target, GET_MODE (src));
2646 emit_insn (gen_rtx_SET (target, src));
2647 return target;
2648 }
2649
2650 static rtx
2651 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2652 {
2653 if (can_create_pseudo_p ())
2654 return force_reg (mode, value);
2655 else
2656 {
2657 gcc_assert (x);
2658 aarch64_emit_move (x, value);
2659 return x;
2660 }
2661 }
2662
2663 /* Return true if predicate value X is a constant in which every element
2664 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2665 value, i.e. as a predicate in which all bits are significant. */
2666
2667 static bool
2668 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
2669 {
2670 if (GET_CODE (x) != CONST_VECTOR)
2671 return false;
2672
2673 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
2674 GET_MODE_NUNITS (GET_MODE (x)));
2675 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
2676 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
2677 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
2678
2679 unsigned int nelts = const_vector_encoded_nelts (x);
2680 for (unsigned int i = 0; i < nelts; ++i)
2681 {
2682 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
2683 if (!CONST_INT_P (elt))
2684 return false;
2685
2686 builder.quick_push (elt);
2687 for (unsigned int j = 1; j < factor; ++j)
2688 builder.quick_push (const0_rtx);
2689 }
2690 builder.finalize ();
2691 return true;
2692 }
2693
2694 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2695 widest predicate element size it can have (that is, the largest size
2696 for which each element would still be 0 or 1). */
2697
2698 unsigned int
2699 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
2700 {
2701 /* Start with the most optimistic assumption: that we only need
2702 one bit per pattern. This is what we will use if only the first
2703 bit in each pattern is ever set. */
2704 unsigned int mask = GET_MODE_SIZE (DImode);
2705 mask |= builder.npatterns ();
2706
2707 /* Look for set bits. */
2708 unsigned int nelts = builder.encoded_nelts ();
2709 for (unsigned int i = 1; i < nelts; ++i)
2710 if (INTVAL (builder.elt (i)) != 0)
2711 {
2712 if (i & 1)
2713 return 1;
2714 mask |= i;
2715 }
2716 return mask & -mask;
2717 }
2718
2719 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
2720 that the constant would have with predicate element size ELT_SIZE
2721 (ignoring the upper bits in each element) and return:
2722
2723 * -1 if all bits are set
2724 * N if the predicate has N leading set bits followed by all clear bits
2725 * 0 if the predicate does not have any of these forms. */
2726
2727 int
2728 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
2729 unsigned int elt_size)
2730 {
2731 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
2732 followed by set bits. */
2733 if (builder.nelts_per_pattern () == 3)
2734 return 0;
2735
2736 /* Skip over leading set bits. */
2737 unsigned int nelts = builder.encoded_nelts ();
2738 unsigned int i = 0;
2739 for (; i < nelts; i += elt_size)
2740 if (INTVAL (builder.elt (i)) == 0)
2741 break;
2742 unsigned int vl = i / elt_size;
2743
2744 /* Check for the all-true case. */
2745 if (i == nelts)
2746 return -1;
2747
2748 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
2749 repeating pattern of set bits followed by clear bits. */
2750 if (builder.nelts_per_pattern () != 2)
2751 return 0;
2752
2753 /* We have a "foreground" value and a duplicated "background" value.
2754 If the background might repeat and the last set bit belongs to it,
2755 we might have set bits followed by clear bits followed by set bits. */
2756 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
2757 return 0;
2758
2759 /* Make sure that the rest are all clear. */
2760 for (; i < nelts; i += elt_size)
2761 if (INTVAL (builder.elt (i)) != 0)
2762 return 0;
2763
2764 return vl;
2765 }
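
/* For example, given the VNx16BI encoding of a predicate whose first six
   byte elements are active, the function returns 6 for ELT_SIZE == 1 and 3
   for ELT_SIZE == 2 (only every second bit is inspected).  An all-active
   predicate returns -1, and a predicate with a gap, such as elements 0-3
   and 5 active, returns 0.  */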
2766
2767 /* See if there is an svpattern that encodes an SVE predicate of mode
2768 PRED_MODE in which the first VL bits are set and the rest are clear.
2769 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
2770 A VL of -1 indicates an all-true vector. */
2771
2772 aarch64_svpattern
2773 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
2774 {
2775 if (vl < 0)
2776 return AARCH64_SV_ALL;
2777
2778 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
2779 return AARCH64_NUM_SVPATTERNS;
2780
2781 if (vl >= 1 && vl <= 8)
2782 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
2783
2784 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
2785 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
2786
2787 int max_vl;
2788 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
2789 {
2790 if (vl == (max_vl / 3) * 3)
2791 return AARCH64_SV_MUL3;
2792 /* These would only trigger for non-power-of-2 lengths. */
2793 if (vl == (max_vl & -4))
2794 return AARCH64_SV_MUL4;
2795 if (vl == (1 << floor_log2 (max_vl)))
2796 return AARCH64_SV_POW2;
2797 if (vl == max_vl)
2798 return AARCH64_SV_ALL;
2799 }
2800 return AARCH64_NUM_SVPATTERNS;
2801 }
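
/* For example, VL == 7 maps to AARCH64_SV_VL7 and VL == 16 maps to
   AARCH64_SV_VL16.  Other lengths are only representable when the number
   of elements is a compile-time constant; e.g. with a known maximum of 16
   elements, VL == 15 matches AARCH64_SV_MUL3.  */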
2802
2803 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
2804 bits has the lowest bit set and the upper bits clear. This is the
2805 VNx16BImode equivalent of a PTRUE for controlling elements of
2806 ELT_SIZE bytes. However, because the constant is VNx16BImode,
2807 all bits are significant, even the upper zeros. */
2808
2809 rtx
2810 aarch64_ptrue_all (unsigned int elt_size)
2811 {
2812 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
2813 builder.quick_push (const1_rtx);
2814 for (unsigned int i = 1; i < elt_size; ++i)
2815 builder.quick_push (const0_rtx);
2816 return builder.build ();
2817 }
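
/* For example, aarch64_ptrue_all (4) builds the repeating bit pattern
   { 1, 0, 0, 0 }, i.e. one significant bit per 4-byte data element, with
   the three upper bits of each 4-bit group explicitly zero.  */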
2818
2819 /* Return an all-true predicate register of mode MODE. */
2820
2821 rtx
2822 aarch64_ptrue_reg (machine_mode mode)
2823 {
2824 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2825 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
2826 return gen_lowpart (mode, reg);
2827 }
2828
2829 /* Return an all-false predicate register of mode MODE. */
2830
2831 rtx
2832 aarch64_pfalse_reg (machine_mode mode)
2833 {
2834 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2835 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
2836 return gen_lowpart (mode, reg);
2837 }
2838
2839 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
2840 true, or alternatively if we know that the operation predicated by
2841 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is an
2842 aarch64_sve_gp_strictness operand that describes the operation
2843 predicated by PRED1[0]. */
2844
2845 bool
2846 aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
2847 {
2848 machine_mode mode = GET_MODE (pred2);
2849 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2850 && mode == GET_MODE (pred1[0])
2851 && aarch64_sve_gp_strictness (pred1[1], SImode));
2852 return (pred1[0] == CONSTM1_RTX (mode)
2853 || INTVAL (pred1[1]) == SVE_RELAXED_GP
2854 || rtx_equal_p (pred1[0], pred2));
2855 }
2856
2857 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
2858 for it. PRED2[0] is the predicate for the instruction whose result
2859 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
2860 for it. Return true if we can prove that the two predicates are
2861 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
2862 with PRED1[0] without changing behavior. */
2863
2864 bool
2865 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
2866 {
2867 machine_mode mode = GET_MODE (pred1[0]);
2868 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2869 && mode == GET_MODE (pred2[0])
2870 && aarch64_sve_ptrue_flag (pred1[1], SImode)
2871 && aarch64_sve_ptrue_flag (pred2[1], SImode));
2872
2873 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
2874 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
2875 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
2876 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
2877 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
2878 }
2879
2880 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
2881 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
2882 Use TARGET as the target register if nonnull and convenient. */
2883
2884 static rtx
2885 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
2886 machine_mode data_mode, rtx op1, rtx op2)
2887 {
2888 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
2889 expand_operand ops[5];
2890 create_output_operand (&ops[0], target, pred_mode);
2891 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
2892 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
2893 create_input_operand (&ops[3], op1, data_mode);
2894 create_input_operand (&ops[4], op2, data_mode);
2895 expand_insn (icode, 5, ops);
2896 return ops[0].value;
2897 }
2898
2899 /* Use a comparison to convert integer vector SRC into MODE, which is
2900 the corresponding SVE predicate mode. Use TARGET for the result
2901 if it's nonnull and convenient. */
2902
2903 static rtx
2904 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
2905 {
2906 machine_mode src_mode = GET_MODE (src);
2907 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
2908 src, CONST0_RTX (src_mode));
2909 }
2910
2911 /* Return true if we can move VALUE into a register using a single
2912 CNT[BHWD] instruction. */
2913
2914 static bool
2915 aarch64_sve_cnt_immediate_p (poly_int64 value)
2916 {
2917 HOST_WIDE_INT factor = value.coeffs[0];
2918 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2919 return (value.coeffs[1] == factor
2920 && IN_RANGE (factor, 2, 16 * 16)
2921 && (factor & 1) == 0
2922 && factor <= 16 * (factor & -factor));
2923 }
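
/* For example, the poly_int 16 + 16x (the number of bytes in an SVE vector)
   satisfies the conditions and corresponds to CNTB, while 6 + 6x
   corresponds to CNTD with MUL #3.  3 + 3x fails because the factor is odd,
   and 34 + 34x fails because 34 > 16 * 2.  */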
2924
2925 /* Likewise for rtx X. */
2926
2927 bool
2928 aarch64_sve_cnt_immediate_p (rtx x)
2929 {
2930 poly_int64 value;
2931 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2932 }
2933
2934 /* Return the asm string for an instruction with a CNT-like vector size
2935 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2936 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2937 first part of the operands template (the part that comes before the
2938 vector size itself). PATTERN is the pattern to use. FACTOR is the
2939 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
2940 in each quadword. If it is zero, we can use any element size. */
2941
2942 static char *
2943 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2944 aarch64_svpattern pattern,
2945 unsigned int factor,
2946 unsigned int nelts_per_vq)
2947 {
2948 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
2949
2950 if (nelts_per_vq == 0)
2951 /* There is some overlap in the ranges of the four CNT instructions.
2952 Here we always use the smallest possible element size, so that the
2953 multiplier is 1 wherever possible. */
2954 nelts_per_vq = factor & -factor;
2955 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2956 gcc_assert (IN_RANGE (shift, 1, 4));
2957 char suffix = "dwhb"[shift - 1];
2958
2959 factor >>= shift;
2960 unsigned int written;
2961 if (pattern == AARCH64_SV_ALL && factor == 1)
2962 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2963 prefix, suffix, operands);
2964 else if (factor == 1)
2965 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
2966 prefix, suffix, operands, svpattern_token (pattern));
2967 else
2968 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
2969 prefix, suffix, operands, svpattern_token (pattern),
2970 factor);
2971 gcc_assert (written < sizeof (buffer));
2972 return buffer;
2973 }
2974
2975 /* Return the asm string for an instruction with a CNT-like vector size
2976 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2977 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2978 first part of the operands template (the part that comes before the
2979 vector size itself). X is the value of the vector size operand,
2980 as a polynomial integer rtx; we need to convert this into an "all"
2981 pattern with a multiplier. */
2982
2983 char *
2984 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2985 rtx x)
2986 {
2987 poly_int64 value = rtx_to_poly_int64 (x);
2988 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2989 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
2990 value.coeffs[1], 0);
2991 }
2992
2993 /* Return true if we can add X using a single SVE INC or DEC instruction. */
2994
2995 bool
2996 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
2997 {
2998 poly_int64 value;
2999 return (poly_int_rtx_p (x, &value)
3000 && (aarch64_sve_cnt_immediate_p (value)
3001 || aarch64_sve_cnt_immediate_p (-value)));
3002 }
3003
3004 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
3005 operand 0. */
3006
3007 char *
3008 aarch64_output_sve_scalar_inc_dec (rtx offset)
3009 {
3010 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3011 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
3012 if (offset_value.coeffs[1] > 0)
3013 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
3014 offset_value.coeffs[1], 0);
3015 else
3016 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
3017 -offset_value.coeffs[1], 0);
3018 }
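
/* For example, an OFFSET of 16 + 16x produces "incb\t%x0", while an OFFSET
   of -2 - 2x produces "decd\t%x0" (CNTD counts two elements per 128 bits,
   so no multiplier is needed).  */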
3019
3020 /* Return true if we can add VALUE to a register using a single ADDVL
3021 or ADDPL instruction. */
3022
3023 static bool
3024 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
3025 {
3026 HOST_WIDE_INT factor = value.coeffs[0];
3027 if (factor == 0 || value.coeffs[1] != factor)
3028 return false;
3029 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
3030 and a value of 16 is one vector width. */
3031 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
3032 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
3033 }
3034
3035 /* Likewise for rtx X. */
3036
3037 bool
3038 aarch64_sve_addvl_addpl_immediate_p (rtx x)
3039 {
3040 poly_int64 value;
3041 return (poly_int_rtx_p (x, &value)
3042 && aarch64_sve_addvl_addpl_immediate_p (value));
3043 }
3044
3045 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3046 to operand 1 and storing the result in operand 0. */
3047
3048 char *
3049 aarch64_output_sve_addvl_addpl (rtx offset)
3050 {
3051 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3052 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3053 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
3054
3055 int factor = offset_value.coeffs[1];
3056 if ((factor & 15) == 0)
3057 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
3058 else
3059 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
3060 return buffer;
3061 }
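
/* For example, an OFFSET of 32 + 32x (two full vectors) produces
   "addvl\t%x0, %x1, #2", while 6 + 6x (three predicate widths) produces
   "addpl\t%x0, %x1, #3".  */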
3062
3063 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3064 instruction. If it is, store the number of elements in each vector
3065 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3066 factor in *FACTOR_OUT (if nonnull). */
3067
3068 bool
3069 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
3070 unsigned int *nelts_per_vq_out)
3071 {
3072 rtx elt;
3073 poly_int64 value;
3074
3075 if (!const_vec_duplicate_p (x, &elt)
3076 || !poly_int_rtx_p (elt, &value))
3077 return false;
3078
3079 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
3080 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
3081 /* There's no vector INCB. */
3082 return false;
3083
3084 HOST_WIDE_INT factor = value.coeffs[0];
3085 if (value.coeffs[1] != factor)
3086 return false;
3087
3088 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3089 if ((factor % nelts_per_vq) != 0
3090 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
3091 return false;
3092
3093 if (factor_out)
3094 *factor_out = factor;
3095 if (nelts_per_vq_out)
3096 *nelts_per_vq_out = nelts_per_vq;
3097 return true;
3098 }
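
/* For example, a VNx4SI constant in which every element is 8 + 8x (twice
   the number of 32-bit elements in an SVE vector) is accepted with
   *FACTOR_OUT == 8 and *NELTS_PER_VQ_OUT == 4, and is later emitted as an
   INCW with MUL #2.  A duplicate of 6 + 6x is rejected because 6 is not a
   multiple of 4.  */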
3099
3100 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3101 instruction. */
3102
3103 bool
3104 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
3105 {
3106 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
3107 }
3108
3109 /* Return the asm template for an SVE vector INC or DEC instruction.
3110 OPERANDS gives the operands before the vector count and X is the
3111 value of the vector count operand itself. */
3112
3113 char *
3114 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
3115 {
3116 int factor;
3117 unsigned int nelts_per_vq;
3118 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3119 gcc_unreachable ();
3120 if (factor < 0)
3121 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
3122 -factor, nelts_per_vq);
3123 else
3124 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
3125 factor, nelts_per_vq);
3126 }
3127
3128 static int
3129 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
3130 scalar_int_mode mode)
3131 {
3132 int i;
3133 unsigned HOST_WIDE_INT val, val2, mask;
3134 int one_match, zero_match;
3135 int num_insns;
3136
3137 val = INTVAL (imm);
3138
3139 if (aarch64_move_imm (val, mode))
3140 {
3141 if (generate)
3142 emit_insn (gen_rtx_SET (dest, imm));
3143 return 1;
3144 }
3145
3146 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3147 (with XXXX non-zero). In that case check to see if the move can be done in
3148 a smaller mode. */
3149 val2 = val & 0xffffffff;
3150 if (mode == DImode
3151 && aarch64_move_imm (val2, SImode)
3152 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
3153 {
3154 if (generate)
3155 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3156
3157 /* Check if we have to emit a second instruction by checking to see
3158 if any of the upper 32 bits of the original DI mode value is set. */
3159 if (val == val2)
3160 return 1;
3161
3162 i = (val >> 48) ? 48 : 32;
3163
3164 if (generate)
3165 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3166 GEN_INT ((val >> i) & 0xffff)));
3167
3168 return 2;
3169 }
3170
3171 if ((val >> 32) == 0 || mode == SImode)
3172 {
3173 if (generate)
3174 {
3175 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
3176 if (mode == SImode)
3177 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
3178 GEN_INT ((val >> 16) & 0xffff)));
3179 else
3180 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
3181 GEN_INT ((val >> 16) & 0xffff)));
3182 }
3183 return 2;
3184 }
3185
3186 /* Remaining cases are all for DImode. */
3187
3188 mask = 0xffff;
3189 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
3190 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
3191 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
3192 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
3193
3194 if (zero_match != 2 && one_match != 2)
3195 {
3196 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3197 For a 64-bit bitmask try whether changing 16 bits to all ones or
3198 zeroes creates a valid bitmask. To check any repeated bitmask,
3199 try using 16 bits from the other 32-bit half of val. */
3200
3201 for (i = 0; i < 64; i += 16, mask <<= 16)
3202 {
3203 val2 = val & ~mask;
3204 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3205 break;
3206 val2 = val | mask;
3207 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3208 break;
3209 val2 = val2 & ~mask;
3210 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
3211 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3212 break;
3213 }
3214 if (i != 64)
3215 {
3216 if (generate)
3217 {
3218 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3219 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3220 GEN_INT ((val >> i) & 0xffff)));
3221 }
3222 return 2;
3223 }
3224 }
3225
3226 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3227 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3228 otherwise skip zero bits. */
3229
3230 num_insns = 1;
3231 mask = 0xffff;
3232 val2 = one_match > zero_match ? ~val : val;
3233 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
3234
3235 if (generate)
3236 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
3237 ? (val | ~(mask << i))
3238 : (val & (mask << i)))));
3239 for (i += 16; i < 64; i += 16)
3240 {
3241 if ((val2 & (mask << i)) == 0)
3242 continue;
3243 if (generate)
3244 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3245 GEN_INT ((val >> i) & 0xffff)));
3246 num_insns ++;
3247 }
3248
3249 return num_insns;
3250 }
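
/* As a worked example, moving 0x0000123400005678 into a DImode register is
   not a single MOVZ/MOVN/bitmask immediate, but the low 32 bits are a valid
   SImode immediate and bits [47:32] are the only other nonzero field, so
   the function emits two instructions:

   mov  x0, #0x5678
   movk x0, #0x1234, lsl #32

   and returns 2.  (Register x0 is just illustrative.)  */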
3251
3252 /* Return whether imm is a 128-bit immediate which is simple enough to
3253 expand inline. */
3254 bool
3255 aarch64_mov128_immediate (rtx imm)
3256 {
3257 if (GET_CODE (imm) == CONST_INT)
3258 return true;
3259
3260 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
3261
3262 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
3263 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
3264
3265 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
3266 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
3267 }
3268
3269
3270 /* Return the number of temporary registers that aarch64_add_offset_1
3271 would need to add OFFSET to a register. */
3272
3273 static unsigned int
3274 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
3275 {
3276 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
3277 }
3278
3279 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3280 a non-polynomial OFFSET. MODE is the mode of the addition.
3281 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3282 be set and CFA adjustments added to the generated instructions.
3283
3284 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3285 temporary if register allocation is already complete. This temporary
3286 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3287 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3288 the immediate again.
3289
3290 Since this function may be used to adjust the stack pointer, we must
3291 ensure that it cannot cause transient stack deallocation (for example
3292 by first incrementing SP and then decrementing when adjusting by a
3293 large immediate). */
3294
3295 static void
3296 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
3297 rtx src, HOST_WIDE_INT offset, rtx temp1,
3298 bool frame_related_p, bool emit_move_imm)
3299 {
3300 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3301 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3302
3303 HOST_WIDE_INT moffset = abs_hwi (offset);
3304 rtx_insn *insn;
3305
3306 if (!moffset)
3307 {
3308 if (!rtx_equal_p (dest, src))
3309 {
3310 insn = emit_insn (gen_rtx_SET (dest, src));
3311 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3312 }
3313 return;
3314 }
3315
3316 /* Single instruction adjustment. */
3317 if (aarch64_uimm12_shift (moffset))
3318 {
3319 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
3320 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3321 return;
3322 }
3323
3324 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3325 and either:
3326
3327 a) the offset cannot be loaded by a 16-bit move or
3328 b) there is no spare register into which we can move it. */
3329 if (moffset < 0x1000000
3330 && ((!temp1 && !can_create_pseudo_p ())
3331 || !aarch64_move_imm (moffset, mode)))
3332 {
3333 HOST_WIDE_INT low_off = moffset & 0xfff;
3334
3335 low_off = offset < 0 ? -low_off : low_off;
3336 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
3337 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3338 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
3339 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3340 return;
3341 }
3342
3343 /* Emit a move immediate if required and an addition/subtraction. */
3344 if (emit_move_imm)
3345 {
3346 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
3347 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
3348 }
3349 insn = emit_insn (offset < 0
3350 ? gen_sub3_insn (dest, src, temp1)
3351 : gen_add3_insn (dest, src, temp1));
3352 if (frame_related_p)
3353 {
3354 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3355 rtx adj = plus_constant (mode, src, offset);
3356 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
3357 }
3358 }
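
/* For example, adding the constant 0x123456 to a register after register
   allocation, with no spare temporary, uses the two-addition path above:

   add  x0, x1, #0x456
   add  x0, x0, #0x123000

   whereas with a temporary available the offset is moved into it first and
   a single register-register addition is emitted.  (Register numbers are
   illustrative.)  */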
3359
3360 /* Return the number of temporary registers that aarch64_add_offset
3361 would need to move OFFSET into a register or add OFFSET to a register;
3362 ADD_P is true if we want the latter rather than the former. */
3363
3364 static unsigned int
3365 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
3366 {
3367 /* This follows the same structure as aarch64_add_offset. */
3368 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
3369 return 0;
3370
3371 unsigned int count = 0;
3372 HOST_WIDE_INT factor = offset.coeffs[1];
3373 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3374 poly_int64 poly_offset (factor, factor);
3375 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3376 /* Need one register for the ADDVL/ADDPL result. */
3377 count += 1;
3378 else if (factor != 0)
3379 {
3380 factor = abs (factor);
3381 if (factor > 16 * (factor & -factor))
3382 /* Need one register for the CNT result and one for the multiplication
3383 factor. If necessary, the second temporary can be reused for the
3384 constant part of the offset. */
3385 return 2;
3386 /* Need one register for the CNT result (which might then
3387 be shifted). */
3388 count += 1;
3389 }
3390 return count + aarch64_add_offset_1_temporaries (constant);
3391 }
3392
3393 /* If X can be represented as a poly_int64, return the number
3394 of temporaries that are required to add it to a register.
3395 Return -1 otherwise. */
3396
3397 int
3398 aarch64_add_offset_temporaries (rtx x)
3399 {
3400 poly_int64 offset;
3401 if (!poly_int_rtx_p (x, &offset))
3402 return -1;
3403 return aarch64_offset_temporaries (true, offset);
3404 }
3405
3406 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3407 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3408 be set and CFA adjustments added to the generated instructions.
3409
3410 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3411 temporary if register allocation is already complete. This temporary
3412 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3413 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3414 false to avoid emitting the immediate again.
3415
3416 TEMP2, if nonnull, is a second temporary register that doesn't
3417 overlap either DEST or REG.
3418
3419 Since this function may be used to adjust the stack pointer, we must
3420 ensure that it cannot cause transient stack deallocation (for example
3421 by first incrementing SP and then decrementing when adjusting by a
3422 large immediate). */
3423
3424 static void
3425 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3426 poly_int64 offset, rtx temp1, rtx temp2,
3427 bool frame_related_p, bool emit_move_imm = true)
3428 {
3429 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3430 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3431 gcc_assert (temp1 == NULL_RTX
3432 || !frame_related_p
3433 || !reg_overlap_mentioned_p (temp1, dest));
3434 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3435
3436 /* Try using ADDVL or ADDPL to add the whole value. */
3437 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3438 {
3439 rtx offset_rtx = gen_int_mode (offset, mode);
3440 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3441 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3442 return;
3443 }
3444
3445 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3446 SVE vector register, over and above the minimum size of 128 bits.
3447 This is equivalent to half the value returned by CNTD with a
3448 vector shape of ALL. */
3449 HOST_WIDE_INT factor = offset.coeffs[1];
3450 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3451
3452 /* Try using ADDVL or ADDPL to add the VG-based part. */
3453 poly_int64 poly_offset (factor, factor);
3454 if (src != const0_rtx
3455 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3456 {
3457 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3458 if (frame_related_p)
3459 {
3460 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3461 RTX_FRAME_RELATED_P (insn) = true;
3462 src = dest;
3463 }
3464 else
3465 {
3466 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3467 src = aarch64_force_temporary (mode, temp1, addr);
3468 temp1 = temp2;
3469 temp2 = NULL_RTX;
3470 }
3471 }
3472 /* Otherwise use a CNT-based sequence. */
3473 else if (factor != 0)
3474 {
3475 /* Use a subtraction if we have a negative factor. */
3476 rtx_code code = PLUS;
3477 if (factor < 0)
3478 {
3479 factor = -factor;
3480 code = MINUS;
3481 }
3482
3483 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3484 into the multiplication. */
3485 rtx val;
3486 int shift = 0;
3487 if (factor & 1)
3488 /* Use a right shift by 1. */
3489 shift = -1;
3490 else
3491 factor /= 2;
3492 HOST_WIDE_INT low_bit = factor & -factor;
3493 if (factor <= 16 * low_bit)
3494 {
3495 if (factor > 16 * 8)
3496 {
3497 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3498 the value with the minimum multiplier and shift it into
3499 position. */
3500 int extra_shift = exact_log2 (low_bit);
3501 shift += extra_shift;
3502 factor >>= extra_shift;
3503 }
3504 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3505 }
3506 else
3507 {
3508 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
3509 directly, since that should increase the chances of being
3510 able to use a shift and add sequence. If LOW_BIT itself
3511 is out of range, just use CNTD. */
3512 if (low_bit <= 16 * 8)
3513 factor /= low_bit;
3514 else
3515 low_bit = 1;
3516
3517 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
3518 val = aarch64_force_temporary (mode, temp1, val);
3519
3520 if (can_create_pseudo_p ())
3521 {
3522 rtx coeff1 = gen_int_mode (factor, mode);
3523 val = expand_mult (mode, val, coeff1, NULL_RTX, false, true);
3524 }
3525 else
3526 {
3527 /* Go back to using a negative multiplication factor if we have
3528 no register from which to subtract. */
3529 if (code == MINUS && src == const0_rtx)
3530 {
3531 factor = -factor;
3532 code = PLUS;
3533 }
3534 rtx coeff1 = gen_int_mode (factor, mode);
3535 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3536 val = gen_rtx_MULT (mode, val, coeff1);
3537 }
3538 }
3539
3540 if (shift > 0)
3541 {
3542 /* Multiply by 1 << SHIFT. */
3543 val = aarch64_force_temporary (mode, temp1, val);
3544 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3545 }
3546 else if (shift == -1)
3547 {
3548 /* Divide by 2. */
3549 val = aarch64_force_temporary (mode, temp1, val);
3550 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3551 }
3552
3553 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3554 if (src != const0_rtx)
3555 {
3556 val = aarch64_force_temporary (mode, temp1, val);
3557 val = gen_rtx_fmt_ee (code, mode, src, val);
3558 }
3559 else if (code == MINUS)
3560 {
3561 val = aarch64_force_temporary (mode, temp1, val);
3562 val = gen_rtx_NEG (mode, val);
3563 }
3564
3565 if (constant == 0 || frame_related_p)
3566 {
3567 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3568 if (frame_related_p)
3569 {
3570 RTX_FRAME_RELATED_P (insn) = true;
3571 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3572 gen_rtx_SET (dest, plus_constant (Pmode, src,
3573 poly_offset)));
3574 }
3575 src = dest;
3576 if (constant == 0)
3577 return;
3578 }
3579 else
3580 {
3581 src = aarch64_force_temporary (mode, temp1, val);
3582 temp1 = temp2;
3583 temp2 = NULL_RTX;
3584 }
3585
3586 emit_move_imm = true;
3587 }
3588
3589 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3590 frame_related_p, emit_move_imm);
3591 }
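
/* Editor's note -- a worked example, not part of the original sources:
   in the CNT-based sequence above, FACTOR == 12 takes the even path and
   is halved to 6 before the constant is built, so no final shift is
   needed.  FACTOR == 11 is odd, so it is kept as-is and SHIFT is set to
   -1, and the result is divided by 2 at the end with an arithmetic shift
   right.  FACTOR == 512 is halved to 256, which exceeds 16 * 8, so it is
   reduced to 1 with SHIFT == 8 and the value is shifted left by 8
   afterwards.  */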
3592
3593 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3594 than a poly_int64. */
3595
3596 void
3597 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3598 rtx offset_rtx, rtx temp1, rtx temp2)
3599 {
3600 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3601 temp1, temp2, false);
3602 }
3603
3604 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3605 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3606 if TEMP1 already contains abs (DELTA). */
3607
3608 static inline void
3609 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3610 {
3611 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3612 temp1, temp2, true, emit_move_imm);
3613 }
3614
3615 /* Subtract DELTA from the stack pointer, marking the instructions
3616 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3617 if nonnull. */
3618
3619 static inline void
3620 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3621 bool emit_move_imm = true)
3622 {
3623 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3624 temp1, temp2, frame_related_p, emit_move_imm);
3625 }
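
/* Editor's sketch -- illustrative only, not part of the original sources:
   a caller with two free scratch registers could adjust the stack by a
   poly_int64 FRAME_SIZE roughly as follows, assuming R9/R10 happen to be
   usable as temporaries at that point:

	rtx tmp0 = gen_rtx_REG (Pmode, R9_REGNUM);
	rtx tmp1 = gen_rtx_REG (Pmode, R10_REGNUM);
	aarch64_sub_sp (tmp0, tmp1, frame_size, true);

   This allocates FRAME_SIZE bytes and, because FRAME_RELATED_P is true,
   marks the adjustment for CFI purposes.  */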
3626
3627 /* Set DEST to (vec_series BASE STEP). */
3628
3629 static void
3630 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3631 {
3632 machine_mode mode = GET_MODE (dest);
3633 scalar_mode inner = GET_MODE_INNER (mode);
3634
3635 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3636 if (!aarch64_sve_index_immediate_p (base))
3637 base = force_reg (inner, base);
3638 if (!aarch64_sve_index_immediate_p (step))
3639 step = force_reg (inner, step);
3640
3641 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3642 }
3643
3644 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
3645 register of mode MODE. Use TARGET for the result if it's nonnull
3646 and convenient.
3647
3648 The two vector modes must have the same element mode. The behavior
3649 is to duplicate architectural lane N of SRC into architectural lanes
3650 N + I * STEP of the result. On big-endian targets, architectural
3651 lane 0 of an Advanced SIMD vector is the last element of the vector
3652 in memory layout, so for big-endian targets this operation has the
3653 effect of reversing SRC before duplicating it. Callers need to
3654 account for this. */
3655
3656 rtx
3657 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
3658 {
3659 machine_mode src_mode = GET_MODE (src);
3660 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
3661 insn_code icode = (BYTES_BIG_ENDIAN
3662 ? code_for_aarch64_vec_duplicate_vq_be (mode)
3663 : code_for_aarch64_vec_duplicate_vq_le (mode));
3664
3665 unsigned int i = 0;
3666 expand_operand ops[3];
3667 create_output_operand (&ops[i++], target, mode);
3668 create_output_operand (&ops[i++], src, src_mode);
3669 if (BYTES_BIG_ENDIAN)
3670 {
3671 /* Create a PARALLEL describing the reversal of SRC. */
3672 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
3673 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
3674 nelts_per_vq - 1, -1);
3675 create_fixed_operand (&ops[i++], sel);
3676 }
3677 expand_insn (icode, i, ops);
3678 return ops[0].value;
3679 }
3680
3681 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
3682 the memory image into DEST. Return true on success. */
3683
3684 static bool
3685 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
3686 {
3687 src = force_const_mem (GET_MODE (src), src);
3688 if (!src)
3689 return false;
3690
3691 /* Make sure that the address is legitimate. */
3692 if (!aarch64_sve_ld1rq_operand_p (src))
3693 {
3694 rtx addr = force_reg (Pmode, XEXP (src, 0));
3695 src = replace_equiv_address (src, addr);
3696 }
3697
3698 machine_mode mode = GET_MODE (dest);
3699 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3700 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3701 rtx ptrue = aarch64_ptrue_reg (pred_mode);
3702 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
3703 return true;
3704 }
3705
3706 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
3707 SVE data mode and isn't a legitimate constant. Use TARGET for the
3708 result if convenient.
3709
3710 The returned register can have whatever mode seems most natural
3711 given the contents of SRC. */
3712
3713 static rtx
3714 aarch64_expand_sve_const_vector (rtx target, rtx src)
3715 {
3716 machine_mode mode = GET_MODE (src);
3717 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3718 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3719 scalar_mode elt_mode = GET_MODE_INNER (mode);
3720 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
3721 unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits;
3722
3723 if (nelts_per_pattern == 1 && encoded_bits == 128)
3724 {
3725 /* The constant is a duplicated quadword but can't be narrowed
3726 beyond a quadword. Get the memory image of the first quadword
3727 as a 128-bit vector and try using LD1RQ to load it from memory.
3728
3729 The effect for both endiannesses is to load memory lane N into
3730 architectural lanes N + I * STEP of the result. On big-endian
3731 targets, the layout of the 128-bit vector in an Advanced SIMD
3732 register would be different from its layout in an SVE register,
3733 but this 128-bit vector is a memory value only. */
3734 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3735 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
3736 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
3737 return target;
3738 }
3739
3740 if (nelts_per_pattern == 1 && encoded_bits < 128)
3741 {
3742 /* The vector is a repeating sequence of 64 bits or fewer.
3743 See if we can load them using an Advanced SIMD move and then
3744 duplicate it to fill a vector. This is better than using a GPR
3745 move because it keeps everything in the same register file. */
3746 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3747 rtx_vector_builder builder (vq_mode, npatterns, 1);
3748 for (unsigned int i = 0; i < npatterns; ++i)
3749 {
3750 /* We want memory lane N to go into architectural lane N,
3751 so reverse for big-endian targets. The DUP .Q pattern
3752 has a compensating reverse built-in. */
3753 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
3754 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
3755 }
3756 rtx vq_src = builder.build ();
3757 if (aarch64_simd_valid_immediate (vq_src, NULL))
3758 {
3759 vq_src = force_reg (vq_mode, vq_src);
3760 return aarch64_expand_sve_dupq (target, mode, vq_src);
3761 }
3762
3763 /* Get an integer representation of the repeating part of Advanced
3764 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
3765 which for big-endian targets is lane-swapped wrt a normal
3766 Advanced SIMD vector. This means that for both endiannesses,
3767 memory lane N of SVE vector SRC corresponds to architectural
3768 lane N of a register holding VQ_SRC. This in turn means that
3769 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
3770 as a single 128-bit value) and thus that memory lane 0 of SRC is
3771 in the lsb of the integer. Duplicating the integer therefore
3772 ensures that memory lane N of SRC goes into architectural lane
3773 N + I * INDEX of the SVE register. */
3774 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
3775 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
3776 if (elt_value)
3777 {
3778 /* Pretend that we had a vector of INT_MODE to start with. */
3779 elt_mode = int_mode;
3780 mode = aarch64_full_sve_mode (int_mode).require ();
3781
3782 /* If the integer can be moved into a general register by a
3783 single instruction, do that and duplicate the result. */
3784 if (CONST_INT_P (elt_value)
3785 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
3786 {
3787 elt_value = force_reg (elt_mode, elt_value);
3788 return expand_vector_broadcast (mode, elt_value);
3789 }
3790 }
3791 else if (npatterns == 1)
3792 /* We're duplicating a single value, but can't do better than
3793 force it to memory and load from there. This handles things
3794 like symbolic constants. */
3795 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
3796
3797 if (elt_value)
3798 {
3799 /* Load the element from memory if we can, otherwise move it into
3800 a register and use a DUP. */
3801 rtx op = force_const_mem (elt_mode, elt_value);
3802 if (!op)
3803 op = force_reg (elt_mode, elt_value);
3804 return expand_vector_broadcast (mode, op);
3805 }
3806 }
3807
3808 /* Try using INDEX. */
3809 rtx base, step;
3810 if (const_vec_series_p (src, &base, &step))
3811 {
3812 aarch64_expand_vec_series (target, base, step);
3813 return target;
3814 }
3815
3816 /* From here on, it's better to force the whole constant to memory
3817 if we can. */
3818 if (GET_MODE_NUNITS (mode).is_constant ())
3819 return NULL_RTX;
3820
3821 /* Expand each pattern individually. */
3822 gcc_assert (npatterns > 1);
3823 rtx_vector_builder builder;
3824 auto_vec<rtx, 16> vectors (npatterns);
3825 for (unsigned int i = 0; i < npatterns; ++i)
3826 {
3827 builder.new_vector (mode, 1, nelts_per_pattern);
3828 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3829 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3830 vectors.quick_push (force_reg (mode, builder.build ()));
3831 }
3832
3833 /* Use permutes to interleave the separate vectors. */
3834 while (npatterns > 1)
3835 {
3836 npatterns /= 2;
3837 for (unsigned int i = 0; i < npatterns; ++i)
3838 {
3839 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
3840 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3841 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3842 vectors[i] = tmp;
3843 }
3844 }
3845 gcc_assert (vectors[0] == target);
3846 return target;
3847 }
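
/* Editor's note -- a worked example, not part of the original sources:
   for a VNx4SI constant such as { 1, 2, 3, 4, 1, 2, 3, 4, ... } we have
   NPATTERNS == 4, NELTS_PER_PATTERN == 1 and 32-bit elements, so
   ENCODED_BITS == 128 and the routine above tries the LD1RQ path first.
   A shorter repeating constant such as { 5, 6, 5, 6, ... } has
   ENCODED_BITS == 64 and instead tries, in order, an Advanced SIMD
   immediate plus DUPQ, a general-register immediate plus broadcast, and
   a broadcast of the repeating 64-bit chunk loaded from memory or moved
   into a register.  */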
3848
3849 /* Use WHILE to set a predicate register of mode MODE in which the first
3850 VL bits are set and the rest are clear. Use TARGET for the register
3851 if it's nonnull and convenient. */
3852
3853 static rtx
3854 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
3855 unsigned int vl)
3856 {
3857 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
3858 target = aarch64_target_reg (target, mode);
3859 emit_insn (gen_while_ult (DImode, mode, target, const0_rtx, limit));
3860 return target;
3861 }
3862
3863 static rtx
3864 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
3865
3866 /* BUILDER is a constant predicate in which the index of every set bit
3867 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3868 by inverting every element at a multiple of ELT_SIZE and EORing the
3869 result with an ELT_SIZE PTRUE.
3870
3871 Return a register that contains the constant on success, otherwise
3872 return null. Use TARGET as the register if it is nonnull and
3873 convenient. */
3874
3875 static rtx
3876 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
3877 unsigned int elt_size)
3878 {
3879 /* Invert every element at a multiple of ELT_SIZE, keeping the
3880 other bits zero. */
3881 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
3882 builder.nelts_per_pattern ());
3883 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
3884 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
3885 inv_builder.quick_push (const1_rtx);
3886 else
3887 inv_builder.quick_push (const0_rtx);
3888 inv_builder.finalize ();
3889
3890 /* See if we can load the constant cheaply. */
3891 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
3892 if (!inv)
3893 return NULL_RTX;
3894
3895 /* EOR the result with an ELT_SIZE PTRUE. */
3896 rtx mask = aarch64_ptrue_all (elt_size);
3897 mask = force_reg (VNx16BImode, mask);
3898 target = aarch64_target_reg (target, VNx16BImode);
3899 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
3900 return target;
3901 }
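
/* Editor's note -- a worked example, not part of the original sources:
   suppose BUILDER describes the .B predicate { 0, 1, 1, 1, ... } with
   only the first element clear.  Inverting each element gives
   { 1, 0, 0, 0, ... }, which is a one-element partial PTRUE and so is
   cheap to load; EORing that with PTRUE .B, ALL then recreates the
   original constant.  */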
3902
3903 /* BUILDER is a constant predicate in which the index of every set bit
3904 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3905 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
3906 register on success, otherwise return null. Use TARGET as the register
3907 if nonnull and convenient. */
3908
3909 static rtx
3910 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
3911 unsigned int elt_size,
3912 unsigned int permute_size)
3913 {
3914 /* We're going to split the constant into two new constants A and B,
3915 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
3916 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
3917
3918 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
3919 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
3920
3921 where _ indicates elements that will be discarded by the permute.
3922
3923 First calculate the ELT_SIZEs for A and B. */
3924 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
3925 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
3926 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
3927 if (INTVAL (builder.elt (i)) != 0)
3928 {
3929 if (i & permute_size)
3930 b_elt_size |= i - permute_size;
3931 else
3932 a_elt_size |= i;
3933 }
3934 a_elt_size &= -a_elt_size;
3935 b_elt_size &= -b_elt_size;
3936
3937 /* Now construct the vectors themselves. */
3938 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
3939 builder.nelts_per_pattern ());
3940 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
3941 builder.nelts_per_pattern ());
3942 unsigned int nelts = builder.encoded_nelts ();
3943 for (unsigned int i = 0; i < nelts; ++i)
3944 if (i & (elt_size - 1))
3945 {
3946 a_builder.quick_push (const0_rtx);
3947 b_builder.quick_push (const0_rtx);
3948 }
3949 else if ((i & permute_size) == 0)
3950 {
3951 /* The A and B elements are significant. */
3952 a_builder.quick_push (builder.elt (i));
3953 b_builder.quick_push (builder.elt (i + permute_size));
3954 }
3955 else
3956 {
3957 /* The A and B elements are going to be discarded, so pick whatever
3958 is likely to give a nice constant. We are targeting element
3959 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
3960 with the aim of each being a sequence of ones followed by
3961 a sequence of zeros. So:
3962
3963 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
3964 duplicate the last X_ELT_SIZE element, to extend the
3965 current sequence of ones or zeros.
3966
3967 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
3968 zero, so that the constant really does have X_ELT_SIZE and
3969 not a smaller size. */
3970 if (a_elt_size > permute_size)
3971 a_builder.quick_push (const0_rtx);
3972 else
3973 a_builder.quick_push (a_builder.elt (i - a_elt_size));
3974 if (b_elt_size > permute_size)
3975 b_builder.quick_push (const0_rtx);
3976 else
3977 b_builder.quick_push (b_builder.elt (i - b_elt_size));
3978 }
3979 a_builder.finalize ();
3980 b_builder.finalize ();
3981
3982 /* Try loading A into a register. */
3983 rtx_insn *last = get_last_insn ();
3984 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
3985 if (!a)
3986 return NULL_RTX;
3987
3988 /* Try loading B into a register. */
3989 rtx b = a;
3990 if (a_builder != b_builder)
3991 {
3992 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
3993 if (!b)
3994 {
3995 delete_insns_since (last);
3996 return NULL_RTX;
3997 }
3998 }
3999
4000 /* Emit the TRN1 itself. */
4001 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
4002 target = aarch64_target_reg (target, mode);
4003 emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
4004 gen_lowpart (mode, a),
4005 gen_lowpart (mode, b)));
4006 return target;
4007 }
4008
4009 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
4010 constant in BUILDER into an SVE predicate register. Return the register
4011 on success, otherwise return null. Use TARGET for the register if
4012 nonnull and convenient.
4013
4014 ALLOW_RECURSE_P is true if we can use methods that would call this
4015 function recursively. */
4016
4017 static rtx
4018 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
4019 bool allow_recurse_p)
4020 {
4021 if (builder.encoded_nelts () == 1)
4022 /* A PFALSE or a PTRUE .B ALL. */
4023 return aarch64_emit_set_immediate (target, builder);
4024
4025 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
4026 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
4027 {
4028 /* If we can load the constant using PTRUE, use it as-is. */
4029 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
4030 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
4031 return aarch64_emit_set_immediate (target, builder);
4032
4033 /* Otherwise use WHILE to set the first VL bits. */
4034 return aarch64_sve_move_pred_via_while (target, mode, vl);
4035 }
4036
4037 if (!allow_recurse_p)
4038 return NULL_RTX;
4039
4040 /* Try inverting the vector in element size ELT_SIZE and then EORing
4041 the result with an ELT_SIZE PTRUE. */
4042 if (INTVAL (builder.elt (0)) == 0)
4043 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
4044 elt_size))
4045 return res;
4046
4047 /* Try using TRN1 to permute two simpler constants. */
4048 for (unsigned int i = elt_size; i <= 8; i *= 2)
4049 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
4050 elt_size, i))
4051 return res;
4052
4053 return NULL_RTX;
4054 }
4055
4056 /* Return an SVE predicate register that contains the VNx16BImode
4057 constant in BUILDER, without going through the move expanders.
4058
4059 The returned register can have whatever mode seems most natural
4060 given the contents of BUILDER. Use TARGET for the result if
4061 convenient. */
4062
4063 static rtx
4064 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
4065 {
4066 /* Try loading the constant using pure predicate operations. */
4067 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
4068 return res;
4069
4070 /* Try forcing the constant to memory. */
4071 if (builder.full_nelts ().is_constant ())
4072 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
4073 {
4074 target = aarch64_target_reg (target, VNx16BImode);
4075 emit_move_insn (target, mem);
4076 return target;
4077 }
4078
4079 /* The last resort is to load the constant as an integer and then
4080 compare it against zero. Use -1 for set bits in order to increase
4081 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
4082 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
4083 builder.nelts_per_pattern ());
4084 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4085 int_builder.quick_push (INTVAL (builder.elt (i))
4086 ? constm1_rtx : const0_rtx);
4087 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
4088 int_builder.build ());
4089 }
4090
4091 /* Set DEST to immediate IMM. */
4092
4093 void
4094 aarch64_expand_mov_immediate (rtx dest, rtx imm)
4095 {
4096 machine_mode mode = GET_MODE (dest);
4097
4098 /* Check on what type of symbol it is. */
4099 scalar_int_mode int_mode;
4100 if ((GET_CODE (imm) == SYMBOL_REF
4101 || GET_CODE (imm) == LABEL_REF
4102 || GET_CODE (imm) == CONST
4103 || GET_CODE (imm) == CONST_POLY_INT)
4104 && is_a <scalar_int_mode> (mode, &int_mode))
4105 {
4106 rtx mem;
4107 poly_int64 offset;
4108 HOST_WIDE_INT const_offset;
4109 enum aarch64_symbol_type sty;
4110
4111 /* If we have (const (plus symbol offset)), separate out the offset
4112 before we start classifying the symbol. */
4113 rtx base = strip_offset (imm, &offset);
4114
4115 /* We must always add an offset involving VL separately, rather than
4116 folding it into the relocation. */
4117 if (!offset.is_constant (&const_offset))
4118 {
4119 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
4120 emit_insn (gen_rtx_SET (dest, imm));
4121 else
4122 {
4123 /* Do arithmetic on 32-bit values if the result is smaller
4124 than that. */
4125 if (partial_subreg_p (int_mode, SImode))
4126 {
4127 /* It is invalid to do symbol calculations in modes
4128 narrower than SImode. */
4129 gcc_assert (base == const0_rtx);
4130 dest = gen_lowpart (SImode, dest);
4131 int_mode = SImode;
4132 }
4133 if (base != const0_rtx)
4134 {
4135 base = aarch64_force_temporary (int_mode, dest, base);
4136 aarch64_add_offset (int_mode, dest, base, offset,
4137 NULL_RTX, NULL_RTX, false);
4138 }
4139 else
4140 aarch64_add_offset (int_mode, dest, base, offset,
4141 dest, NULL_RTX, false);
4142 }
4143 return;
4144 }
4145
4146 sty = aarch64_classify_symbol (base, const_offset);
4147 switch (sty)
4148 {
4149 case SYMBOL_FORCE_TO_MEM:
4150 if (const_offset != 0
4151 && targetm.cannot_force_const_mem (int_mode, imm))
4152 {
4153 gcc_assert (can_create_pseudo_p ());
4154 base = aarch64_force_temporary (int_mode, dest, base);
4155 aarch64_add_offset (int_mode, dest, base, const_offset,
4156 NULL_RTX, NULL_RTX, false);
4157 return;
4158 }
4159
4160 mem = force_const_mem (ptr_mode, imm);
4161 gcc_assert (mem);
4162
4163 /* If we aren't generating PC relative literals, then
4164 we need to expand the literal pool access carefully.
4165 This is something that needs to be done in a number
4166 of places, so could well live as a separate function. */
4167 if (!aarch64_pcrelative_literal_loads)
4168 {
4169 gcc_assert (can_create_pseudo_p ());
4170 base = gen_reg_rtx (ptr_mode);
4171 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
4172 if (ptr_mode != Pmode)
4173 base = convert_memory_address (Pmode, base);
4174 mem = gen_rtx_MEM (ptr_mode, base);
4175 }
4176
4177 if (int_mode != ptr_mode)
4178 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
4179
4180 emit_insn (gen_rtx_SET (dest, mem));
4181
4182 return;
4183
4184 case SYMBOL_SMALL_TLSGD:
4185 case SYMBOL_SMALL_TLSDESC:
4186 case SYMBOL_SMALL_TLSIE:
4187 case SYMBOL_SMALL_GOT_28K:
4188 case SYMBOL_SMALL_GOT_4G:
4189 case SYMBOL_TINY_GOT:
4190 case SYMBOL_TINY_TLSIE:
4191 if (const_offset != 0)
4192 {
4193 gcc_assert (can_create_pseudo_p ());
4194 base = aarch64_force_temporary (int_mode, dest, base);
4195 aarch64_add_offset (int_mode, dest, base, const_offset,
4196 NULL_RTX, NULL_RTX, false);
4197 return;
4198 }
4199 /* FALLTHRU */
4200
4201 case SYMBOL_SMALL_ABSOLUTE:
4202 case SYMBOL_TINY_ABSOLUTE:
4203 case SYMBOL_TLSLE12:
4204 case SYMBOL_TLSLE24:
4205 case SYMBOL_TLSLE32:
4206 case SYMBOL_TLSLE48:
4207 aarch64_load_symref_appropriately (dest, imm, sty);
4208 return;
4209
4210 default:
4211 gcc_unreachable ();
4212 }
4213 }
4214
4215 if (!CONST_INT_P (imm))
4216 {
4217 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
4218 {
4219 /* Only the low bit of each .H, .S and .D element is defined,
4220 so we can set the upper bits to whatever we like. If the
4221 predicate is all-true in MODE, prefer to set all the undefined
4222 bits as well, so that we can share a single .B predicate for
4223 all modes. */
4224 if (imm == CONSTM1_RTX (mode))
4225 imm = CONSTM1_RTX (VNx16BImode);
4226
4227 /* All methods for constructing predicate modes wider than VNx16BI
4228 will set the upper bits of each element to zero. Expose this
4229 by moving such constants as a VNx16BI, so that all bits are
4230 significant and so that constants for different modes can be
4231 shared. The wider constant will still be available as a
4232 REG_EQUAL note. */
4233 rtx_vector_builder builder;
4234 if (aarch64_get_sve_pred_bits (builder, imm))
4235 {
4236 rtx res = aarch64_expand_sve_const_pred (dest, builder);
4237 if (dest != res)
4238 emit_move_insn (dest, gen_lowpart (mode, res));
4239 return;
4240 }
4241 }
4242
4243 if (GET_CODE (imm) == HIGH
4244 || aarch64_simd_valid_immediate (imm, NULL))
4245 {
4246 emit_insn (gen_rtx_SET (dest, imm));
4247 return;
4248 }
4249
4250 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
4251 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
4252 {
4253 if (dest != res)
4254 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
4255 return;
4256 }
4257
4258 rtx mem = force_const_mem (mode, imm);
4259 gcc_assert (mem);
4260 emit_move_insn (dest, mem);
4261 return;
4262 }
4263
4264 aarch64_internal_mov_immediate (dest, imm, true,
4265 as_a <scalar_int_mode> (mode));
4266 }
4267
4268 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
4269 that is known to contain PTRUE. */
4270
4271 void
4272 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
4273 {
4274 expand_operand ops[3];
4275 machine_mode mode = GET_MODE (dest);
4276 create_output_operand (&ops[0], dest, mode);
4277 create_input_operand (&ops[1], pred, GET_MODE (pred));
4278 create_input_operand (&ops[2], src, mode);
4279 temporary_volatile_ok v (true);
4280 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
4281 }
4282
4283 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
4284 operand is in memory. In this case we need to use the predicated LD1
4285 and ST1 instead of LDR and STR, both for correctness on big-endian
4286 targets and because LD1 and ST1 support a wider range of addressing modes.
4287 PRED_MODE is the mode of the predicate.
4288
4289 See the comment at the head of aarch64-sve.md for details about the
4290 big-endian handling. */
4291
4292 void
4293 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
4294 {
4295 machine_mode mode = GET_MODE (dest);
4296 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4297 if (!register_operand (src, mode)
4298 && !register_operand (dest, mode))
4299 {
4300 rtx tmp = gen_reg_rtx (mode);
4301 if (MEM_P (src))
4302 aarch64_emit_sve_pred_move (tmp, ptrue, src);
4303 else
4304 emit_move_insn (tmp, src);
4305 src = tmp;
4306 }
4307 aarch64_emit_sve_pred_move (dest, ptrue, src);
4308 }
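
/* Editor's note -- illustrative only, not part of the original sources:
   if both DEST and SRC above are memory operands (an SVE-sized block
   copy, say), the routine first does a predicated LD1 into a fresh
   temporary register and then a predicated ST1 from that temporary,
   since a single move cannot have two memory operands.  */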
4309
4310 /* Called only on big-endian targets. See whether an SVE vector move
4311 from SRC to DEST is effectively a REV[BHW] instruction, because at
4312 least one operand is a subreg of an SVE vector that has wider or
4313 narrower elements. Return true and emit the instruction if so.
4314
4315 For example:
4316
4317 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
4318
4319 represents a VIEW_CONVERT between the following vectors, viewed
4320 in memory order:
4321
4322 R2: { [0].high, [0].low, [1].high, [1].low, ... }
4323 R1: { [0], [1], [2], [3], ... }
4324
4325 The high part of lane X in R2 should therefore correspond to lane X*2
4326 of R1, but the register representations are:
4327
4328 msb lsb
4329 R2: ...... [1].high [1].low [0].high [0].low
4330 R1: ...... [3] [2] [1] [0]
4331
4332 where the low part of lane X in R2 corresponds to lane X*2 in R1.
4333 We therefore need a reverse operation to swap the high and low values
4334 around.
4335
4336 This is purely an optimization. Without it we would spill the
4337 subreg operand to the stack in one mode and reload it in the
4338 other mode, which has the same effect as the REV. */
4339
4340 bool
4341 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
4342 {
4343 gcc_assert (BYTES_BIG_ENDIAN);
4344 if (GET_CODE (dest) == SUBREG)
4345 dest = SUBREG_REG (dest);
4346 if (GET_CODE (src) == SUBREG)
4347 src = SUBREG_REG (src);
4348
4349 /* The optimization handles two single SVE REGs with different element
4350 sizes. */
4351 if (!REG_P (dest)
4352 || !REG_P (src)
4353 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
4354 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
4355 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
4356 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
4357 return false;
4358
4359 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
4360 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
4361 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
4362 UNSPEC_REV_SUBREG);
4363 emit_insn (gen_rtx_SET (dest, unspec));
4364 return true;
4365 }
4366
4367 /* Return a copy of X with mode MODE, without changing its other
4368 attributes. Unlike gen_lowpart, this doesn't care whether the
4369 mode change is valid. */
4370
4371 static rtx
4372 aarch64_replace_reg_mode (rtx x, machine_mode mode)
4373 {
4374 if (GET_MODE (x) == mode)
4375 return x;
4376
4377 x = shallow_copy_rtx (x);
4378 set_mode_and_regno (x, mode, REGNO (x));
4379 return x;
4380 }
4381
4382 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
4383 stored in wider integer containers. */
4384
4385 static unsigned int
4386 aarch64_sve_rev_unspec (machine_mode mode)
4387 {
4388 switch (GET_MODE_UNIT_SIZE (mode))
4389 {
4390 case 1: return UNSPEC_REVB;
4391 case 2: return UNSPEC_REVH;
4392 case 4: return UNSPEC_REVW;
4393 }
4394 gcc_unreachable ();
4395 }
4396
4397 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
4398 operands. */
4399
4400 void
4401 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
4402 {
4403 /* Decide which REV operation we need. The mode with wider elements
4404 determines the mode of the operands and the mode with the narrower
4405 elements determines the reverse width. */
4406 machine_mode mode_with_wider_elts = GET_MODE (dest);
4407 machine_mode mode_with_narrower_elts = GET_MODE (src);
4408 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
4409 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
4410 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
4411
4412 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
4413 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
4414 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
4415
4416 /* Get the operands in the appropriate modes and emit the instruction. */
4417 ptrue = gen_lowpart (pred_mode, ptrue);
4418 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
4419 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
4420 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
4421 dest, ptrue, src));
4422 }
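
/* Editor's note -- a worked example, not part of the original sources:
   for the big-endian case described above, a move between a VNx8HI
   register and a VNx16QI subreg has VNx8HI as the mode with the wider
   elements, so aarch64_sve_rev_unspec picks UNSPEC_REVB for the 1-byte
   quantities and the split emits a REVB with a .H governing predicate,
   i.e. a byte reverse within each halfword.  */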
4423
4424 static bool
4425 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
4426 tree exp ATTRIBUTE_UNUSED)
4427 {
4428 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
4429 return false;
4430
4431 return true;
4432 }
4433
4434 /* Implement TARGET_PASS_BY_REFERENCE. */
4435
4436 static bool
4437 aarch64_pass_by_reference (cumulative_args_t, const function_arg_info &arg)
4438 {
4439 HOST_WIDE_INT size;
4440 machine_mode dummymode;
4441 int nregs;
4442
4443 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
4444 if (arg.mode == BLKmode && arg.type)
4445 size = int_size_in_bytes (arg.type);
4446 else
4447 /* No frontends can create types with variable-sized modes, so we
4448 shouldn't be asked to pass or return them. */
4449 size = GET_MODE_SIZE (arg.mode).to_constant ();
4450
4451 /* Aggregates are passed by reference based on their size. */
4452 if (arg.aggregate_type_p ())
4453 size = int_size_in_bytes (arg.type);
4454
4455 /* Variable-sized arguments are always passed by reference. */
4456 if (size < 0)
4457 return true;
4458
4459 /* Can this be a candidate to be passed in fp/simd register(s)? */
4460 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
4461 &dummymode, &nregs,
4462 NULL))
4463 return false;
4464
4465 /* Arguments which are variable-sized or larger than 2 registers are
4466 passed by reference unless they are a homogeneous floating-point
4467 aggregate. */
4468 return size > 2 * UNITS_PER_WORD;
4469 }
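
/* Editor's example -- illustrative only, not part of the original
   sources.  Under the rules above, given

	struct big { long x, y, z; };		// 24 bytes, not an HFA
	struct hfa { double a, b, c, d; };	// 32 bytes, an HFA

   'struct big' is passed by reference because it is larger than two
   registers, while 'struct hfa' is a homogeneous floating-point
   aggregate and so remains a candidate for the SIMD/FP registers
   despite being 32 bytes.  */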
4470
4471 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4472 static bool
4473 aarch64_return_in_msb (const_tree valtype)
4474 {
4475 machine_mode dummy_mode;
4476 int dummy_int;
4477
4478 /* Never happens in little-endian mode. */
4479 if (!BYTES_BIG_ENDIAN)
4480 return false;
4481
4482 /* Only composite types smaller than or equal to 16 bytes can
4483 be potentially returned in registers. */
4484 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
4485 || int_size_in_bytes (valtype) <= 0
4486 || int_size_in_bytes (valtype) > 16)
4487 return false;
4488
4489 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4490 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4491 is always passed/returned in the least significant bits of fp/simd
4492 register(s). */
4493 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
4494 &dummy_mode, &dummy_int, NULL))
4495 return false;
4496
4497 return true;
4498 }
4499
4500 /* Implement TARGET_FUNCTION_VALUE.
4501 Define how to find the value returned by a function. */
4502
4503 static rtx
4504 aarch64_function_value (const_tree type, const_tree func,
4505 bool outgoing ATTRIBUTE_UNUSED)
4506 {
4507 machine_mode mode;
4508 int unsignedp;
4509 int count;
4510 machine_mode ag_mode;
4511
4512 mode = TYPE_MODE (type);
4513 if (INTEGRAL_TYPE_P (type))
4514 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
4515
4516 if (aarch64_return_in_msb (type))
4517 {
4518 HOST_WIDE_INT size = int_size_in_bytes (type);
4519
4520 if (size % UNITS_PER_WORD != 0)
4521 {
4522 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
4523 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
4524 }
4525 }
4526
4527 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4528 &ag_mode, &count, NULL))
4529 {
4530 if (!aarch64_composite_type_p (type, mode))
4531 {
4532 gcc_assert (count == 1 && mode == ag_mode);
4533 return gen_rtx_REG (mode, V0_REGNUM);
4534 }
4535 else
4536 {
4537 int i;
4538 rtx par;
4539
4540 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
4541 for (i = 0; i < count; i++)
4542 {
4543 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
4544 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
4545 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4546 XVECEXP (par, 0, i) = tmp;
4547 }
4548 return par;
4549 }
4550 }
4551 else
4552 return gen_rtx_REG (mode, R0_REGNUM);
4553 }
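
/* Editor's example -- illustrative only, not part of the original
   sources.  Given the rules above,

	struct hfa4 { float a, b, c, d; };
	struct hfa4 f (void);	// returned in s0-s3
	__int128 g (void);	// returned in x0/x1

   the HFA comes back as a PARALLEL of four SFmode registers starting at
   V0_REGNUM, while the 128-bit integer uses the first two general
   registers, matching aarch64_function_value_regno_p below.  */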
4554
4555 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
4556 Return true if REGNO is the number of a hard register in which the values
4557 of called function may come back. */
4558
4559 static bool
4560 aarch64_function_value_regno_p (const unsigned int regno)
4561 {
4562 /* Maximum of 16 bytes can be returned in the general registers. Examples
4563 of 16-byte return values are: 128-bit integers and 16-byte small
4564 structures (excluding homogeneous floating-point aggregates). */
4565 if (regno == R0_REGNUM || regno == R1_REGNUM)
4566 return true;
4567
4568 /* Up to four fp/simd registers can return a function value, e.g. a
4569 homogeneous floating-point aggregate having four members. */
4570 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
4571 return TARGET_FLOAT;
4572
4573 return false;
4574 }
4575
4576 /* Implement TARGET_RETURN_IN_MEMORY.
4577
4578 If the type T of the result of a function is such that
4579 void func (T arg)
4580 would require that arg be passed as a value in a register (or set of
4581 registers) according to the parameter passing rules, then the result
4582 is returned in the same registers as would be used for such an
4583 argument. */
4584
4585 static bool
4586 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
4587 {
4588 HOST_WIDE_INT size;
4589 machine_mode ag_mode;
4590 int count;
4591
4592 if (!AGGREGATE_TYPE_P (type)
4593 && TREE_CODE (type) != COMPLEX_TYPE
4594 && TREE_CODE (type) != VECTOR_TYPE)
4595 /* Simple scalar types are always returned in registers. */
4596 return false;
4597
4598 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
4599 type,
4600 &ag_mode,
4601 &count,
4602 NULL))
4603 return false;
4604
4605 /* Types larger than 2 registers are returned in memory. */
4606 size = int_size_in_bytes (type);
4607 return (size < 0 || size > 2 * UNITS_PER_WORD);
4608 }
4609
4610 static bool
4611 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
4612 const_tree type, int *nregs)
4613 {
4614 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4615 return aarch64_vfp_is_call_or_return_candidate (mode,
4616 type,
4617 &pcum->aapcs_vfp_rmode,
4618 nregs,
4619 NULL);
4620 }
4621
4622 /* Given MODE and TYPE of a function argument, return the alignment in
4623 bits. The idea is to suppress any stronger alignment requested by
4624 the user and opt for the natural alignment (specified in AAPCS64 \S
4625 4.1). ABI_BREAK is set to true if the alignment was incorrectly
4626 calculated in versions of GCC prior to GCC-9. This is a helper
4627 function for local use only. */
4628
4629 static unsigned int
4630 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
4631 bool *abi_break)
4632 {
4633 *abi_break = false;
4634 if (!type)
4635 return GET_MODE_ALIGNMENT (mode);
4636
4637 if (integer_zerop (TYPE_SIZE (type)))
4638 return 0;
4639
4640 gcc_assert (TYPE_MODE (type) == mode);
4641
4642 if (!AGGREGATE_TYPE_P (type))
4643 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
4644
4645 if (TREE_CODE (type) == ARRAY_TYPE)
4646 return TYPE_ALIGN (TREE_TYPE (type));
4647
4648 unsigned int alignment = 0;
4649 unsigned int bitfield_alignment = 0;
4650 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
4651 if (TREE_CODE (field) == FIELD_DECL)
4652 {
4653 alignment = std::max (alignment, DECL_ALIGN (field));
4654 if (DECL_BIT_FIELD_TYPE (field))
4655 bitfield_alignment
4656 = std::max (bitfield_alignment,
4657 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
4658 }
4659
4660 if (bitfield_alignment > alignment)
4661 {
4662 *abi_break = true;
4663 return bitfield_alignment;
4664 }
4665
4666 return alignment;
4667 }
4668
4669 /* Layout a function argument according to the AAPCS64 rules. The rule
4670 numbers refer to the rule numbers in the AAPCS64. */
4671
4672 static void
4673 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
4674 const_tree type,
4675 bool named ATTRIBUTE_UNUSED)
4676 {
4677 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4678 int ncrn, nvrn, nregs;
4679 bool allocate_ncrn, allocate_nvrn;
4680 HOST_WIDE_INT size;
4681 bool abi_break;
4682
4683 /* We need to do this once per argument. */
4684 if (pcum->aapcs_arg_processed)
4685 return;
4686
4687 pcum->aapcs_arg_processed = true;
4688
4689 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
4690 if (type)
4691 size = int_size_in_bytes (type);
4692 else
4693 /* No frontends can create types with variable-sized modes, so we
4694 shouldn't be asked to pass or return them. */
4695 size = GET_MODE_SIZE (mode).to_constant ();
4696 size = ROUND_UP (size, UNITS_PER_WORD);
4697
4698 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
4699 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
4700 mode,
4701 type,
4702 &nregs);
4703
4704 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
4705 The following code thus handles passing by SIMD/FP registers first. */
4706
4707 nvrn = pcum->aapcs_nvrn;
4708
4709 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
4710 and homogeneous short-vector aggregates (HVA). */
4711 if (allocate_nvrn)
4712 {
4713 if (!TARGET_FLOAT)
4714 aarch64_err_no_fpadvsimd (mode);
4715
4716 if (nvrn + nregs <= NUM_FP_ARG_REGS)
4717 {
4718 pcum->aapcs_nextnvrn = nvrn + nregs;
4719 if (!aarch64_composite_type_p (type, mode))
4720 {
4721 gcc_assert (nregs == 1);
4722 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
4723 }
4724 else
4725 {
4726 rtx par;
4727 int i;
4728 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4729 for (i = 0; i < nregs; i++)
4730 {
4731 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
4732 V0_REGNUM + nvrn + i);
4733 rtx offset = gen_int_mode
4734 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
4735 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4736 XVECEXP (par, 0, i) = tmp;
4737 }
4738 pcum->aapcs_reg = par;
4739 }
4740 return;
4741 }
4742 else
4743 {
4744 /* C.3 NSRN is set to 8. */
4745 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
4746 goto on_stack;
4747 }
4748 }
4749
4750 ncrn = pcum->aapcs_ncrn;
4751 nregs = size / UNITS_PER_WORD;
4752
4753 /* C6 - C9, though the sign and zero extension semantics are
4754 handled elsewhere. This is the case where the argument fits
4755 entirely in general registers. */
4756 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
4757 {
4758 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
4759
4760 /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
4761 rounded up to the next even number. */
4762 if (nregs == 2
4763 && ncrn % 2
4764 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
4765 comparison is there because for > 16 * BITS_PER_UNIT
4766 alignment nregs should be > 2 and therefore it should be
4767 passed by reference rather than value. */
4768 && (aarch64_function_arg_alignment (mode, type, &abi_break)
4769 == 16 * BITS_PER_UNIT))
4770 {
4771 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4772 inform (input_location, "parameter passing for argument of type "
4773 "%qT changed in GCC 9.1", type);
4774 ++ncrn;
4775 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
4776 }
4777
4778 /* NREGS can be 0 when e.g. an empty structure is to be passed.
4779 A reg is still generated for it, but the caller should be smart
4780 enough not to use it. */
4781 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
4782 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
4783 else
4784 {
4785 rtx par;
4786 int i;
4787
4788 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4789 for (i = 0; i < nregs; i++)
4790 {
4791 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
4792 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
4793 GEN_INT (i * UNITS_PER_WORD));
4794 XVECEXP (par, 0, i) = tmp;
4795 }
4796 pcum->aapcs_reg = par;
4797 }
4798
4799 pcum->aapcs_nextncrn = ncrn + nregs;
4800 return;
4801 }
4802
4803 /* C.11 */
4804 pcum->aapcs_nextncrn = NUM_ARG_REGS;
4805
4806 /* The argument is passed on stack; record the needed number of words for
4807 this argument and align the total size if necessary. */
4808 on_stack:
4809 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
4810
4811 if (aarch64_function_arg_alignment (mode, type, &abi_break)
4812 == 16 * BITS_PER_UNIT)
4813 {
4814 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
4815 if (pcum->aapcs_stack_size != new_size)
4816 {
4817 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4818 inform (input_location, "parameter passing for argument of type "
4819 "%qT changed in GCC 9.1", type);
4820 pcum->aapcs_stack_size = new_size;
4821 }
4822 }
4823 return;
4824 }
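
/* Editor's example -- illustrative only, not part of the original
   sources.  Rule C.8 above in practice: for

	void f (int x, __int128 y);

   'y' requires 16-byte alignment, so if 'x' occupies w0 then 'y' is
   passed in the even-numbered pair x2/x3 rather than x1/x2; the NGRN
   is rounded up before the two registers are allocated.  */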
4825
4826 /* Implement TARGET_FUNCTION_ARG. */
4827
4828 static rtx
4829 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
4830 {
4831 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4832 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
4833
4834 if (arg.end_marker_p ())
4835 return NULL_RTX;
4836
4837 aarch64_layout_arg (pcum_v, arg.mode, arg.type, arg.named);
4838 return pcum->aapcs_reg;
4839 }
4840
4841 void
4842 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
4843 const_tree fntype ATTRIBUTE_UNUSED,
4844 rtx libname ATTRIBUTE_UNUSED,
4845 const_tree fndecl ATTRIBUTE_UNUSED,
4846 unsigned n_named ATTRIBUTE_UNUSED)
4847 {
4848 pcum->aapcs_ncrn = 0;
4849 pcum->aapcs_nvrn = 0;
4850 pcum->aapcs_nextncrn = 0;
4851 pcum->aapcs_nextnvrn = 0;
4852 pcum->pcs_variant = ARM_PCS_AAPCS64;
4853 pcum->aapcs_reg = NULL_RTX;
4854 pcum->aapcs_arg_processed = false;
4855 pcum->aapcs_stack_words = 0;
4856 pcum->aapcs_stack_size = 0;
4857
4858 if (!TARGET_FLOAT
4859 && fndecl && TREE_PUBLIC (fndecl)
4860 && fntype && fntype != error_mark_node)
4861 {
4862 const_tree type = TREE_TYPE (fntype);
4863 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
4864 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
4865 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4866 &mode, &nregs, NULL))
4867 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4868 }
4869 return;
4870 }
4871
4872 static void
4873 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4874 const function_arg_info &arg)
4875 {
4876 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4877 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
4878 {
4879 aarch64_layout_arg (pcum_v, arg.mode, arg.type, arg.named);
4880 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4881 != (pcum->aapcs_stack_words != 0));
4882 pcum->aapcs_arg_processed = false;
4883 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4884 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4885 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4886 pcum->aapcs_stack_words = 0;
4887 pcum->aapcs_reg = NULL_RTX;
4888 }
4889 }
4890
4891 bool
4892 aarch64_function_arg_regno_p (unsigned regno)
4893 {
4894 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4895 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4896 }
4897
4898 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4899 PARM_BOUNDARY bits of alignment, but will be given anything up
4900 to STACK_BOUNDARY bits if the type requires it. This makes sure
4901 that both before and after the layout of each argument, the Next
4902 Stacked Argument Address (NSAA) will have a minimum alignment of
4903 8 bytes. */
4904
4905 static unsigned int
4906 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4907 {
4908 bool abi_break;
4909 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4910 &abi_break);
4911 if (abi_break && warn_psabi)
4912 inform (input_location, "parameter passing for argument of type "
4913 "%qT changed in GCC 9.1", type);
4914
4915 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4916 }
4917
4918 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4919
4920 static fixed_size_mode
4921 aarch64_get_reg_raw_mode (int regno)
4922 {
4923 if (TARGET_SVE && FP_REGNUM_P (regno))
4924 /* Don't use the SVE part of the register for __builtin_apply and
4925 __builtin_return. The SVE registers aren't used by the normal PCS,
4926 so using them there would be a waste of time. The PCS extensions
4927 for SVE types are fundamentally incompatible with the
4928 __builtin_return/__builtin_apply interface. */
4929 return as_a <fixed_size_mode> (V16QImode);
4930 return default_get_reg_raw_mode (regno);
4931 }
4932
4933 /* Implement TARGET_FUNCTION_ARG_PADDING.
4934
4935 Small aggregate types are placed in the lowest memory address.
4936
4937 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4938
4939 static pad_direction
4940 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4941 {
4942 /* On little-endian targets, the least significant byte of every stack
4943 argument is passed at the lowest byte address of the stack slot. */
4944 if (!BYTES_BIG_ENDIAN)
4945 return PAD_UPWARD;
4946
4947 /* Otherwise, integral, floating-point and pointer types are padded downward:
4948 the least significant byte of a stack argument is passed at the highest
4949 byte address of the stack slot. */
4950 if (type
4951 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4952 || POINTER_TYPE_P (type))
4953 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4954 return PAD_DOWNWARD;
4955
4956 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4957 return PAD_UPWARD;
4958 }
4959
4960 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4961
4962 It specifies padding for the last (and possibly only)
4963 element of a block move between registers and memory. Assuming
4964 the block is in memory, padding upward means that the last
4965 element is padded after its most significant byte, while in
4966 downward padding the last element is padded on its least
4967 significant byte side.
4968
4969 Small aggregates and small complex types are always padded
4970 upwards.
4971
4972 We don't need to worry about homogeneous floating-point or
4973 short-vector aggregates; their move is not affected by the
4974 padding direction determined here. Regardless of endianness,
4975 each element of such an aggregate is put in the least
4976 significant bits of a fp/simd register.
4977
4978 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4979 register has useful data, and return the opposite if the most
4980 significant byte does. */
4981
4982 bool
4983 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4984 bool first ATTRIBUTE_UNUSED)
4985 {
4986
4987 /* Small composite types are always padded upward. */
4988 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4989 {
4990 HOST_WIDE_INT size;
4991 if (type)
4992 size = int_size_in_bytes (type);
4993 else
4994 /* No frontends can create types with variable-sized modes, so we
4995 shouldn't be asked to pass or return them. */
4996 size = GET_MODE_SIZE (mode).to_constant ();
4997 if (size < 2 * UNITS_PER_WORD)
4998 return true;
4999 }
5000
5001 /* Otherwise, use the default padding. */
5002 return !BYTES_BIG_ENDIAN;
5003 }
5004
5005 static scalar_int_mode
5006 aarch64_libgcc_cmp_return_mode (void)
5007 {
5008 return SImode;
5009 }
5010
5011 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
5012
5013 /* We use the 12-bit shifted immediate arithmetic instructions so values
5014 must be a multiple of (1 << 12), i.e. 4096. */
5015 #define ARITH_FACTOR 4096
5016
5017 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
5018 #error Cannot use simple address calculation for stack probing
5019 #endif
5020
5021 /* The pair of scratch registers used for stack probing. */
5022 #define PROBE_STACK_FIRST_REG R9_REGNUM
5023 #define PROBE_STACK_SECOND_REG R10_REGNUM
5024
5025 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
5026 inclusive. These are offsets from the current stack pointer. */
5027
5028 static void
5029 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
5030 {
5031 HOST_WIDE_INT size;
5032 if (!poly_size.is_constant (&size))
5033 {
5034 sorry ("stack probes for SVE frames");
5035 return;
5036 }
5037
5038 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
5039
5040 /* See the same assertion on PROBE_INTERVAL above. */
5041 gcc_assert ((first % ARITH_FACTOR) == 0);
5042
5043 /* See if we have a constant small number of probes to generate. If so,
5044 that's the easy case. */
5045 if (size <= PROBE_INTERVAL)
5046 {
5047 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
5048
5049 emit_set_insn (reg1,
5050 plus_constant (Pmode,
5051 stack_pointer_rtx, -(first + base)));
5052 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
5053 }
5054
5055 /* The run-time loop is made up of 8 insns in the generic case while the
5056 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
5057 else if (size <= 4 * PROBE_INTERVAL)
5058 {
5059 HOST_WIDE_INT i, rem;
5060
5061 emit_set_insn (reg1,
5062 plus_constant (Pmode,
5063 stack_pointer_rtx,
5064 -(first + PROBE_INTERVAL)));
5065 emit_stack_probe (reg1);
5066
5067 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
5068 it exceeds SIZE. If only two probes are needed, this will not
5069 generate any code. Then probe at FIRST + SIZE. */
5070 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
5071 {
5072 emit_set_insn (reg1,
5073 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
5074 emit_stack_probe (reg1);
5075 }
5076
5077 rem = size - (i - PROBE_INTERVAL);
5078 if (rem > 256)
5079 {
5080 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5081
5082 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
5083 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
5084 }
5085 else
5086 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
5087 }
5088
5089 /* Otherwise, do the same as above, but in a loop. Note that we must be
5090 extra careful with variables wrapping around because we might be at
5091 the very top (or the very bottom) of the address space and we have
5092 to be able to handle this case properly; in particular, we use an
5093 equality test for the loop condition. */
5094 else
5095 {
5096 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
5097
5098 /* Step 1: round SIZE to the previous multiple of the interval. */
5099
5100 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
5101
5102
5103 /* Step 2: compute initial and final value of the loop counter. */
5104
5105 /* TEST_ADDR = SP + FIRST. */
5106 emit_set_insn (reg1,
5107 plus_constant (Pmode, stack_pointer_rtx, -first));
5108
5109 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
5110 HOST_WIDE_INT adjustment = - (first + rounded_size);
5111 if (! aarch64_uimm12_shift (adjustment))
5112 {
5113 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
5114 true, Pmode);
5115 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
5116 }
5117 else
5118 emit_set_insn (reg2,
5119 plus_constant (Pmode, stack_pointer_rtx, adjustment));
5120
5121 /* Step 3: the loop
5122
5123 do
5124 {
5125 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
5126 probe at TEST_ADDR
5127 }
5128 while (TEST_ADDR != LAST_ADDR)
5129
5130 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
5131 until it is equal to ROUNDED_SIZE. */
5132
5133 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
5134
5135
5136 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
5137 that SIZE is equal to ROUNDED_SIZE. */
5138
5139 if (size != rounded_size)
5140 {
5141 HOST_WIDE_INT rem = size - rounded_size;
5142
5143 if (rem > 256)
5144 {
5145 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5146
5147 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
5148 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
5149 }
5150 else
5151 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
5152 }
5153 }
5154
5155 /* Make sure nothing is scheduled before we are done. */
5156 emit_insn (gen_blockage ());
5157 }
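
/* Editor's note -- a worked example, not part of the original sources:
   with PROBE_INTERVAL == 4096, FIRST == 0 and SIZE == 10000 the function
   above takes the unrolled branch: it probes at offsets 4096 and 8192
   below the stack pointer, and because the remainder of 1808 bytes is
   larger than 256 it then steps the probe register down by a further
   4096 and probes 2288 bytes above it, i.e. exactly at offset 10000.  */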
5158
5159 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
5160 absolute addresses. */
5161
5162 const char *
5163 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
5164 {
5165 static int labelno = 0;
5166 char loop_lab[32];
5167 rtx xops[2];
5168
5169 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
5170
5171 /* Loop. */
5172 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
5173
5174 HOST_WIDE_INT stack_clash_probe_interval
5175 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5176
5177 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
5178 xops[0] = reg1;
5179 HOST_WIDE_INT interval;
5180 if (flag_stack_clash_protection)
5181 interval = stack_clash_probe_interval;
5182 else
5183 interval = PROBE_INTERVAL;
5184
5185 gcc_assert (aarch64_uimm12_shift (interval));
5186 xops[1] = GEN_INT (interval);
5187
5188 output_asm_insn ("sub\t%0, %0, %1", xops);
5189
5190 /* If doing stack clash protection then we probe up by the ABI specified
5191 amount. We do this because we're dropping full pages at a time in the
5192 loop. But if we're doing non-stack clash probing, probe at SP 0. */
5193 if (flag_stack_clash_protection)
5194 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
5195 else
5196 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
5197
5198 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
5199 by this amount for each iteration. */
5200 output_asm_insn ("str\txzr, [%0, %1]", xops);
5201
5202 /* Test if TEST_ADDR == LAST_ADDR. */
5203 xops[1] = reg2;
5204 output_asm_insn ("cmp\t%0, %1", xops);
5205
5206 /* Branch. */
5207 fputs ("\tb.ne\t", asm_out_file);
5208 assemble_name_raw (asm_out_file, loop_lab);
5209 fputc ('\n', asm_out_file);
5210
5211 return "";
5212 }
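
/* Editor's sketch -- illustrative only, not part of the original sources:
   with the default 4 KiB probe interval and no stack clash protection,
   the loop emitted above looks roughly like

	.LPSRL0:
		sub	x9, x9, 4096
		str	xzr, [x9, 0]
		cmp	x9, x10
		b.ne	.LPSRL0

   where x9 and x10 are PROBE_STACK_FIRST_REG and PROBE_STACK_SECOND_REG
   as set up by aarch64_emit_probe_stack_range above.  */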
5213
5214 /* Emit the probe loop for doing stack clash probes and stack adjustments for
5215 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
5216 of GUARD_SIZE. When a probe is emitted it is done at most
5217 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
5218 at most MIN_PROBE_THRESHOLD. By the end of this function
5219 BASE = BASE - ADJUSTMENT. */
5220
5221 const char *
5222 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
5223 rtx min_probe_threshold, rtx guard_size)
5224 {
5225 /* This function is not allowed to use any instruction generation function
5226 like gen_ and friends. If you do you'll likely ICE during CFG validation,
5227 so instead emit the code you want using output_asm_insn. */
5228 gcc_assert (flag_stack_clash_protection);
5229 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
5230 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
5231
5232 /* The minimum required allocation before the residual requires probing. */
5233 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
5234
5235 /* Clamp the value down to the nearest value that can be used with a cmp. */
5236 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
5237 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
5238
5239 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
5240 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
5241
5242 static int labelno = 0;
5243 char loop_start_lab[32];
5244 char loop_end_lab[32];
5245 rtx xops[2];
5246
5247 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
5248 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
5249
5250 /* Emit loop start label. */
5251 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
5252
5253 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
5254 xops[0] = adjustment;
5255 xops[1] = probe_offset_value_rtx;
5256 output_asm_insn ("cmp\t%0, %1", xops);
5257
5258 /* Branch to end if not enough adjustment to probe. */
5259 fputs ("\tb.lt\t", asm_out_file);
5260 assemble_name_raw (asm_out_file, loop_end_lab);
5261 fputc ('\n', asm_out_file);
5262
5263 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
5264 xops[0] = base;
5265 xops[1] = probe_offset_value_rtx;
5266 output_asm_insn ("sub\t%0, %0, %1", xops);
5267
5268 /* Probe at BASE. */
5269 xops[1] = const0_rtx;
5270 output_asm_insn ("str\txzr, [%0, %1]", xops);
5271
5272 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
5273 xops[0] = adjustment;
5274 xops[1] = probe_offset_value_rtx;
5275 output_asm_insn ("sub\t%0, %0, %1", xops);
5276
5277 /* Branch to start if still more bytes to allocate. */
5278 fputs ("\tb\t", asm_out_file);
5279 assemble_name_raw (asm_out_file, loop_start_lab);
5280 fputc ('\n', asm_out_file);
5281
5282 /* No probe needed; exit the loop. */
5283 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
5284
5285 /* BASE = BASE - ADJUSTMENT. */
5286 xops[0] = base;
5287 xops[1] = adjustment;
5288 output_asm_insn ("sub\t%0, %0, %1", xops);
5289 return "";
5290 }
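
/* For illustration only: the rough shape of the code emitted above, with
   x10 standing in for BASE, x11 for ADJUSTMENT and GUARD for the clamped
   RESIDUAL_PROBE_GUARD value (the actual registers are whatever the caller
   passed in):

	.SVLPSPL0:
	cmp	x11, GUARD
	b.lt	.SVLPEND0
	sub	x10, x10, GUARD
	str	xzr, [x10, 0]
	sub	x11, x11, GUARD
	b	.SVLPSPL0
	.SVLPEND0:
	sub	x10, x10, x11

   i.e. full GUARD-sized steps are probed in the loop and the sub-GUARD
   remainder is applied unprobed at the end.  */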
5291
5292 /* Determine whether a frame chain needs to be generated. */
5293 static bool
5294 aarch64_needs_frame_chain (void)
5295 {
5296 /* Force a frame chain for EH returns so the return address is at FP+8. */
5297 if (frame_pointer_needed || crtl->calls_eh_return)
5298 return true;
5299
5300 /* A leaf function cannot have calls or write LR. */
5301 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
5302
5303 /* Don't use a frame chain in leaf functions if leaf frame pointers
5304 are disabled. */
5305 if (flag_omit_leaf_frame_pointer && is_leaf)
5306 return false;
5307
5308 return aarch64_use_frame_pointer;
5309 }
5310
5311 /* Mark the registers that need to be saved by the callee and calculate
5312 the size of the callee-saved registers area and frame record (both FP
5313 and LR may be omitted). */
5314 static void
5315 aarch64_layout_frame (void)
5316 {
5317 HOST_WIDE_INT offset = 0;
5318 int regno, last_fp_reg = INVALID_REGNUM;
5319 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5320
5321 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
5322
5323 /* Adjust the outgoing arguments size if required. Keep it in sync with what
5324 the mid-end is doing. */
5325 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
5326
5327 #define SLOT_NOT_REQUIRED (-2)
5328 #define SLOT_REQUIRED (-1)
5329
5330 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
5331 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
5332
5333 /* If this is a non-leaf simd function with calls we assume that
5334 at least one of those calls is to a non-simd function and thus
5335 we must save V8 to V23 in the prologue. */
5336
5337 if (simd_function && !crtl->is_leaf)
5338 {
5339 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5340 if (FP_SIMD_SAVED_REGNUM_P (regno))
5341 df_set_regs_ever_live (regno, true);
5342 }
5343
5344 /* First mark all the registers that really need to be saved... */
5345 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5346 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5347
5348 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5349 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5350
5351 /* ... that includes the eh data registers (if needed)... */
5352 if (crtl->calls_eh_return)
5353 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
5354 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
5355 = SLOT_REQUIRED;
5356
5357 /* ... and any callee saved register that dataflow says is live. */
5358 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5359 if (df_regs_ever_live_p (regno)
5360 && (regno == R30_REGNUM
5361 || !call_used_or_fixed_reg_p (regno)))
5362 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5363
5364 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5365 if (df_regs_ever_live_p (regno)
5366 && (!call_used_or_fixed_reg_p (regno)
5367 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
5368 {
5369 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5370 last_fp_reg = regno;
5371 }
5372
5373 if (cfun->machine->frame.emit_frame_chain)
5374 {
5375 /* FP and LR are placed in the linkage record. */
5376 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
5377 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
5378 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
5379 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
5380 offset = 2 * UNITS_PER_WORD;
5381 }
5382
5383 /* With stack-clash, LR must be saved in non-leaf functions. */
5384 gcc_assert (crtl->is_leaf
5385 || (cfun->machine->frame.reg_offset[R30_REGNUM]
5386 != SLOT_NOT_REQUIRED));
5387
5388 /* Now assign stack slots for them. */
5389 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5390 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5391 {
5392 cfun->machine->frame.reg_offset[regno] = offset;
5393 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5394 cfun->machine->frame.wb_candidate1 = regno;
5395 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
5396 cfun->machine->frame.wb_candidate2 = regno;
5397 offset += UNITS_PER_WORD;
5398 }
5399
5400 HOST_WIDE_INT max_int_offset = offset;
5401 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5402 bool has_align_gap = offset != max_int_offset;
5403
5404 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5405 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5406 {
5407 /* If there is an alignment gap between integer and fp callee-saves,
5408 allocate the last fp register to it if possible. */
5409 if (regno == last_fp_reg
5410 && has_align_gap
5411 && !simd_function
5412 && (offset & 8) == 0)
5413 {
5414 cfun->machine->frame.reg_offset[regno] = max_int_offset;
5415 break;
5416 }
5417
5418 cfun->machine->frame.reg_offset[regno] = offset;
5419 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5420 cfun->machine->frame.wb_candidate1 = regno;
5421 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
5422 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
5423 cfun->machine->frame.wb_candidate2 = regno;
5424 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
5425 }
5426
5427 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5428
5429 cfun->machine->frame.saved_regs_size = offset;
5430
5431 HOST_WIDE_INT varargs_and_saved_regs_size
5432 = offset + cfun->machine->frame.saved_varargs_size;
5433
5434 cfun->machine->frame.hard_fp_offset
5435 = aligned_upper_bound (varargs_and_saved_regs_size
5436 + get_frame_size (),
5437 STACK_BOUNDARY / BITS_PER_UNIT);
5438
5439 /* Both these values are already aligned. */
5440 gcc_assert (multiple_p (crtl->outgoing_args_size,
5441 STACK_BOUNDARY / BITS_PER_UNIT));
5442 cfun->machine->frame.frame_size
5443 = (cfun->machine->frame.hard_fp_offset
5444 + crtl->outgoing_args_size);
5445
5446 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
5447
5448 cfun->machine->frame.initial_adjust = 0;
5449 cfun->machine->frame.final_adjust = 0;
5450 cfun->machine->frame.callee_adjust = 0;
5451 cfun->machine->frame.callee_offset = 0;
5452
5453 HOST_WIDE_INT max_push_offset = 0;
5454 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
5455 max_push_offset = 512;
5456 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
5457 max_push_offset = 256;
5458
5459 HOST_WIDE_INT const_size, const_fp_offset;
5460 if (cfun->machine->frame.frame_size.is_constant (&const_size)
5461 && const_size < max_push_offset
5462 && known_eq (crtl->outgoing_args_size, 0))
5463 {
5464 /* Simple, small frame with no outgoing arguments:
5465 stp reg1, reg2, [sp, -frame_size]!
5466 stp reg3, reg4, [sp, 16] */
5467 cfun->machine->frame.callee_adjust = const_size;
5468 }
5469 else if (known_lt (crtl->outgoing_args_size
5470 + cfun->machine->frame.saved_regs_size, 512)
5471 && !(cfun->calls_alloca
5472 && known_lt (cfun->machine->frame.hard_fp_offset,
5473 max_push_offset)))
5474 {
5475 /* Frame with small outgoing arguments:
5476 sub sp, sp, frame_size
5477 stp reg1, reg2, [sp, outgoing_args_size]
5478 stp reg3, reg4, [sp, outgoing_args_size + 16] */
5479 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
5480 cfun->machine->frame.callee_offset
5481 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
5482 }
5483 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
5484 && const_fp_offset < max_push_offset)
5485 {
5486 /* Frame with large outgoing arguments but a small local area:
5487 stp reg1, reg2, [sp, -hard_fp_offset]!
5488 stp reg3, reg4, [sp, 16]
5489 sub sp, sp, outgoing_args_size */
5490 cfun->machine->frame.callee_adjust = const_fp_offset;
5491 cfun->machine->frame.final_adjust
5492 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
5493 }
5494 else
5495 {
5496 /* Frame with large local area and outgoing arguments using frame pointer:
5497 sub sp, sp, hard_fp_offset
5498 stp x29, x30, [sp, 0]
5499 add x29, sp, 0
5500 stp reg3, reg4, [sp, 16]
5501 sub sp, sp, outgoing_args_size */
5502 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
5503 cfun->machine->frame.final_adjust
5504 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
5505 }
5506
5507 cfun->machine->frame.laid_out = true;
5508 }
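
/* A worked example of the layout above (illustrative only): a function that
   needs a frame chain, saves x19 and x20, has 16 bytes of locals and no
   outgoing arguments gets reg_offset[x29] = 0, reg_offset[x30] = 8,
   reg_offset[x19] = 16, reg_offset[x20] = 24, saved_regs_size = 32,
   hard_fp_offset = 48 and frame_size = 48.  Since 48 < max_push_offset and
   there are no outgoing arguments, callee_adjust = 48 and the prologue is
   expected to look roughly like:

	stp	x29, x30, [sp, -48]!
	mov	x29, sp
	stp	x19, x20, [sp, 16]

   with the 16 bytes of locals living at [sp, 32].  */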
5509
5510 /* Return true if the register REGNO is saved on entry to
5511 the current function. */
5512
5513 static bool
5514 aarch64_register_saved_on_entry (int regno)
5515 {
5516 return cfun->machine->frame.reg_offset[regno] >= 0;
5517 }
5518
5519 /* Return the next register, from REGNO up to LIMIT, that the callee
5520 needs to save. */
5521
5522 static unsigned
5523 aarch64_next_callee_save (unsigned regno, unsigned limit)
5524 {
5525 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
5526 regno ++;
5527 return regno;
5528 }
5529
5530 /* Push the register number REGNO of mode MODE to the stack with write-back
5531 adjusting the stack by ADJUSTMENT. */
5532
5533 static void
5534 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
5535 HOST_WIDE_INT adjustment)
5536 {
5537 rtx base_rtx = stack_pointer_rtx;
5538 rtx insn, reg, mem;
5539
5540 reg = gen_rtx_REG (mode, regno);
5541 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
5542 plus_constant (Pmode, base_rtx, -adjustment));
5543 mem = gen_frame_mem (mode, mem);
5544
5545 insn = emit_move_insn (mem, reg);
5546 RTX_FRAME_RELATED_P (insn) = 1;
5547 }
5548
5549 /* Generate and return an instruction to store the pair of registers
5550 REG and REG2 of mode MODE to location BASE with write-back adjusting
5551 the stack location BASE by ADJUSTMENT. */
5552
5553 static rtx
5554 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5555 HOST_WIDE_INT adjustment)
5556 {
5557 switch (mode)
5558 {
5559 case E_DImode:
5560 return gen_storewb_pairdi_di (base, base, reg, reg2,
5561 GEN_INT (-adjustment),
5562 GEN_INT (UNITS_PER_WORD - adjustment));
5563 case E_DFmode:
5564 return gen_storewb_pairdf_di (base, base, reg, reg2,
5565 GEN_INT (-adjustment),
5566 GEN_INT (UNITS_PER_WORD - adjustment));
5567 case E_TFmode:
5568 return gen_storewb_pairtf_di (base, base, reg, reg2,
5569 GEN_INT (-adjustment),
5570 GEN_INT (UNITS_PER_VREG - adjustment));
5571 default:
5572 gcc_unreachable ();
5573 }
5574 }
5575
5576 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
5577 stack pointer by ADJUSTMENT. */
5578
5579 static void
5580 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
5581 {
5582 rtx_insn *insn;
5583 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5584
5585 if (regno2 == INVALID_REGNUM)
5586 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
5587
5588 rtx reg1 = gen_rtx_REG (mode, regno1);
5589 rtx reg2 = gen_rtx_REG (mode, regno2);
5590
5591 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
5592 reg2, adjustment));
5593 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
5594 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5595 RTX_FRAME_RELATED_P (insn) = 1;
5596 }
5597
5598 /* Load the pair of registers REG and REG2 of mode MODE from stack location BASE,
5599 adjusting it by ADJUSTMENT afterwards. */
5600
5601 static rtx
5602 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5603 HOST_WIDE_INT adjustment)
5604 {
5605 switch (mode)
5606 {
5607 case E_DImode:
5608 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
5609 GEN_INT (UNITS_PER_WORD));
5610 case E_DFmode:
5611 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
5612 GEN_INT (UNITS_PER_WORD));
5613 case E_TFmode:
5614 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
5615 GEN_INT (UNITS_PER_VREG));
5616 default:
5617 gcc_unreachable ();
5618 }
5619 }
5620
5621 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
5622 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
5623 into CFI_OPS. */
5624
5625 static void
5626 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
5627 rtx *cfi_ops)
5628 {
5629 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5630 rtx reg1 = gen_rtx_REG (mode, regno1);
5631
5632 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
5633
5634 if (regno2 == INVALID_REGNUM)
5635 {
5636 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
5637 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
5638 emit_move_insn (reg1, gen_frame_mem (mode, mem));
5639 }
5640 else
5641 {
5642 rtx reg2 = gen_rtx_REG (mode, regno2);
5643 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5644 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
5645 reg2, adjustment));
5646 }
5647 }
5648
5649 /* Generate and return a store pair instruction of mode MODE to store
5650 register REG1 to MEM1 and register REG2 to MEM2. */
5651
5652 static rtx
5653 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
5654 rtx reg2)
5655 {
5656 switch (mode)
5657 {
5658 case E_DImode:
5659 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
5660
5661 case E_DFmode:
5662 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
5663
5664 case E_TFmode:
5665 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
5666
5667 default:
5668 gcc_unreachable ();
5669 }
5670 }
5671
5672 /* Generate and return a load pair instruction of mode MODE to load register
5673 REG1 from MEM1 and register REG2 from MEM2. */
5674
5675 static rtx
5676 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
5677 rtx mem2)
5678 {
5679 switch (mode)
5680 {
5681 case E_DImode:
5682 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
5683
5684 case E_DFmode:
5685 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
5686
5687 case E_TFmode:
5688 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
5689
5690 default:
5691 gcc_unreachable ();
5692 }
5693 }
5694
5695 /* Return TRUE if return address signing should be enabled for the current
5696 function, otherwise return FALSE. */
5697
5698 bool
5699 aarch64_return_address_signing_enabled (void)
5700 {
5701 /* This function should only be called after the frame has been laid out. */
5702 gcc_assert (cfun->machine->frame.laid_out);
5703
5704 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
5705 if its LR is pushed onto stack. */
5706 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
5707 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
5708 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
5709 }
5710
5711 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
5712 bool
5713 aarch64_bti_enabled (void)
5714 {
5715 return (aarch64_enable_bti == 1);
5716 }
5717
5718 /* Emit code to save the callee-saved registers from register number START
5719 to LIMIT to the stack at the location starting at offset START_OFFSET,
5720 skipping any write-back candidates if SKIP_WB is true. */
5721
5722 static void
5723 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
5724 unsigned start, unsigned limit, bool skip_wb)
5725 {
5726 rtx_insn *insn;
5727 unsigned regno;
5728 unsigned regno2;
5729
5730 for (regno = aarch64_next_callee_save (start, limit);
5731 regno <= limit;
5732 regno = aarch64_next_callee_save (regno + 1, limit))
5733 {
5734 rtx reg, mem;
5735 poly_int64 offset;
5736 int offset_diff;
5737
5738 if (skip_wb
5739 && (regno == cfun->machine->frame.wb_candidate1
5740 || regno == cfun->machine->frame.wb_candidate2))
5741 continue;
5742
5743 if (cfun->machine->reg_is_wrapped_separately[regno])
5744 continue;
5745
5746 reg = gen_rtx_REG (mode, regno);
5747 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5748 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5749 offset));
5750
5751 regno2 = aarch64_next_callee_save (regno + 1, limit);
5752 offset_diff = cfun->machine->frame.reg_offset[regno2]
5753 - cfun->machine->frame.reg_offset[regno];
5754
5755 if (regno2 <= limit
5756 && !cfun->machine->reg_is_wrapped_separately[regno2]
5757 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5758 {
5759 rtx reg2 = gen_rtx_REG (mode, regno2);
5760 rtx mem2;
5761
5762 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5763 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5764 offset));
5765 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
5766 reg2));
5767
5768 /* The first part of a frame-related parallel insn is
5769 always assumed to be relevant to the frame
5770 calculations; subsequent parts are only
5771 frame-related if explicitly marked. */
5772 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5773 regno = regno2;
5774 }
5775 else
5776 insn = emit_move_insn (mem, reg);
5777
5778 RTX_FRAME_RELATED_P (insn) = 1;
5779 }
5780 }
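
/* For example (illustrative): if x19 and x20 both need saving and occupy
   adjacent 8-byte slots, the loop above emits a single

	stp	x19, x20, [sp, <offset>]

   instead of two separate str instructions.  RTX_FRAME_RELATED_P is set on
   the insn and explicitly on the second element of the parallel, so the
   unwinder records both saves.  */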
5781
5782 /* Emit code to restore the callee registers of mode MODE from register
5783 number START up to and including LIMIT. Restore from the stack offset
5784 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
5785 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
5786
5787 static void
5788 aarch64_restore_callee_saves (machine_mode mode,
5789 poly_int64 start_offset, unsigned start,
5790 unsigned limit, bool skip_wb, rtx *cfi_ops)
5791 {
5792 rtx base_rtx = stack_pointer_rtx;
5793 unsigned regno;
5794 unsigned regno2;
5795 poly_int64 offset;
5796
5797 for (regno = aarch64_next_callee_save (start, limit);
5798 regno <= limit;
5799 regno = aarch64_next_callee_save (regno + 1, limit))
5800 {
5801 if (cfun->machine->reg_is_wrapped_separately[regno])
5802 continue;
5803
5804 rtx reg, mem;
5805 int offset_diff;
5806
5807 if (skip_wb
5808 && (regno == cfun->machine->frame.wb_candidate1
5809 || regno == cfun->machine->frame.wb_candidate2))
5810 continue;
5811
5812 reg = gen_rtx_REG (mode, regno);
5813 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5814 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5815
5816 regno2 = aarch64_next_callee_save (regno + 1, limit);
5817 offset_diff = cfun->machine->frame.reg_offset[regno2]
5818 - cfun->machine->frame.reg_offset[regno];
5819
5820 if (regno2 <= limit
5821 && !cfun->machine->reg_is_wrapped_separately[regno2]
5822 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5823 {
5824 rtx reg2 = gen_rtx_REG (mode, regno2);
5825 rtx mem2;
5826
5827 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5828 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5829 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5830
5831 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5832 regno = regno2;
5833 }
5834 else
5835 emit_move_insn (reg, mem);
5836 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
5837 }
5838 }
5839
5840 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5841 of MODE. */
5842
5843 static inline bool
5844 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5845 {
5846 HOST_WIDE_INT multiple;
5847 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5848 && IN_RANGE (multiple, -8, 7));
5849 }
5850
5851 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5852 of MODE. */
5853
5854 static inline bool
5855 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5856 {
5857 HOST_WIDE_INT multiple;
5858 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5859 && IN_RANGE (multiple, 0, 63));
5860 }
5861
5862 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5863 of MODE. */
5864
5865 bool
5866 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5867 {
5868 HOST_WIDE_INT multiple;
5869 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5870 && IN_RANGE (multiple, -64, 63));
5871 }
5872
5873 /* Return true if OFFSET is a signed 9-bit value. */
5874
5875 bool
5876 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5877 poly_int64 offset)
5878 {
5879 HOST_WIDE_INT const_offset;
5880 return (offset.is_constant (&const_offset)
5881 && IN_RANGE (const_offset, -256, 255));
5882 }
5883
5884 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5885 of MODE. */
5886
5887 static inline bool
5888 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5889 {
5890 HOST_WIDE_INT multiple;
5891 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5892 && IN_RANGE (multiple, -256, 255));
5893 }
5894
5895 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5896 of MODE. */
5897
5898 static inline bool
5899 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5900 {
5901 HOST_WIDE_INT multiple;
5902 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5903 && IN_RANGE (multiple, 0, 4095));
5904 }
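
/* Concrete byte ranges for the predicates above when MODE is DImode
   (8-byte units), for illustration:

     offset_4bit_signed_scaled_p:		[-64, 56]
     offset_6bit_unsigned_scaled_p:		[0, 504]
     aarch64_offset_7bit_signed_scaled_p:	[-512, 504]
     aarch64_offset_9bit_signed_unscaled_p:	[-256, 255]
     offset_9bit_signed_scaled_p:		[-2048, 2040]
     offset_12bit_unsigned_scaled_p:		[0, 32760]

   For the scaled predicates the offset must also be a multiple of 8.  */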
5905
5906 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5907
5908 static sbitmap
5909 aarch64_get_separate_components (void)
5910 {
5911 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5912 bitmap_clear (components);
5913
5914 /* The registers we need saved to the frame. */
5915 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5916 if (aarch64_register_saved_on_entry (regno))
5917 {
5918 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5919 if (!frame_pointer_needed)
5920 offset += cfun->machine->frame.frame_size
5921 - cfun->machine->frame.hard_fp_offset;
5922 /* Check that we can access the stack slot of the register with one
5923 direct load with no adjustments needed. */
5924 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5925 bitmap_set_bit (components, regno);
5926 }
5927
5928 /* Don't mess with the hard frame pointer. */
5929 if (frame_pointer_needed)
5930 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5931
5932 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5933 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5934 /* If registers have been chosen to be stored/restored with
5935 writeback don't interfere with them to avoid having to output explicit
5936 stack adjustment instructions. */
5937 if (reg2 != INVALID_REGNUM)
5938 bitmap_clear_bit (components, reg2);
5939 if (reg1 != INVALID_REGNUM)
5940 bitmap_clear_bit (components, reg1);
5941
5942 bitmap_clear_bit (components, LR_REGNUM);
5943 bitmap_clear_bit (components, SP_REGNUM);
5944
5945 return components;
5946 }
5947
5948 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5949
5950 static sbitmap
5951 aarch64_components_for_bb (basic_block bb)
5952 {
5953 bitmap in = DF_LIVE_IN (bb);
5954 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5955 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5956 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5957
5958 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5959 bitmap_clear (components);
5960
5961 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5962 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5963 if ((!call_used_or_fixed_reg_p (regno)
5964 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5965 && (bitmap_bit_p (in, regno)
5966 || bitmap_bit_p (gen, regno)
5967 || bitmap_bit_p (kill, regno)))
5968 {
5969 unsigned regno2, offset, offset2;
5970 bitmap_set_bit (components, regno);
5971
5972 /* If there is a callee-save at an adjacent offset, add it too
5973 to increase the use of LDP/STP. */
5974 offset = cfun->machine->frame.reg_offset[regno];
5975 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5976
5977 if (regno2 <= LAST_SAVED_REGNUM)
5978 {
5979 offset2 = cfun->machine->frame.reg_offset[regno2];
5980 if ((offset & ~8) == (offset2 & ~8))
5981 bitmap_set_bit (components, regno2);
5982 }
5983 }
5984
5985 return components;
5986 }
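
/* For example (illustrative): if x20 is live in the block and its slot is at
   reg_offset 32 (bit 3 clear), the candidate partner is x21; if x21's slot
   is the adjacent one at offset 40, x21 is added to the component set as
   well so that the save/restore can later be emitted as a single stp/ldp
   pair.  */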
5987
5988 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5989 Nothing to do for aarch64. */
5990
5991 static void
5992 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5993 {
5994 }
5995
5996 /* Return the next set bit in BMP from START onwards. Return the total number
5997 of bits in BMP if no set bit is found at or after START. */
5998
5999 static unsigned int
6000 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
6001 {
6002 unsigned int nbits = SBITMAP_SIZE (bmp);
6003 if (start == nbits)
6004 return start;
6005
6006 gcc_assert (start < nbits);
6007 for (unsigned int i = start; i < nbits; i++)
6008 if (bitmap_bit_p (bmp, i))
6009 return i;
6010
6011 return nbits;
6012 }
6013
6014 /* Do the work for aarch64_emit_prologue_components and
6015 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
6016 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
6017 for these components or the epilogue sequence. That is, it determines
6018 whether we should emit stores or loads and what kind of CFA notes to attach
6019 to the insns. Otherwise the logic for the two sequences is very
6020 similar. */
6021
6022 static void
6023 aarch64_process_components (sbitmap components, bool prologue_p)
6024 {
6025 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
6026 ? HARD_FRAME_POINTER_REGNUM
6027 : STACK_POINTER_REGNUM);
6028
6029 unsigned last_regno = SBITMAP_SIZE (components);
6030 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
6031 rtx_insn *insn = NULL;
6032
6033 while (regno != last_regno)
6034 {
6035 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
6036 so DFmode for the vector registers is enough. For simd functions
6037 we want to save the low 128 bits. */
6038 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
6039
6040 rtx reg = gen_rtx_REG (mode, regno);
6041 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6042 if (!frame_pointer_needed)
6043 offset += cfun->machine->frame.frame_size
6044 - cfun->machine->frame.hard_fp_offset;
6045 rtx addr = plus_constant (Pmode, ptr_reg, offset);
6046 rtx mem = gen_frame_mem (mode, addr);
6047
6048 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
6049 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
6050 /* No more registers to handle after REGNO.
6051 Emit a single save/restore and exit. */
6052 if (regno2 == last_regno)
6053 {
6054 insn = emit_insn (set);
6055 RTX_FRAME_RELATED_P (insn) = 1;
6056 if (prologue_p)
6057 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6058 else
6059 add_reg_note (insn, REG_CFA_RESTORE, reg);
6060 break;
6061 }
6062
6063 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
6064 /* The next register is not of the same class or its offset is not
6065 mergeable with the current one into a pair. */
6066 if (!satisfies_constraint_Ump (mem)
6067 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
6068 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
6069 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
6070 GET_MODE_SIZE (mode)))
6071 {
6072 insn = emit_insn (set);
6073 RTX_FRAME_RELATED_P (insn) = 1;
6074 if (prologue_p)
6075 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6076 else
6077 add_reg_note (insn, REG_CFA_RESTORE, reg);
6078
6079 regno = regno2;
6080 continue;
6081 }
6082
6083 /* REGNO2 can be saved/restored in a pair with REGNO. */
6084 rtx reg2 = gen_rtx_REG (mode, regno2);
6085 if (!frame_pointer_needed)
6086 offset2 += cfun->machine->frame.frame_size
6087 - cfun->machine->frame.hard_fp_offset;
6088 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
6089 rtx mem2 = gen_frame_mem (mode, addr2);
6090 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
6091 : gen_rtx_SET (reg2, mem2);
6092
6093 if (prologue_p)
6094 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
6095 else
6096 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
6097
6098 RTX_FRAME_RELATED_P (insn) = 1;
6099 if (prologue_p)
6100 {
6101 add_reg_note (insn, REG_CFA_OFFSET, set);
6102 add_reg_note (insn, REG_CFA_OFFSET, set2);
6103 }
6104 else
6105 {
6106 add_reg_note (insn, REG_CFA_RESTORE, reg);
6107 add_reg_note (insn, REG_CFA_RESTORE, reg2);
6108 }
6109
6110 regno = aarch64_get_next_set_bit (components, regno2 + 1);
6111 }
6112 }
6113
6114 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
6115
6116 static void
6117 aarch64_emit_prologue_components (sbitmap components)
6118 {
6119 aarch64_process_components (components, true);
6120 }
6121
6122 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
6123
6124 static void
6125 aarch64_emit_epilogue_components (sbitmap components)
6126 {
6127 aarch64_process_components (components, false);
6128 }
6129
6130 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
6131
6132 static void
6133 aarch64_set_handled_components (sbitmap components)
6134 {
6135 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6136 if (bitmap_bit_p (components, regno))
6137 cfun->machine->reg_is_wrapped_separately[regno] = true;
6138 }
6139
6140 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
6141 determine the probe offset for alloca. */
6142
6143 static HOST_WIDE_INT
6144 aarch64_stack_clash_protection_alloca_probe_range (void)
6145 {
6146 return STACK_CLASH_CALLER_GUARD;
6147 }
6148
6149
6150 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
6151 registers. If POLY_SIZE is not large enough to require a probe this function
6152 will only adjust the stack. When allocating the stack space
6153 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
6154 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
6155 arguments. If we are then we ensure that any allocation larger than the ABI
6156 defined buffer needs a probe so that the invariant of having a 1KB buffer is
6157 maintained.
6158
6159 We emit barriers after each stack adjustment to prevent optimizations from
6160 breaking the invariant that we never drop the stack more than a page. This
6161 invariant is needed to make it easier to correctly handle asynchronous
6162 events: if we were to allow the stack to be dropped by more than a page
6163 and then issue multiple probes for it, and a signal arrived somewhere in
6164 between, the signal handler would not know the state of the stack and could
6165 make no assumptions about which pages have been probed. */
6166
6167 static void
6168 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
6169 poly_int64 poly_size,
6170 bool frame_related_p,
6171 bool final_adjustment_p)
6172 {
6173 HOST_WIDE_INT guard_size
6174 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6175 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6176 /* When doing the final adjustment for the outgoing argument size we can't
6177 assume that LR was saved at position 0. So subtract its offset from the
6178 ABI safe buffer so that we don't accidentally allow an adjustment that
6179 would result in an allocation larger than the ABI buffer without
6180 probing. */
6181 HOST_WIDE_INT min_probe_threshold
6182 = final_adjustment_p
6183 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
6184 : guard_size - guard_used_by_caller;
6185
6186 poly_int64 frame_size = cfun->machine->frame.frame_size;
6187
6188 /* We should always have a positive probe threshold. */
6189 gcc_assert (min_probe_threshold > 0);
6190
6191 if (flag_stack_clash_protection && !final_adjustment_p)
6192 {
6193 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6194 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6195
6196 if (known_eq (frame_size, 0))
6197 {
6198 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
6199 }
6200 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
6201 && known_lt (final_adjust, guard_used_by_caller))
6202 {
6203 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
6204 }
6205 }
6206
6207 /* If SIZE is not large enough to require probing, just adjust the stack and
6208 exit. */
6209 if (known_lt (poly_size, min_probe_threshold)
6210 || !flag_stack_clash_protection)
6211 {
6212 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
6213 return;
6214 }
6215
6216 HOST_WIDE_INT size;
6217 /* Handle the SVE non-constant case first. */
6218 if (!poly_size.is_constant (&size))
6219 {
6220 if (dump_file)
6221 {
6222 fprintf (dump_file, "Stack clash SVE prologue: ");
6223 print_dec (poly_size, dump_file);
6224 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
6225 }
6226
6227 /* First calculate the amount of bytes we're actually spilling. */
6228 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
6229 poly_size, temp1, temp2, false, true);
6230
6231 rtx_insn *insn = get_last_insn ();
6232
6233 if (frame_related_p)
6234 {
6235 /* This is done to provide unwinding information for the stack
6236 adjustments we're about to do, however to prevent the optimizers
6237 from removing the R11 move and leaving the CFA note (which would be
6238 very wrong) we tie the old and new stack pointer together.
6239 The tie will expand to nothing but the optimizers will not touch
6240 the instruction. */
6241 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6242 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
6243 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
6244
6245 /* We want the CFA independent of the stack pointer for the
6246 duration of the loop. */
6247 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
6248 RTX_FRAME_RELATED_P (insn) = 1;
6249 }
6250
6251 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
6252 rtx guard_const = gen_int_mode (guard_size, Pmode);
6253
6254 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
6255 stack_pointer_rtx, temp1,
6256 probe_const, guard_const));
6257
6258 /* Now reset the CFA register if needed. */
6259 if (frame_related_p)
6260 {
6261 add_reg_note (insn, REG_CFA_DEF_CFA,
6262 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
6263 gen_int_mode (poly_size, Pmode)));
6264 RTX_FRAME_RELATED_P (insn) = 1;
6265 }
6266
6267 return;
6268 }
6269
6270 if (dump_file)
6271 fprintf (dump_file,
6272 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
6273 " bytes, probing will be required.\n", size);
6274
6275 /* Round size to the nearest multiple of guard_size, and calculate the
6276 residual as the difference between the original size and the rounded
6277 size. */
6278 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
6279 HOST_WIDE_INT residual = size - rounded_size;
6280
6281 /* We can handle a small number of allocations/probes inline. Otherwise
6282 punt to a loop. */
6283 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
6284 {
6285 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
6286 {
6287 aarch64_sub_sp (NULL, temp2, guard_size, true);
6288 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6289 guard_used_by_caller));
6290 emit_insn (gen_blockage ());
6291 }
6292 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
6293 }
6294 else
6295 {
6296 /* Compute the ending address. */
6297 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
6298 temp1, NULL, false, true);
6299 rtx_insn *insn = get_last_insn ();
6300
6301 /* For the initial allocation, we don't have a frame pointer
6302 set up, so we always need CFI notes. If we're doing the
6303 final allocation, then we may have a frame pointer, in which
6304 case it is the CFA, otherwise we need CFI notes.
6305
6306 We can determine which allocation we are doing by looking at
6307 the value of FRAME_RELATED_P since the final allocations are not
6308 frame related. */
6309 if (frame_related_p)
6310 {
6311 /* We want the CFA independent of the stack pointer for the
6312 duration of the loop. */
6313 add_reg_note (insn, REG_CFA_DEF_CFA,
6314 plus_constant (Pmode, temp1, rounded_size));
6315 RTX_FRAME_RELATED_P (insn) = 1;
6316 }
6317
6318 /* This allocates and probes the stack. Note that this re-uses some of
6319 the existing Ada stack protection code. However we are guaranteed not
6320 to enter the non loop or residual branches of that code.
6321
6322 The non-loop part won't be entered because if our allocation amount
6323 doesn't require a loop, the case above would handle it.
6324
6325 The residual amount won't be entered because TEMP1 is a multiple of
6326 the allocation size. The residual will always be 0. As such, the only
6327 part we are actually using from that code is the loop setup. The
6328 actual probing is done in aarch64_output_probe_stack_range. */
6329 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
6330 stack_pointer_rtx, temp1));
6331
6332 /* Now reset the CFA register if needed. */
6333 if (frame_related_p)
6334 {
6335 add_reg_note (insn, REG_CFA_DEF_CFA,
6336 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
6337 RTX_FRAME_RELATED_P (insn) = 1;
6338 }
6339
6340 emit_insn (gen_blockage ());
6341 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
6342 }
6343
6344 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
6345 be probed. This maintains the requirement that each page is probed at
6346 least once. For initial probing we probe only if the allocation is
6347 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
6348 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
6349 GUARD_SIZE. This means that for any allocation that is large enough to
6350 trigger a probe here, we'll have at least one, and if an allocation is not
6351 large enough for this code to emit anything for it, the page will have been
6352 probed by the saving of FP/LR, either by this function or by any callees. If
6353 we don't have any callees then we won't have more stack adjustments and so
6354 are still safe. */
6355 if (residual)
6356 {
6357 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
6358 /* If we're doing final adjustments, and we've done any full page
6359 allocations then any residual needs to be probed. */
6360 if (final_adjustment_p && rounded_size != 0)
6361 min_probe_threshold = 0;
6362 /* If doing a small final adjustment, we always probe at offset 0.
6363 This is done to avoid issues when LR is not at position 0 or when
6364 the final adjustment is smaller than the probing offset. */
6365 else if (final_adjustment_p && rounded_size == 0)
6366 residual_probe_offset = 0;
6367
6368 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
6369 if (residual >= min_probe_threshold)
6370 {
6371 if (dump_file)
6372 fprintf (dump_file,
6373 "Stack clash AArch64 prologue residuals: "
6374 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
6375 "\n", residual);
6376
6377 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6378 residual_probe_offset));
6379 emit_insn (gen_blockage ());
6380 }
6381 }
6382 }
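
/* A worked example of the code above (illustrative, assuming the default
   64 KiB guard size, the 1 KiB STACK_CLASH_CALLER_GUARD and an unroll limit
   that permits two pages inline): for a constant initial allocation of
   147456 bytes (144 KiB), rounded_size is 131072 and residual is 16384, so
   the prologue allocation becomes roughly

	sub	sp, sp, 65536
	str	xzr, [sp, 1024]
	sub	sp, sp, 65536
	str	xzr, [sp, 1024]
	sub	sp, sp, 16384

   with no explicit probe for the residual, because 16384 is below the
   64 KiB - 1 KiB threshold and the subsequent FP/LR saves act as the
   implicit probe for that page.  */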
6383
6384 /* Return 1 if the register is used by the epilogue. We need to say the
6385 return register is used, but only after epilogue generation is complete.
6386 Note that in the case of sibcalls, the values "used by the epilogue" are
6387 considered live at the start of the called function.
6388
6389 For SIMD functions we need to return 1 for FP registers that are saved and
6390 restored by a function but are not zero in call_used_regs. If we do not do
6391 this, optimizations may remove the restore of the register. */
6392
6393 int
6394 aarch64_epilogue_uses (int regno)
6395 {
6396 if (epilogue_completed)
6397 {
6398 if (regno == LR_REGNUM)
6399 return 1;
6400 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
6401 return 1;
6402 }
6403 return 0;
6404 }
6405
6406 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6407 is saved at BASE + OFFSET. */
6408
6409 static void
6410 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
6411 rtx base, poly_int64 offset)
6412 {
6413 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
6414 add_reg_note (insn, REG_CFA_EXPRESSION,
6415 gen_rtx_SET (mem, regno_reg_rtx[reg]));
6416 }
6417
6418 /* AArch64 stack frames generated by this compiler look like:
6419
6420 +-------------------------------+
6421 | |
6422 | incoming stack arguments |
6423 | |
6424 +-------------------------------+
6425 | | <-- incoming stack pointer (aligned)
6426 | callee-allocated save area |
6427 | for register varargs |
6428 | |
6429 +-------------------------------+
6430 | local variables | <-- frame_pointer_rtx
6431 | |
6432 +-------------------------------+
6433 | padding | \
6434 +-------------------------------+ |
6435 | callee-saved registers | | frame.saved_regs_size
6436 +-------------------------------+ |
6437 | LR' | |
6438 +-------------------------------+ |
6439 | FP' | / <- hard_frame_pointer_rtx (aligned)
6440 +-------------------------------+
6441 | dynamic allocation |
6442 +-------------------------------+
6443 | padding |
6444 +-------------------------------+
6445 | outgoing stack arguments | <-- arg_pointer
6446 | |
6447 +-------------------------------+
6448 | | <-- stack_pointer_rtx (aligned)
6449
6450 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
6451 but leave frame_pointer_rtx and hard_frame_pointer_rtx
6452 unchanged.
6453
6454 By default for stack-clash we assume the guard is at least 64KB, but this
6455 value is configurable to either 4KB or 64KB. We also force the guard size to
6456 be the same as the probing interval and both values are kept in sync.
6457
6458 With those assumptions the callee can allocate up to 63KB (or 3KB depending
6459 on the guard size) of stack space without probing.
6460
6461 When probing is needed, we emit a probe at the start of the prologue
6462 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
6463
6464 We have to track how much space has been allocated and the only stores
6465 to the stack we track as implicit probes are the FP/LR stores.
6466
6467 For outgoing arguments we probe if the size is larger than 1KB, such that
6468 the ABI specified buffer is maintained for the next callee.
6469
6470 The following registers are reserved during frame layout and should not be
6471 used for any other purpose:
6472
6473 - r11: Used by stack clash protection when SVE is enabled.
6474 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
6475 - r14 and r15: Used for speculation tracking.
6476 - r16(IP0), r17(IP1): Used by indirect tailcalls.
6477 - r30(LR), r29(FP): Used by standard frame layout.
6478
6479 These registers must be avoided in frame layout related code unless the
6480 explicit intention is to interact with one of the features listed above. */
6481
6482 /* Generate the prologue instructions for entry into a function.
6483 Establish the stack frame by decreasing the stack pointer with a
6484 properly calculated size and, if necessary, create a frame record
6485 filled with the values of LR and previous frame pointer. The
6486 current FP is also set up if it is in use. */
6487
6488 void
6489 aarch64_expand_prologue (void)
6490 {
6491 poly_int64 frame_size = cfun->machine->frame.frame_size;
6492 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6493 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6494 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6495 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6496 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6497 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6498 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
6499 rtx_insn *insn;
6500
6501 /* Sign return address for functions. */
6502 if (aarch64_return_address_signing_enabled ())
6503 {
6504 switch (aarch64_ra_sign_key)
6505 {
6506 case AARCH64_KEY_A:
6507 insn = emit_insn (gen_paciasp ());
6508 break;
6509 case AARCH64_KEY_B:
6510 insn = emit_insn (gen_pacibsp ());
6511 break;
6512 default:
6513 gcc_unreachable ();
6514 }
6515 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6516 RTX_FRAME_RELATED_P (insn) = 1;
6517 }
6518
6519 if (flag_stack_usage_info)
6520 current_function_static_stack_size = constant_lower_bound (frame_size);
6521
6522 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6523 {
6524 if (crtl->is_leaf && !cfun->calls_alloca)
6525 {
6526 if (maybe_gt (frame_size, PROBE_INTERVAL)
6527 && maybe_gt (frame_size, get_stack_check_protect ()))
6528 aarch64_emit_probe_stack_range (get_stack_check_protect (),
6529 (frame_size
6530 - get_stack_check_protect ()));
6531 }
6532 else if (maybe_gt (frame_size, 0))
6533 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
6534 }
6535
6536 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6537 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6538
6539 /* In theory we should never have both an initial adjustment
6540 and a callee save adjustment. Verify that is the case since the
6541 code below does not handle it for -fstack-clash-protection. */
6542 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
6543
6544 /* Will only probe if the initial adjustment is larger than the guard
6545 less the amount of the guard reserved for use by the caller's
6546 outgoing args. */
6547 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
6548 true, false);
6549
6550 if (callee_adjust != 0)
6551 aarch64_push_regs (reg1, reg2, callee_adjust);
6552
6553 if (emit_frame_chain)
6554 {
6555 poly_int64 reg_offset = callee_adjust;
6556 if (callee_adjust == 0)
6557 {
6558 reg1 = R29_REGNUM;
6559 reg2 = R30_REGNUM;
6560 reg_offset = callee_offset;
6561 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
6562 }
6563 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
6564 stack_pointer_rtx, callee_offset,
6565 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
6566 if (frame_pointer_needed && !frame_size.is_constant ())
6567 {
6568 /* Variable-sized frames need to describe the save slot
6569 address using DW_CFA_expression rather than DW_CFA_offset.
6570 This means that, without taking further action, the
6571 locations of the registers that we've already saved would
6572 remain based on the stack pointer even after we redefine
6573 the CFA based on the frame pointer. We therefore need new
6574 DW_CFA_expressions to re-express the save slots with addresses
6575 based on the frame pointer. */
6576 rtx_insn *insn = get_last_insn ();
6577 gcc_assert (RTX_FRAME_RELATED_P (insn));
6578
6579 /* Add an explicit CFA definition if this was previously
6580 implicit. */
6581 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
6582 {
6583 rtx src = plus_constant (Pmode, stack_pointer_rtx,
6584 callee_offset);
6585 add_reg_note (insn, REG_CFA_ADJUST_CFA,
6586 gen_rtx_SET (hard_frame_pointer_rtx, src));
6587 }
6588
6589 /* Change the save slot expressions for the registers that
6590 we've already saved. */
6591 reg_offset -= callee_offset;
6592 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
6593 reg_offset + UNITS_PER_WORD);
6594 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
6595 reg_offset);
6596 }
6597 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
6598 }
6599
6600 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6601 callee_adjust != 0 || emit_frame_chain);
6602 if (aarch64_simd_decl_p (cfun->decl))
6603 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6604 callee_adjust != 0 || emit_frame_chain);
6605 else
6606 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6607 callee_adjust != 0 || emit_frame_chain);
6608
6609 /* We may need to probe the final adjustment if it is larger than the guard
6610 that is assumed by the callee. */
6611 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
6612 !frame_pointer_needed, true);
6613 }
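
/* For illustration: with -mbranch-protection=pac-ret (A key) and a frame
   that takes the "simple, small frame" path in aarch64_layout_frame, the
   prologue emitted above is expected to be along the lines of

	paciasp
	stp	x29, x30, [sp, -48]!
	mov	x29, sp
	stp	x19, x20, [sp, 16]

   i.e. the return address is signed first, then the frame record and the
   remaining callee saves are laid down as described above.  */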
6614
6615 /* Return TRUE if we can use a simple_return insn.
6616
6617 This function checks whether the callee-saved register area is empty, which
6618 means no restore actions are needed. The pro_and_epilogue pass uses
6619 this to check whether the shrink-wrapping optimization is feasible. */
6620
6621 bool
6622 aarch64_use_return_insn_p (void)
6623 {
6624 if (!reload_completed)
6625 return false;
6626
6627 if (crtl->profile)
6628 return false;
6629
6630 return known_eq (cfun->machine->frame.frame_size, 0);
6631 }
6632
6633 /* Return false for non-leaf SIMD functions in order to avoid
6634 shrink-wrapping them. Doing this will lose the necessary
6635 save/restore of FP registers. */
6636
6637 bool
6638 aarch64_use_simple_return_insn_p (void)
6639 {
6640 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
6641 return false;
6642
6643 return true;
6644 }
6645
6646 /* Generate the epilogue instructions for returning from a function.
6647 This is almost exactly the reverse of the prolog sequence, except
6648 that we need to insert barriers to avoid scheduling loads that read
6649 from a deallocated stack, and we optimize the unwind records by
6650 emitting them all together if possible. */
6651 void
6652 aarch64_expand_epilogue (bool for_sibcall)
6653 {
6654 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6655 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6656 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6657 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6658 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6659 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6660 rtx cfi_ops = NULL;
6661 rtx_insn *insn;
6662 /* A stack clash protection prologue may not have left EP0_REGNUM or
6663 EP1_REGNUM in a usable state. The same is true for allocations
6664 with an SVE component, since we then need both temporary registers
6665 for each allocation. For stack clash we are in a usable state if
6666 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
6667 HOST_WIDE_INT guard_size
6668 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6669 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6670
6671 /* We can re-use the registers when the allocation amount is smaller than
6672 guard_size - guard_used_by_caller because we won't be doing any probes
6673 then. In such situations the register should remain live with the correct
6674 value. */
6675 bool can_inherit_p = (initial_adjust.is_constant ()
6676 && final_adjust.is_constant ())
6677 && (!flag_stack_clash_protection
6678 || known_lt (initial_adjust,
6679 guard_size - guard_used_by_caller));
6680
6681 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
6682 bool need_barrier_p
6683 = maybe_ne (get_frame_size ()
6684 + cfun->machine->frame.saved_varargs_size, 0);
6685
6686 /* Emit a barrier to prevent loads from a deallocated stack. */
6687 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
6688 || cfun->calls_alloca
6689 || crtl->calls_eh_return)
6690 {
6691 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6692 need_barrier_p = false;
6693 }
6694
6695 /* Restore the stack pointer from the frame pointer if it may not
6696 be the same as the stack pointer. */
6697 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6698 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6699 if (frame_pointer_needed
6700 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
6701 /* If writeback is used when restoring callee-saves, the CFA
6702 is restored on the instruction doing the writeback. */
6703 aarch64_add_offset (Pmode, stack_pointer_rtx,
6704 hard_frame_pointer_rtx, -callee_offset,
6705 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
6706 else
6707 /* The case where we need to re-use the register here is very rare, so
6708 avoid the complicated condition and just always emit a move if the
6709 immediate doesn't fit. */
6710 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
6711
6712 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6713 callee_adjust != 0, &cfi_ops);
6714 if (aarch64_simd_decl_p (cfun->decl))
6715 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6716 callee_adjust != 0, &cfi_ops);
6717 else
6718 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6719 callee_adjust != 0, &cfi_ops);
6720
6721 if (need_barrier_p)
6722 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6723
6724 if (callee_adjust != 0)
6725 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
6726
6727 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
6728 {
6729 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
6730 insn = get_last_insn ();
6731 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
6732 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
6733 RTX_FRAME_RELATED_P (insn) = 1;
6734 cfi_ops = NULL;
6735 }
6736
6737 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
6738 restrict the emit_move optimization to leaf functions. */
6739 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
6740 (!can_inherit_p || !crtl->is_leaf
6741 || df_regs_ever_live_p (EP0_REGNUM)));
6742
6743 if (cfi_ops)
6744 {
6745 /* Emit delayed restores and reset the CFA to be SP. */
6746 insn = get_last_insn ();
6747 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
6748 REG_NOTES (insn) = cfi_ops;
6749 RTX_FRAME_RELATED_P (insn) = 1;
6750 }
6751
6752 /* We prefer to emit the combined return/authenticate instruction RETAA,
6753 however there are three cases in which we must instead emit an explicit
6754 authentication instruction.
6755
6756 1) Sibcalls don't return in a normal way, so if we're about to call one
6757 we must authenticate.
6758
6759 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
6760 generating code for !TARGET_ARMV8_3 we can't use it and must
6761 explicitly authenticate.
6762
6763 3) On an eh_return path we make extra stack adjustments to update the
6764 canonical frame address to be the exception handler's CFA. We want
6765 to authenticate using the CFA of the function which calls eh_return.
6766 */
6767 if (aarch64_return_address_signing_enabled ()
6768 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
6769 {
6770 switch (aarch64_ra_sign_key)
6771 {
6772 case AARCH64_KEY_A:
6773 insn = emit_insn (gen_autiasp ());
6774 break;
6775 case AARCH64_KEY_B:
6776 insn = emit_insn (gen_autibsp ());
6777 break;
6778 default:
6779 gcc_unreachable ();
6780 }
6781 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6782 RTX_FRAME_RELATED_P (insn) = 1;
6783 }
6784
6785 /* Stack adjustment for exception handler. */
6786 if (crtl->calls_eh_return && !for_sibcall)
6787 {
6788 /* We need to unwind the stack by the offset computed by
6789 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
6790 to be SP; letting the CFA move during this adjustment
6791 is just as correct as retaining the CFA from the body
6792 of the function. Therefore, do nothing special. */
6793 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
6794 }
6795
6796 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
6797 if (!for_sibcall)
6798 emit_jump_insn (ret_rtx);
6799 }
6800
6801 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
6802 normally or return to a previous frame after unwinding.
6803
6804 An EH return uses a single shared return sequence. The epilogue is
6805 exactly like a normal epilogue except that it has an extra input
6806 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
6807 that must be applied after the frame has been destroyed. An extra label
6808 is inserted before the epilogue which initializes this register to zero,
6809 and this is the entry point for a normal return.
6810
6811 An actual EH return updates the return address, initializes the stack
6812 adjustment and jumps directly into the epilogue (bypassing the zeroing
6813 of the adjustment). Since the return address is typically saved on the
6814 stack when a function makes a call, the saved LR must be updated outside
6815 the epilogue.
6816
6817 This poses problems as the store is generated well before the epilogue,
6818 so the offset of LR is not known yet. Also optimizations will remove the
6819 store as it appears dead, even after the epilogue is generated (as the
6820 base or offset for loading LR is different in many cases).
6821
6822 To avoid these problems this implementation forces the frame pointer
6823 in eh_return functions so that the location of LR is fixed and known early.
6824 It also marks the store volatile, so no optimization is permitted to
6825 remove the store. */
6826 rtx
6827 aarch64_eh_return_handler_rtx (void)
6828 {
6829 rtx tmp = gen_frame_mem (Pmode,
6830 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
6831
6832 /* Mark the store volatile, so no optimization is permitted to remove it. */
6833 MEM_VOLATILE_P (tmp) = true;
6834 return tmp;
6835 }
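
/* Illustrative note (not from the original sources): assuming the LP64 ABI,
   where Pmode is DImode and UNITS_PER_WORD is 8, the RTX built above is
   effectively

     (mem/v:DI (plus:DI (reg:DI x29) (const_int 8)))

   i.e. the saved-LR slot of the frame record, which eh_return rewrites. */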
6836
6837 /* Output code to add DELTA to the first argument, and then jump
6838 to FUNCTION. Used for C++ multiple inheritance. */
6839 static void
6840 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
6841 HOST_WIDE_INT delta,
6842 HOST_WIDE_INT vcall_offset,
6843 tree function)
6844 {
6845 /* The this pointer is always in x0. Note that this differs from
6846 Arm, where the this pointer may be bumped to r1 if r0 is required
6847 to return a pointer to an aggregate. On AArch64 a result value
6848 pointer will be in x8. */
6849 int this_regno = R0_REGNUM;
6850 rtx this_rtx, temp0, temp1, addr, funexp;
6851 rtx_insn *insn;
6852 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
6853
6854 if (aarch64_bti_enabled ())
6855 emit_insn (gen_bti_c ());
6856
6857 reload_completed = 1;
6858 emit_note (NOTE_INSN_PROLOGUE_END);
6859
6860 this_rtx = gen_rtx_REG (Pmode, this_regno);
6861 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
6862 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
6863
6864 if (vcall_offset == 0)
6865 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
6866 else
6867 {
6868 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
6869
6870 addr = this_rtx;
6871 if (delta != 0)
6872 {
6873 if (delta >= -256 && delta < 256)
6874 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6875 plus_constant (Pmode, this_rtx, delta));
6876 else
6877 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6878 temp1, temp0, false);
6879 }
6880
6881 if (Pmode == ptr_mode)
6882 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6883 else
6884 aarch64_emit_move (temp0,
6885 gen_rtx_ZERO_EXTEND (Pmode,
6886 gen_rtx_MEM (ptr_mode, addr)));
6887
6888 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6889 addr = plus_constant (Pmode, temp0, vcall_offset);
6890 else
6891 {
6892 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6893 Pmode);
6894 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6895 }
6896
6897 if (Pmode == ptr_mode)
6898 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
6899 else
6900 aarch64_emit_move (temp1,
6901 gen_rtx_SIGN_EXTEND (Pmode,
6902 gen_rtx_MEM (ptr_mode, addr)));
6903
6904 emit_insn (gen_add2_insn (this_rtx, temp1));
6905 }
6906
6907 /* Generate a tail call to the target function. */
6908 if (!TREE_USED (function))
6909 {
6910 assemble_external (function);
6911 TREE_USED (function) = 1;
6912 }
6913 funexp = XEXP (DECL_RTL (function), 0);
6914 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6915 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6916 SIBLING_CALL_P (insn) = 1;
6917
6918 insn = get_insns ();
6919 shorten_branches (insn);
6920
6921 assemble_start_function (thunk, fnname);
6922 final_start_function (insn, file, 1);
6923 final (insn, file, 1);
6924 final_end_function ();
6925 assemble_end_function (thunk, fnname);
6926
6927 /* Stop pretending to be a post-reload pass. */
6928 reload_completed = 0;
6929 }
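
/* Rough sketch of the output (illustrative only): for DELTA == 8 and
   VCALL_OFFSET == 0 the code above emits approximately

       add     x0, x0, 8
       b       <function>

   (preceded by "bti c" when branch protection is enabled): the "this"
   pointer in x0 is adjusted and control tail-calls the target. */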
6930
6931 static bool
6932 aarch64_tls_referenced_p (rtx x)
6933 {
6934 if (!TARGET_HAVE_TLS)
6935 return false;
6936 subrtx_iterator::array_type array;
6937 FOR_EACH_SUBRTX (iter, array, x, ALL)
6938 {
6939 const_rtx x = *iter;
6940 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6941 return true;
6942 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6943 TLS offsets, not real symbol references. */
6944 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6945 iter.skip_subrtxes ();
6946 }
6947 return false;
6948 }
6949
6950
6951 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6952 a left shift of 0 or 12 bits. */
6953 bool
6954 aarch64_uimm12_shift (HOST_WIDE_INT val)
6955 {
6956 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6957 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6958 );
6959 }
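
/* Illustrative examples (not from the original sources):

     aarch64_uimm12_shift (0xabc)     -> true   (12 bits, LSL #0)
     aarch64_uimm12_shift (0xabc000)  -> true   (12 bits, LSL #12)
     aarch64_uimm12_shift (0x1abc)    -> false  (set bits span both halves) */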
6960
6961 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
6962 that can be created with a left shift of 0 or 12. */
6963 static HOST_WIDE_INT
6964 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6965 {
6966 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6967 handle correctly. */
6968 gcc_assert ((val & 0xffffff) == val);
6969
6970 if (((val & 0xfff) << 0) == val)
6971 return val;
6972
6973 return val & (0xfff << 12);
6974 }
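
/* For example (illustrative), aarch64_clamp_to_uimm12_shift (0x123456)
   returns 0x123000: the value does not fit in the low 12 bits, so it is
   clamped to the nearest value expressible as a 12-bit immediate shifted
   left by 12. */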
6975
6976 /* Return true if val is an immediate that can be loaded into a
6977 register by a MOVZ instruction. */
6978 static bool
6979 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6980 {
6981 if (GET_MODE_SIZE (mode) > 4)
6982 {
6983 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6984 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6985 return true;
6986 }
6987 else
6988 {
6989 /* Ignore sign extension. */
6990 val &= (HOST_WIDE_INT) 0xffffffff;
6991 }
6992 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6993 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6994 }
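
/* Illustrative DImode examples (not from the original sources):

     0x000000000000abcd  -> true   (MOVZ x, #0xabcd)
     0x00000000abcd0000  -> true   (MOVZ x, #0xabcd, LSL #16)
     0x0000abcd00000000  -> true   (MOVZ x, #0xabcd, LSL #32)
     0x0000000012345678  -> false  (more than one non-zero 16-bit chunk) */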
6995
6996 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6997 64-bit (DImode) integer. */
6998
6999 static unsigned HOST_WIDE_INT
7000 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
7001 {
7002 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
7003 while (size < 64)
7004 {
7005 val &= (HOST_WIDE_INT_1U << size) - 1;
7006 val |= val << size;
7007 size *= 2;
7008 }
7009 return val;
7010 }
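
/* For example (illustrative), replicating the QImode value 0x03 yields
   0x0303030303030303 and replicating the HImode value 0x00ff yields
   0x00ff00ff00ff00ff. */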
7011
7012 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
7013
7014 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
7015 {
7016 0x0000000100000001ull,
7017 0x0001000100010001ull,
7018 0x0101010101010101ull,
7019 0x1111111111111111ull,
7020 0x5555555555555555ull,
7021 };
7022
7023
7024 /* Return true if val is a valid bitmask immediate. */
7025
7026 bool
7027 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
7028 {
7029 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
7030 int bits;
7031
7032 /* Check for a single sequence of one bits and return quickly if so.
7033 The special cases of all ones and all zeroes return false. */
7034 val = aarch64_replicate_bitmask_imm (val_in, mode);
7035 tmp = val + (val & -val);
7036
7037 if (tmp == (tmp & -tmp))
7038 return (val + 1) > 1;
7039
7040 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
7041 if (mode == SImode)
7042 val = (val << 32) | (val & 0xffffffff);
7043
7044 /* Invert if the immediate doesn't start with a zero bit - this means we
7045 only need to search for sequences of one bits. */
7046 if (val & 1)
7047 val = ~val;
7048
7049 /* Find the first set bit and set tmp to val with the first sequence of one
7050 bits removed. Return success if there is a single sequence of ones. */
7051 first_one = val & -val;
7052 tmp = val & (val + first_one);
7053
7054 if (tmp == 0)
7055 return true;
7056
7057 /* Find the next set bit and compute the difference in bit position. */
7058 next_one = tmp & -tmp;
7059 bits = clz_hwi (first_one) - clz_hwi (next_one);
7060 mask = val ^ tmp;
7061
7062 /* Check the bit position difference is a power of 2, and that the first
7063 sequence of one bits fits within 'bits' bits. */
7064 if ((mask >> bits) != 0 || bits != (bits & -bits))
7065 return false;
7066
7067 /* Check the sequence of one bits is repeated 64/bits times. */
7068 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
7069 }
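
/* Worked example (illustrative, DImode): 0x00ff00ff00ff00ff is accepted -
   after inversion to 0xff00ff00ff00ff00 the code finds an 8-bit run of ones
   repeating every 16 bits and the multiplier table confirms the pattern
   fills all 64 bits.  0x0000000000001234 is rejected, since its set bits do
   not form a single run replicated at a power-of-two period. */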
7070
7071 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
7072 Assumed precondition: VAL_IN is not zero. */
7073
7074 unsigned HOST_WIDE_INT
7075 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
7076 {
7077 int lowest_bit_set = ctz_hwi (val_in);
7078 int highest_bit_set = floor_log2 (val_in);
7079 gcc_assert (val_in != 0);
7080
7081 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
7082 (HOST_WIDE_INT_1U << lowest_bit_set));
7083 }
7084
7085 /* Create a constant in which all bits outside the range from the lowest set
7086 bit to the highest set bit of VAL_IN are set to 1. */
7087
7088 unsigned HOST_WIDE_INT
7089 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
7090 {
7091 return val_in | ~aarch64_and_split_imm1 (val_in);
7092 }
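
/* Illustrative example (not from the original sources): for
   VAL_IN == 0x00ff00000000ff00,

     aarch64_and_split_imm1 -> 0x00ffffffffffff00  (ones from bit 8 to bit 55)
     aarch64_and_split_imm2 -> 0xffff00000000ffff

   and imm1 & imm2 == VAL_IN, so an AND with VAL_IN can be performed as two
   ANDs with these single-run / wrap-around bitmask immediates. */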
7093
7094 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
7095
7096 bool
7097 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
7098 {
7099 scalar_int_mode int_mode;
7100 if (!is_a <scalar_int_mode> (mode, &int_mode))
7101 return false;
7102
7103 if (aarch64_bitmask_imm (val_in, int_mode))
7104 return false;
7105
7106 if (aarch64_move_imm (val_in, int_mode))
7107 return false;
7108
7109 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
7110
7111 return aarch64_bitmask_imm (imm2, int_mode);
7112 }
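
/* Continuing the example above (illustrative): 0x00ff00000000ff00 is neither
   a bitmask immediate nor a MOV-able immediate, but both split halves are
   encodable, so this predicate returns true and the AND can be emitted as

     and x0, x0, 0x00ffffffffffff00
     and x0, x0, 0xffff00000000ffff  */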
7113
7114 /* Return true if val is an immediate that can be loaded into a
7115 register in a single instruction. */
7116 bool
7117 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
7118 {
7119 scalar_int_mode int_mode;
7120 if (!is_a <scalar_int_mode> (mode, &int_mode))
7121 return false;
7122
7123 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
7124 return true;
7125 return aarch64_bitmask_imm (val, int_mode);
7126 }
7127
7128 static bool
7129 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
7130 {
7131 rtx base, offset;
7132
7133 if (GET_CODE (x) == HIGH)
7134 return true;
7135
7136 /* There's no way to calculate VL-based values using relocations. */
7137 subrtx_iterator::array_type array;
7138 FOR_EACH_SUBRTX (iter, array, x, ALL)
7139 if (GET_CODE (*iter) == CONST_POLY_INT)
7140 return true;
7141
7142 split_const (x, &base, &offset);
7143 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
7144 {
7145 if (aarch64_classify_symbol (base, INTVAL (offset))
7146 != SYMBOL_FORCE_TO_MEM)
7147 return true;
7148 else
7149 /* Avoid generating a 64-bit relocation in ILP32; leave it
7150 to aarch64_expand_mov_immediate to handle it properly. */
7151 return mode != ptr_mode;
7152 }
7153
7154 return aarch64_tls_referenced_p (x);
7155 }
7156
7157 /* Implement TARGET_CASE_VALUES_THRESHOLD.
7158 The expansion for a table switch is quite expensive due to the number
7159 of instructions, the table lookup and the hard-to-predict indirect jump.
7160 When optimizing for speed at -O3 and above, use the per-core tuning if
7161 set; otherwise use tables for > 16 cases as a tradeoff between size and
7162 performance. When optimizing for size, use the default setting. */
7163
7164 static unsigned int
7165 aarch64_case_values_threshold (void)
7166 {
7167 /* Use the specified limit for the number of cases before using jump
7168 tables at higher optimization levels. */
7169 if (optimize > 2
7170 && selected_cpu->tune->max_case_values != 0)
7171 return selected_cpu->tune->max_case_values;
7172 else
7173 return optimize_size ? default_case_values_threshold () : 17;
7174 }
7175
7176 /* Return true if register REGNO is a valid index register.
7177 STRICT_P is true if REG_OK_STRICT is in effect. */
7178
7179 bool
7180 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
7181 {
7182 if (!HARD_REGISTER_NUM_P (regno))
7183 {
7184 if (!strict_p)
7185 return true;
7186
7187 if (!reg_renumber)
7188 return false;
7189
7190 regno = reg_renumber[regno];
7191 }
7192 return GP_REGNUM_P (regno);
7193 }
7194
7195 /* Return true if register REGNO is a valid base register.
7196 STRICT_P is true if REG_OK_STRICT is in effect. */
7197
7198 bool
7199 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
7200 {
7201 if (!HARD_REGISTER_NUM_P (regno))
7202 {
7203 if (!strict_p)
7204 return true;
7205
7206 if (!reg_renumber)
7207 return false;
7208
7209 regno = reg_renumber[regno];
7210 }
7211
7212 /* The fake registers will be eliminated to either the stack or
7213 hard frame pointer, both of which are usually valid base registers.
7214 Reload deals with the cases where the eliminated form isn't valid. */
7215 return (GP_REGNUM_P (regno)
7216 || regno == SP_REGNUM
7217 || regno == FRAME_POINTER_REGNUM
7218 || regno == ARG_POINTER_REGNUM);
7219 }
7220
7221 /* Return true if X is a valid base register.
7222 STRICT_P is true if REG_OK_STRICT is in effect. */
7223
7224 static bool
7225 aarch64_base_register_rtx_p (rtx x, bool strict_p)
7226 {
7227 if (!strict_p
7228 && GET_CODE (x) == SUBREG
7229 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
7230 x = SUBREG_REG (x);
7231
7232 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
7233 }
7234
7235 /* Return true if address offset X is a valid index for mode MODE. If it is,
7236 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
7237
7238 static bool
7239 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
7240 machine_mode mode, bool strict_p)
7241 {
7242 enum aarch64_address_type type;
7243 rtx index;
7244 int shift;
7245
7246 /* (reg:P) */
7247 if ((REG_P (x) || GET_CODE (x) == SUBREG)
7248 && GET_MODE (x) == Pmode)
7249 {
7250 type = ADDRESS_REG_REG;
7251 index = x;
7252 shift = 0;
7253 }
7254 /* (sign_extend:DI (reg:SI)) */
7255 else if ((GET_CODE (x) == SIGN_EXTEND
7256 || GET_CODE (x) == ZERO_EXTEND)
7257 && GET_MODE (x) == DImode
7258 && GET_MODE (XEXP (x, 0)) == SImode)
7259 {
7260 type = (GET_CODE (x) == SIGN_EXTEND)
7261 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7262 index = XEXP (x, 0);
7263 shift = 0;
7264 }
7265 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
7266 else if (GET_CODE (x) == MULT
7267 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7268 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7269 && GET_MODE (XEXP (x, 0)) == DImode
7270 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7271 && CONST_INT_P (XEXP (x, 1)))
7272 {
7273 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7274 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7275 index = XEXP (XEXP (x, 0), 0);
7276 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7277 }
7278 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
7279 else if (GET_CODE (x) == ASHIFT
7280 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7281 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7282 && GET_MODE (XEXP (x, 0)) == DImode
7283 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7284 && CONST_INT_P (XEXP (x, 1)))
7285 {
7286 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7287 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7288 index = XEXP (XEXP (x, 0), 0);
7289 shift = INTVAL (XEXP (x, 1));
7290 }
7291 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
7292 else if ((GET_CODE (x) == SIGN_EXTRACT
7293 || GET_CODE (x) == ZERO_EXTRACT)
7294 && GET_MODE (x) == DImode
7295 && GET_CODE (XEXP (x, 0)) == MULT
7296 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7297 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7298 {
7299 type = (GET_CODE (x) == SIGN_EXTRACT)
7300 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7301 index = XEXP (XEXP (x, 0), 0);
7302 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7303 if (INTVAL (XEXP (x, 1)) != 32 + shift
7304 || INTVAL (XEXP (x, 2)) != 0)
7305 shift = -1;
7306 }
7307 /* (and:DI (mult:DI (reg:DI) (const_int scale))
7308 (const_int 0xffffffff<<shift)) */
7309 else if (GET_CODE (x) == AND
7310 && GET_MODE (x) == DImode
7311 && GET_CODE (XEXP (x, 0)) == MULT
7312 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7313 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7314 && CONST_INT_P (XEXP (x, 1)))
7315 {
7316 type = ADDRESS_REG_UXTW;
7317 index = XEXP (XEXP (x, 0), 0);
7318 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7319 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7320 shift = -1;
7321 }
7322 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
7323 else if ((GET_CODE (x) == SIGN_EXTRACT
7324 || GET_CODE (x) == ZERO_EXTRACT)
7325 && GET_MODE (x) == DImode
7326 && GET_CODE (XEXP (x, 0)) == ASHIFT
7327 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7328 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7329 {
7330 type = (GET_CODE (x) == SIGN_EXTRACT)
7331 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7332 index = XEXP (XEXP (x, 0), 0);
7333 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7334 if (INTVAL (XEXP (x, 1)) != 32 + shift
7335 || INTVAL (XEXP (x, 2)) != 0)
7336 shift = -1;
7337 }
7338 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
7339 (const_int 0xffffffff<<shift)) */
7340 else if (GET_CODE (x) == AND
7341 && GET_MODE (x) == DImode
7342 && GET_CODE (XEXP (x, 0)) == ASHIFT
7343 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7344 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7345 && CONST_INT_P (XEXP (x, 1)))
7346 {
7347 type = ADDRESS_REG_UXTW;
7348 index = XEXP (XEXP (x, 0), 0);
7349 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7350 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7351 shift = -1;
7352 }
7353 /* (mult:P (reg:P) (const_int scale)) */
7354 else if (GET_CODE (x) == MULT
7355 && GET_MODE (x) == Pmode
7356 && GET_MODE (XEXP (x, 0)) == Pmode
7357 && CONST_INT_P (XEXP (x, 1)))
7358 {
7359 type = ADDRESS_REG_REG;
7360 index = XEXP (x, 0);
7361 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7362 }
7363 /* (ashift:P (reg:P) (const_int shift)) */
7364 else if (GET_CODE (x) == ASHIFT
7365 && GET_MODE (x) == Pmode
7366 && GET_MODE (XEXP (x, 0)) == Pmode
7367 && CONST_INT_P (XEXP (x, 1)))
7368 {
7369 type = ADDRESS_REG_REG;
7370 index = XEXP (x, 0);
7371 shift = INTVAL (XEXP (x, 1));
7372 }
7373 else
7374 return false;
7375
7376 if (!strict_p
7377 && GET_CODE (index) == SUBREG
7378 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
7379 index = SUBREG_REG (index);
7380
7381 if (aarch64_sve_data_mode_p (mode))
7382 {
7383 if (type != ADDRESS_REG_REG
7384 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
7385 return false;
7386 }
7387 else
7388 {
7389 if (shift != 0
7390 && !(IN_RANGE (shift, 1, 3)
7391 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
7392 return false;
7393 }
7394
7395 if (REG_P (index)
7396 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
7397 {
7398 info->type = type;
7399 info->offset = index;
7400 info->shift = shift;
7401 return true;
7402 }
7403
7404 return false;
7405 }
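
/* Illustrative example (not from the original sources): for an SImode access
   whose address is

     (plus:DI (reg:DI x0)
              (mult:DI (sign_extend:DI (reg:SI w1)) (const_int 4)))

   the MULT subexpression is passed here and classified as ADDRESS_REG_SXTW
   with shift == 2, i.e. the "[x0, w1, sxtw #2]" addressing form. */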
7406
7407 /* Return true if MODE is one of the modes for which we
7408 support LDP/STP operations. */
7409
7410 static bool
7411 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
7412 {
7413 return mode == SImode || mode == DImode
7414 || mode == SFmode || mode == DFmode
7415 || (aarch64_vector_mode_supported_p (mode)
7416 && (known_eq (GET_MODE_SIZE (mode), 8)
7417 || (known_eq (GET_MODE_SIZE (mode), 16)
7418 && (aarch64_tune_params.extra_tuning_flags
7419 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
7420 }
7421
7422 /* Return true if REGNO is a virtual pointer register, or an eliminable
7423 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
7424 include stack_pointer or hard_frame_pointer. */
7425 static bool
7426 virt_or_elim_regno_p (unsigned regno)
7427 {
7428 return ((regno >= FIRST_VIRTUAL_REGISTER
7429 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
7430 || regno == FRAME_POINTER_REGNUM
7431 || regno == ARG_POINTER_REGNUM);
7432 }
7433
7434 /* Return true if X is a valid address of type TYPE for machine mode MODE.
7435 If it is, fill in INFO appropriately. STRICT_P is true if
7436 REG_OK_STRICT is in effect. */
7437
7438 bool
7439 aarch64_classify_address (struct aarch64_address_info *info,
7440 rtx x, machine_mode mode, bool strict_p,
7441 aarch64_addr_query_type type)
7442 {
7443 enum rtx_code code = GET_CODE (x);
7444 rtx op0, op1;
7445 poly_int64 offset;
7446
7447 HOST_WIDE_INT const_size;
7448
7449 /* On BE, we use load/store pair for all large int mode load/stores.
7450 TI/TFmode may also use a load/store pair. */
7451 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7452 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
7453 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
7454 || type == ADDR_QUERY_LDP_STP_N
7455 || mode == TImode
7456 || mode == TFmode
7457 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
7458
7459 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode corresponds
7460 to the actual size of the memory being loaded/stored and the mode used for
7461 the address calculation is half of that size. */
7462 if (type == ADDR_QUERY_LDP_STP_N
7463 && known_eq (GET_MODE_SIZE (mode), 16))
7464 mode = DFmode;
7465
7466 bool allow_reg_index_p = (!load_store_pair_p
7467 && (known_lt (GET_MODE_SIZE (mode), 16)
7468 || vec_flags == VEC_ADVSIMD
7469 || vec_flags & VEC_SVE_DATA));
7470
7471 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
7472 [Rn, #offset, MUL VL]. */
7473 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
7474 && (code != REG && code != PLUS))
7475 return false;
7476
7477 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
7478 REG addressing. */
7479 if (advsimd_struct_p
7480 && !BYTES_BIG_ENDIAN
7481 && (code != POST_INC && code != REG))
7482 return false;
7483
7484 gcc_checking_assert (GET_MODE (x) == VOIDmode
7485 || SCALAR_INT_MODE_P (GET_MODE (x)));
7486
7487 switch (code)
7488 {
7489 case REG:
7490 case SUBREG:
7491 info->type = ADDRESS_REG_IMM;
7492 info->base = x;
7493 info->offset = const0_rtx;
7494 info->const_offset = 0;
7495 return aarch64_base_register_rtx_p (x, strict_p);
7496
7497 case PLUS:
7498 op0 = XEXP (x, 0);
7499 op1 = XEXP (x, 1);
7500
7501 if (! strict_p
7502 && REG_P (op0)
7503 && virt_or_elim_regno_p (REGNO (op0))
7504 && poly_int_rtx_p (op1, &offset))
7505 {
7506 info->type = ADDRESS_REG_IMM;
7507 info->base = op0;
7508 info->offset = op1;
7509 info->const_offset = offset;
7510
7511 return true;
7512 }
7513
7514 if (maybe_ne (GET_MODE_SIZE (mode), 0)
7515 && aarch64_base_register_rtx_p (op0, strict_p)
7516 && poly_int_rtx_p (op1, &offset))
7517 {
7518 info->type = ADDRESS_REG_IMM;
7519 info->base = op0;
7520 info->offset = op1;
7521 info->const_offset = offset;
7522
7523 /* TImode and TFmode values are allowed in both pairs of X
7524 registers and individual Q registers. The available
7525 address modes are:
7526 X,X: 7-bit signed scaled offset
7527 Q: 9-bit signed offset
7528 We conservatively require an offset representable in either mode.
7529 When performing the check for pairs of X registers i.e. LDP/STP
7530 pass down DImode since that is the natural size of the LDP/STP
7531 instruction memory accesses. */
7532 if (mode == TImode || mode == TFmode)
7533 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
7534 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7535 || offset_12bit_unsigned_scaled_p (mode, offset)));
7536
7537 /* A 7-bit offset check because OImode will emit an ldp/stp
7538 instruction (only big endian will get here).
7539 For ldp/stp instructions, the offset is scaled for the size of a
7540 single element of the pair. */
7541 if (mode == OImode)
7542 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
7543
7544 /* Three 9/12-bit offset checks because CImode will emit three
7545 ldr/str instructions (only big endian will get here). */
7546 if (mode == CImode)
7547 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7548 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
7549 offset + 32)
7550 || offset_12bit_unsigned_scaled_p (V16QImode,
7551 offset + 32)));
7552
7553 /* Two 7-bit offset checks because XImode will emit two ldp/stp
7554 instructions (only big endian will get here). */
7555 if (mode == XImode)
7556 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7557 && aarch64_offset_7bit_signed_scaled_p (TImode,
7558 offset + 32));
7559
7560 /* Make "m" use the LD1 offset range for SVE data modes, so
7561 that pre-RTL optimizers like ivopts will target that range
7562 instead of the wider LDR/STR range. */
7563 if (vec_flags == VEC_SVE_DATA)
7564 return (type == ADDR_QUERY_M
7565 ? offset_4bit_signed_scaled_p (mode, offset)
7566 : offset_9bit_signed_scaled_p (mode, offset));
7567
7568 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
7569 {
7570 poly_int64 end_offset = (offset
7571 + GET_MODE_SIZE (mode)
7572 - BYTES_PER_SVE_VECTOR);
7573 return (type == ADDR_QUERY_M
7574 ? offset_4bit_signed_scaled_p (mode, offset)
7575 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
7576 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
7577 end_offset)));
7578 }
7579
7580 if (vec_flags == VEC_SVE_PRED)
7581 return offset_9bit_signed_scaled_p (mode, offset);
7582
7583 if (load_store_pair_p)
7584 return ((known_eq (GET_MODE_SIZE (mode), 4)
7585 || known_eq (GET_MODE_SIZE (mode), 8)
7586 || known_eq (GET_MODE_SIZE (mode), 16))
7587 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7588 else
7589 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7590 || offset_12bit_unsigned_scaled_p (mode, offset));
7591 }
7592
7593 if (allow_reg_index_p)
7594 {
7595 /* Look for base + (scaled/extended) index register. */
7596 if (aarch64_base_register_rtx_p (op0, strict_p)
7597 && aarch64_classify_index (info, op1, mode, strict_p))
7598 {
7599 info->base = op0;
7600 return true;
7601 }
7602 if (aarch64_base_register_rtx_p (op1, strict_p)
7603 && aarch64_classify_index (info, op0, mode, strict_p))
7604 {
7605 info->base = op1;
7606 return true;
7607 }
7608 }
7609
7610 return false;
7611
7612 case POST_INC:
7613 case POST_DEC:
7614 case PRE_INC:
7615 case PRE_DEC:
7616 info->type = ADDRESS_REG_WB;
7617 info->base = XEXP (x, 0);
7618 info->offset = NULL_RTX;
7619 return aarch64_base_register_rtx_p (info->base, strict_p);
7620
7621 case POST_MODIFY:
7622 case PRE_MODIFY:
7623 info->type = ADDRESS_REG_WB;
7624 info->base = XEXP (x, 0);
7625 if (GET_CODE (XEXP (x, 1)) == PLUS
7626 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
7627 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
7628 && aarch64_base_register_rtx_p (info->base, strict_p))
7629 {
7630 info->offset = XEXP (XEXP (x, 1), 1);
7631 info->const_offset = offset;
7632
7633 /* TImode and TFmode values are allowed in both pairs of X
7634 registers and individual Q registers. The available
7635 address modes are:
7636 X,X: 7-bit signed scaled offset
7637 Q: 9-bit signed offset
7638 We conservatively require an offset representable in either mode.
7639 */
7640 if (mode == TImode || mode == TFmode)
7641 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
7642 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
7643
7644 if (load_store_pair_p)
7645 return ((known_eq (GET_MODE_SIZE (mode), 4)
7646 || known_eq (GET_MODE_SIZE (mode), 8)
7647 || known_eq (GET_MODE_SIZE (mode), 16))
7648 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7649 else
7650 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
7651 }
7652 return false;
7653
7654 case CONST:
7655 case SYMBOL_REF:
7656 case LABEL_REF:
7657 /* load literal: pc-relative constant pool entry. Only supported
7658 for SI mode or larger. */
7659 info->type = ADDRESS_SYMBOLIC;
7660
7661 if (!load_store_pair_p
7662 && GET_MODE_SIZE (mode).is_constant (&const_size)
7663 && const_size >= 4)
7664 {
7665 rtx sym, addend;
7666
7667 split_const (x, &sym, &addend);
7668 return ((GET_CODE (sym) == LABEL_REF
7669 || (GET_CODE (sym) == SYMBOL_REF
7670 && CONSTANT_POOL_ADDRESS_P (sym)
7671 && aarch64_pcrelative_literal_loads)));
7672 }
7673 return false;
7674
7675 case LO_SUM:
7676 info->type = ADDRESS_LO_SUM;
7677 info->base = XEXP (x, 0);
7678 info->offset = XEXP (x, 1);
7679 if (allow_reg_index_p
7680 && aarch64_base_register_rtx_p (info->base, strict_p))
7681 {
7682 rtx sym, offs;
7683 split_const (info->offset, &sym, &offs);
7684 if (GET_CODE (sym) == SYMBOL_REF
7685 && (aarch64_classify_symbol (sym, INTVAL (offs))
7686 == SYMBOL_SMALL_ABSOLUTE))
7687 {
7688 /* The symbol and offset must be aligned to the access size. */
7689 unsigned int align;
7690
7691 if (CONSTANT_POOL_ADDRESS_P (sym))
7692 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
7693 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
7694 {
7695 tree exp = SYMBOL_REF_DECL (sym);
7696 align = TYPE_ALIGN (TREE_TYPE (exp));
7697 align = aarch64_constant_alignment (exp, align);
7698 }
7699 else if (SYMBOL_REF_DECL (sym))
7700 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
7701 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
7702 && SYMBOL_REF_BLOCK (sym) != NULL)
7703 align = SYMBOL_REF_BLOCK (sym)->alignment;
7704 else
7705 align = BITS_PER_UNIT;
7706
7707 poly_int64 ref_size = GET_MODE_SIZE (mode);
7708 if (known_eq (ref_size, 0))
7709 ref_size = GET_MODE_SIZE (DImode);
7710
7711 return (multiple_p (INTVAL (offs), ref_size)
7712 && multiple_p (align / BITS_PER_UNIT, ref_size));
7713 }
7714 }
7715 return false;
7716
7717 default:
7718 return false;
7719 }
7720 }
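
/* Illustrative summary (not from the original sources): for a plain DImode
   scalar access the PLUS case above accepts, e.g.,

     [x0, #32760]   12-bit unsigned offset, scaled by 8
     [x0, #-256]    9-bit signed unscaled offset (LDUR/STUR form)

   while an offset such as #32768 is rejected and has to be legitimized by
   moving part of the offset into the base register. */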
7721
7722 /* Return true if the address X is valid for a PRFM instruction.
7723 STRICT_P is true if we should do strict checking with
7724 aarch64_classify_address. */
7725
7726 bool
7727 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
7728 {
7729 struct aarch64_address_info addr;
7730
7731 /* PRFM accepts the same addresses as DImode... */
7732 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
7733 if (!res)
7734 return false;
7735
7736 /* ... except writeback forms. */
7737 return addr.type != ADDRESS_REG_WB;
7738 }
7739
7740 bool
7741 aarch64_symbolic_address_p (rtx x)
7742 {
7743 rtx offset;
7744
7745 split_const (x, &x, &offset);
7746 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
7747 }
7748
7749 /* Classify the base of symbolic expression X. */
7750
7751 enum aarch64_symbol_type
7752 aarch64_classify_symbolic_expression (rtx x)
7753 {
7754 rtx offset;
7755
7756 split_const (x, &x, &offset);
7757 return aarch64_classify_symbol (x, INTVAL (offset));
7758 }
7759
7760
7761 /* Return TRUE if X is a legitimate address for accessing memory in
7762 mode MODE. */
7763 static bool
7764 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
7765 {
7766 struct aarch64_address_info addr;
7767
7768 return aarch64_classify_address (&addr, x, mode, strict_p);
7769 }
7770
7771 /* Return TRUE if X is a legitimate address of type TYPE for accessing
7772 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
7773 bool
7774 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
7775 aarch64_addr_query_type type)
7776 {
7777 struct aarch64_address_info addr;
7778
7779 return aarch64_classify_address (&addr, x, mode, strict_p, type);
7780 }
7781
7782 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
7783
7784 static bool
7785 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
7786 poly_int64 orig_offset,
7787 machine_mode mode)
7788 {
7789 HOST_WIDE_INT size;
7790 if (GET_MODE_SIZE (mode).is_constant (&size))
7791 {
7792 HOST_WIDE_INT const_offset, second_offset;
7793
7794 /* A general SVE offset is A * VQ + B. Remove the A component from
7795 coefficient 0 in order to get the constant B. */
7796 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
7797
7798 /* Split an out-of-range address displacement into a base and
7799 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
7800 range otherwise to increase opportunities for sharing the base
7801 address of different sizes. Unaligned accesses use the signed
7802 9-bit range, TImode/TFmode use the intersection of signed
7803 scaled 7-bit and signed 9-bit offset. */
7804 if (mode == TImode || mode == TFmode)
7805 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
7806 else if ((const_offset & (size - 1)) != 0)
7807 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
7808 else
7809 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
7810
7811 if (second_offset == 0 || known_eq (orig_offset, second_offset))
7812 return false;
7813
7814 /* Split the offset into second_offset and the rest. */
7815 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7816 *offset2 = gen_int_mode (second_offset, Pmode);
7817 return true;
7818 }
7819 else
7820 {
7821 /* Get the mode we should use as the basis of the range. For structure
7822 modes this is the mode of one vector. */
7823 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7824 machine_mode step_mode
7825 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
7826
7827 /* Get the "mul vl" multiplier we'd like to use. */
7828 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
7829 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
7830 if (vec_flags & VEC_SVE_DATA)
7831 /* LDR supports a 9-bit range, but the move patterns for
7832 structure modes require all vectors to be in range of the
7833 same base. The simplest way of accommodating that while still
7834 promoting reuse of anchor points between different modes is
7835 to use an 8-bit range unconditionally. */
7836 vnum = ((vnum + 128) & 255) - 128;
7837 else
7838 /* Predicates are only handled singly, so we might as well use
7839 the full range. */
7840 vnum = ((vnum + 256) & 511) - 256;
7841 if (vnum == 0)
7842 return false;
7843
7844 /* Convert the "mul vl" multiplier into a byte offset. */
7845 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
7846 if (known_eq (second_offset, orig_offset))
7847 return false;
7848
7849 /* Split the offset into second_offset and the rest. */
7850 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7851 *offset2 = gen_int_mode (second_offset, Pmode);
7852 return true;
7853 }
7854 }
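
/* Worked example (illustrative): for a DImode access at constant offset
   0x10008 the offset is aligned, so second_offset = 0x10008 & 0x3ffc = 0x8
   and the split produces

     *offset1 = 0x10000   (added to the base; a shareable anchor)
     *offset2 = 0x8       (folded into the load/store itself)  */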
7855
7856 /* Return the binary representation of floating point constant VALUE in INTVAL.
7857 If the value cannot be converted, return false without setting INTVAL.
7858 The conversion is done in the mode of VALUE. */
7859 bool
7860 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
7861 {
7862
7863 /* We make a general exception for 0. */
7864 if (aarch64_float_const_zero_rtx_p (value))
7865 {
7866 *intval = 0;
7867 return true;
7868 }
7869
7870 scalar_float_mode mode;
7871 if (GET_CODE (value) != CONST_DOUBLE
7872 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
7873 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7874 /* Only support up to DF mode. */
7875 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7876 return false;
7877
7878 unsigned HOST_WIDE_INT ival = 0;
7879
7880 long res[2];
7881 real_to_target (res,
7882 CONST_DOUBLE_REAL_VALUE (value),
7883 REAL_MODE_FORMAT (mode));
7884
7885 if (mode == DFmode)
7886 {
7887 int order = BYTES_BIG_ENDIAN ? 1 : 0;
7888 ival = zext_hwi (res[order], 32);
7889 ival |= (zext_hwi (res[1 - order], 32) << 32);
7890 }
7891 else
7892 ival = zext_hwi (res[0], 32);
7893
7894 *intval = ival;
7895 return true;
7896 }
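
/* Illustrative examples (standard IEEE encodings): for the SFmode constant
   1.0 this stores 0x3f800000 in *INTVAL; for the DFmode constant 1.0 it
   stores 0x3ff0000000000000. */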
7897
7898 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7899 single MOV(+MOVK) followed by an FMOV. */
7900 bool
7901 aarch64_float_const_rtx_p (rtx x)
7902 {
7903 machine_mode mode = GET_MODE (x);
7904 if (mode == VOIDmode)
7905 return false;
7906
7907 /* Determine whether it's cheaper to write float constants as
7908 mov/movk pairs than as ldr/adrp pairs. */
7909 unsigned HOST_WIDE_INT ival;
7910
7911 if (GET_CODE (x) == CONST_DOUBLE
7912 && SCALAR_FLOAT_MODE_P (mode)
7913 && aarch64_reinterpret_float_as_int (x, &ival))
7914 {
7915 scalar_int_mode imode = (mode == HFmode
7916 ? SImode
7917 : int_mode_for_mode (mode).require ());
7918 int num_instr = aarch64_internal_mov_immediate
7919 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7920 return num_instr < 3;
7921 }
7922
7923 return false;
7924 }
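
/* For example (illustrative): the DFmode constant 4294967296.0 (bit pattern
   0x41f0000000000000) is out of range for an FMOV immediate, but its bit
   pattern is a single 16-bit chunk, so it can be built with one MOV plus an
   FMOV and the function returns true. */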
7925
7926 /* Return TRUE if rtx X is the immediate constant 0.0. */
7927 bool
7928 aarch64_float_const_zero_rtx_p (rtx x)
7929 {
7930 if (GET_MODE (x) == VOIDmode)
7931 return false;
7932
7933 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7934 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7935 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7936 }
7937
7938 /* Return TRUE if rtx X is an immediate constant that fits in a single
7939 MOVI immediate operation. */
7940 bool
7941 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7942 {
7943 if (!TARGET_SIMD)
7944 return false;
7945
7946 machine_mode vmode;
7947 scalar_int_mode imode;
7948 unsigned HOST_WIDE_INT ival;
7949
7950 if (GET_CODE (x) == CONST_DOUBLE
7951 && SCALAR_FLOAT_MODE_P (mode))
7952 {
7953 if (!aarch64_reinterpret_float_as_int (x, &ival))
7954 return false;
7955
7956 /* We make a general exception for 0. */
7957 if (aarch64_float_const_zero_rtx_p (x))
7958 return true;
7959
7960 imode = int_mode_for_mode (mode).require ();
7961 }
7962 else if (GET_CODE (x) == CONST_INT
7963 && is_a <scalar_int_mode> (mode, &imode))
7964 ival = INTVAL (x);
7965 else
7966 return false;
7967
7968 /* Use a 64-bit mode for everything except DI/DF mode, where we use
7969 a 128-bit vector mode. */
7970 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7971
7972 vmode = aarch64_simd_container_mode (imode, width);
7973 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7974
7975 return aarch64_simd_valid_immediate (v_op, NULL);
7976 }
7977
7978
7979 /* Return the fixed registers used for condition codes. */
7980
7981 static bool
7982 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7983 {
7984 *p1 = CC_REGNUM;
7985 *p2 = INVALID_REGNUM;
7986 return true;
7987 }
7988
7989 /* This function is used by the call expanders of the machine description.
7990 RESULT is the register in which the result is returned. It's NULL for
7991 "call" and "sibcall".
7992 MEM is the location of the function call.
7993 SIBCALL indicates whether this function call is a normal call or a sibling
7994 call; a different pattern is generated accordingly. */
7995
7996 void
7997 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7998 {
7999 rtx call, callee, tmp;
8000 rtvec vec;
8001 machine_mode mode;
8002
8003 gcc_assert (MEM_P (mem));
8004 callee = XEXP (mem, 0);
8005 mode = GET_MODE (callee);
8006 gcc_assert (mode == Pmode);
8007
8008 /* Decide if we should generate indirect calls by loading the
8009 address of the callee into a register before performing
8010 the branch-and-link. */
8011 if (SYMBOL_REF_P (callee)
8012 ? (aarch64_is_long_call_p (callee)
8013 || aarch64_is_noplt_call_p (callee))
8014 : !REG_P (callee))
8015 XEXP (mem, 0) = force_reg (mode, callee);
8016
8017 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
8018
8019 if (result != NULL_RTX)
8020 call = gen_rtx_SET (result, call);
8021
8022 if (sibcall)
8023 tmp = ret_rtx;
8024 else
8025 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
8026
8027 vec = gen_rtvec (2, call, tmp);
8028 call = gen_rtx_PARALLEL (VOIDmode, vec);
8029
8030 aarch64_emit_call_insn (call);
8031 }
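
/* Illustrative RTL (not from the original sources, assuming LP64): for a
   normal call returning a value in x0 the code above emits

     (parallel [(set (reg:DI x0)
                     (call (mem:DI (symbol_ref "foo")) (const_int 0)))
                (clobber (reg:DI x30))])

   whereas for a sibcall the CLOBBER of the link register is replaced by
   (return). */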
8032
8033 /* Emit call insn with PAT and do aarch64-specific handling. */
8034
8035 void
8036 aarch64_emit_call_insn (rtx pat)
8037 {
8038 rtx insn = emit_call_insn (pat);
8039
8040 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
8041 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
8042 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
8043 }
8044
8045 machine_mode
8046 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
8047 {
8048 machine_mode mode_x = GET_MODE (x);
8049 rtx_code code_x = GET_CODE (x);
8050
8051 /* All floating point compares return CCFP if it is an equality
8052 comparison, and CCFPE otherwise. */
8053 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
8054 {
8055 switch (code)
8056 {
8057 case EQ:
8058 case NE:
8059 case UNORDERED:
8060 case ORDERED:
8061 case UNLT:
8062 case UNLE:
8063 case UNGT:
8064 case UNGE:
8065 case UNEQ:
8066 return CCFPmode;
8067
8068 case LT:
8069 case LE:
8070 case GT:
8071 case GE:
8072 case LTGT:
8073 return CCFPEmode;
8074
8075 default:
8076 gcc_unreachable ();
8077 }
8078 }
8079
8080 /* Equality comparisons of short modes against zero can be performed
8081 using the TST instruction with the appropriate bitmask. */
8082 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
8083 && (code == EQ || code == NE)
8084 && (mode_x == HImode || mode_x == QImode))
8085 return CC_NZmode;
8086
8087 /* Similarly, comparisons of zero_extends from shorter modes can
8088 be performed using an ANDS with an immediate mask. */
8089 if (y == const0_rtx && code_x == ZERO_EXTEND
8090 && (mode_x == SImode || mode_x == DImode)
8091 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
8092 && (code == EQ || code == NE))
8093 return CC_NZmode;
8094
8095 if ((mode_x == SImode || mode_x == DImode)
8096 && y == const0_rtx
8097 && (code == EQ || code == NE || code == LT || code == GE)
8098 && (code_x == PLUS || code_x == MINUS || code_x == AND
8099 || code_x == NEG
8100 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
8101 && CONST_INT_P (XEXP (x, 2)))))
8102 return CC_NZmode;
8103
8104 /* A compare with a shifted operand. Because of canonicalization,
8105 the comparison will have to be swapped when we emit the assembly
8106 code. */
8107 if ((mode_x == SImode || mode_x == DImode)
8108 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
8109 && (code_x == ASHIFT || code_x == ASHIFTRT
8110 || code_x == LSHIFTRT
8111 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
8112 return CC_SWPmode;
8113
8114 /* Similarly for a negated operand, but we can only do this for
8115 equalities. */
8116 if ((mode_x == SImode || mode_x == DImode)
8117 && (REG_P (y) || GET_CODE (y) == SUBREG)
8118 && (code == EQ || code == NE)
8119 && code_x == NEG)
8120 return CC_Zmode;
8121
8122 /* A test for unsigned overflow from an addition. */
8123 if ((mode_x == DImode || mode_x == TImode)
8124 && (code == LTU || code == GEU)
8125 && code_x == PLUS
8126 && rtx_equal_p (XEXP (x, 0), y))
8127 return CC_Cmode;
8128
8129 /* A test for unsigned overflow from an add with carry. */
8130 if ((mode_x == DImode || mode_x == TImode)
8131 && (code == LTU || code == GEU)
8132 && code_x == PLUS
8133 && CONST_SCALAR_INT_P (y)
8134 && (rtx_mode_t (y, mode_x)
8135 == (wi::shwi (1, mode_x)
8136 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
8137 return CC_ADCmode;
8138
8139 /* A test for signed overflow. */
8140 if ((mode_x == DImode || mode_x == TImode)
8141 && code == NE
8142 && code_x == PLUS
8143 && GET_CODE (y) == SIGN_EXTEND)
8144 return CC_Vmode;
8145
8146 /* For everything else, return CCmode. */
8147 return CCmode;
8148 }
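
/* Illustrative examples (not from the original sources):

     (compare (plus:DI a b) (const_int 0)), EQ/NE/LT/GE  -> CC_NZmode
       (an ADDS/SUBS sets the N and Z flags directly)
     (compare (ashift:DI a n) b)                         -> CC_SWPmode
       (emitted as "cmp b, a, lsl n", so the condition is swapped)
     floating-point equality/unordered comparisons       -> CCFPmode  */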
8149
8150 static int
8151 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
8152
8153 int
8154 aarch64_get_condition_code (rtx x)
8155 {
8156 machine_mode mode = GET_MODE (XEXP (x, 0));
8157 enum rtx_code comp_code = GET_CODE (x);
8158
8159 if (GET_MODE_CLASS (mode) != MODE_CC)
8160 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
8161 return aarch64_get_condition_code_1 (mode, comp_code);
8162 }
8163
8164 static int
8165 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
8166 {
8167 switch (mode)
8168 {
8169 case E_CCFPmode:
8170 case E_CCFPEmode:
8171 switch (comp_code)
8172 {
8173 case GE: return AARCH64_GE;
8174 case GT: return AARCH64_GT;
8175 case LE: return AARCH64_LS;
8176 case LT: return AARCH64_MI;
8177 case NE: return AARCH64_NE;
8178 case EQ: return AARCH64_EQ;
8179 case ORDERED: return AARCH64_VC;
8180 case UNORDERED: return AARCH64_VS;
8181 case UNLT: return AARCH64_LT;
8182 case UNLE: return AARCH64_LE;
8183 case UNGT: return AARCH64_HI;
8184 case UNGE: return AARCH64_PL;
8185 default: return -1;
8186 }
8187 break;
8188
8189 case E_CCmode:
8190 switch (comp_code)
8191 {
8192 case NE: return AARCH64_NE;
8193 case EQ: return AARCH64_EQ;
8194 case GE: return AARCH64_GE;
8195 case GT: return AARCH64_GT;
8196 case LE: return AARCH64_LE;
8197 case LT: return AARCH64_LT;
8198 case GEU: return AARCH64_CS;
8199 case GTU: return AARCH64_HI;
8200 case LEU: return AARCH64_LS;
8201 case LTU: return AARCH64_CC;
8202 default: return -1;
8203 }
8204 break;
8205
8206 case E_CC_SWPmode:
8207 switch (comp_code)
8208 {
8209 case NE: return AARCH64_NE;
8210 case EQ: return AARCH64_EQ;
8211 case GE: return AARCH64_LE;
8212 case GT: return AARCH64_LT;
8213 case LE: return AARCH64_GE;
8214 case LT: return AARCH64_GT;
8215 case GEU: return AARCH64_LS;
8216 case GTU: return AARCH64_CC;
8217 case LEU: return AARCH64_CS;
8218 case LTU: return AARCH64_HI;
8219 default: return -1;
8220 }
8221 break;
8222
8223 case E_CC_NZCmode:
8224 switch (comp_code)
8225 {
8226 case NE: return AARCH64_NE; /* = any */
8227 case EQ: return AARCH64_EQ; /* = none */
8228 case GE: return AARCH64_PL; /* = nfrst */
8229 case LT: return AARCH64_MI; /* = first */
8230 case GEU: return AARCH64_CS; /* = nlast */
8231 case GTU: return AARCH64_HI; /* = pmore */
8232 case LEU: return AARCH64_LS; /* = plast */
8233 case LTU: return AARCH64_CC; /* = last */
8234 default: return -1;
8235 }
8236 break;
8237
8238 case E_CC_NZmode:
8239 switch (comp_code)
8240 {
8241 case NE: return AARCH64_NE;
8242 case EQ: return AARCH64_EQ;
8243 case GE: return AARCH64_PL;
8244 case LT: return AARCH64_MI;
8245 default: return -1;
8246 }
8247 break;
8248
8249 case E_CC_Zmode:
8250 switch (comp_code)
8251 {
8252 case NE: return AARCH64_NE;
8253 case EQ: return AARCH64_EQ;
8254 default: return -1;
8255 }
8256 break;
8257
8258 case E_CC_Cmode:
8259 switch (comp_code)
8260 {
8261 case LTU: return AARCH64_CS;
8262 case GEU: return AARCH64_CC;
8263 default: return -1;
8264 }
8265 break;
8266
8267 case E_CC_ADCmode:
8268 switch (comp_code)
8269 {
8270 case GEU: return AARCH64_CS;
8271 case LTU: return AARCH64_CC;
8272 default: return -1;
8273 }
8274 break;
8275
8276 case E_CC_Vmode:
8277 switch (comp_code)
8278 {
8279 case NE: return AARCH64_VS;
8280 case EQ: return AARCH64_VC;
8281 default: return -1;
8282 }
8283 break;
8284
8285 default:
8286 return -1;
8287 }
8288
8289 return -1;
8290 }
8291
8292 bool
8293 aarch64_const_vec_all_same_in_range_p (rtx x,
8294 HOST_WIDE_INT minval,
8295 HOST_WIDE_INT maxval)
8296 {
8297 rtx elt;
8298 return (const_vec_duplicate_p (x, &elt)
8299 && CONST_INT_P (elt)
8300 && IN_RANGE (INTVAL (elt), minval, maxval));
8301 }
8302
8303 bool
8304 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
8305 {
8306 return aarch64_const_vec_all_same_in_range_p (x, val, val);
8307 }
8308
8309 /* Return true if VEC is a constant in which every element is in the range
8310 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
8311
8312 static bool
8313 aarch64_const_vec_all_in_range_p (rtx vec,
8314 HOST_WIDE_INT minval,
8315 HOST_WIDE_INT maxval)
8316 {
8317 if (GET_CODE (vec) != CONST_VECTOR
8318 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
8319 return false;
8320
8321 int nunits;
8322 if (!CONST_VECTOR_STEPPED_P (vec))
8323 nunits = const_vector_encoded_nelts (vec);
8324 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
8325 return false;
8326
8327 for (int i = 0; i < nunits; i++)
8328 {
8329 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
8330 if (!CONST_INT_P (vec_elem)
8331 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
8332 return false;
8333 }
8334 return true;
8335 }
8336
8337 /* N Z C V. */
8338 #define AARCH64_CC_V 1
8339 #define AARCH64_CC_C (1 << 1)
8340 #define AARCH64_CC_Z (1 << 2)
8341 #define AARCH64_CC_N (1 << 3)
8342
8343 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
8344 static const int aarch64_nzcv_codes[] =
8345 {
8346 0, /* EQ, Z == 1. */
8347 AARCH64_CC_Z, /* NE, Z == 0. */
8348 0, /* CS, C == 1. */
8349 AARCH64_CC_C, /* CC, C == 0. */
8350 0, /* MI, N == 1. */
8351 AARCH64_CC_N, /* PL, N == 0. */
8352 0, /* VS, V == 1. */
8353 AARCH64_CC_V, /* VC, V == 0. */
8354 0, /* HI, C == 1 && Z == 0. */
8355 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
8356 AARCH64_CC_V, /* GE, N == V. */
8357 0, /* LT, N != V. */
8358 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
8359 0, /* LE, !(Z == 0 && N == V). */
8360 0, /* AL, Any. */
8361 0 /* NV, Any. */
8362 };
8363
8364 /* Print floating-point vector immediate operand X to F, negating it
8365 first if NEGATE is true. Return true on success, false if it isn't
8366 a constant we can handle. */
8367
8368 static bool
8369 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
8370 {
8371 rtx elt;
8372
8373 if (!const_vec_duplicate_p (x, &elt))
8374 return false;
8375
8376 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
8377 if (negate)
8378 r = real_value_negate (&r);
8379
8380 /* Handle the SVE single-bit immediates specially, since they have a
8381 fixed form in the assembly syntax. */
8382 if (real_equal (&r, &dconst0))
8383 asm_fprintf (f, "0.0");
8384 else if (real_equal (&r, &dconst2))
8385 asm_fprintf (f, "2.0");
8386 else if (real_equal (&r, &dconst1))
8387 asm_fprintf (f, "1.0");
8388 else if (real_equal (&r, &dconsthalf))
8389 asm_fprintf (f, "0.5");
8390 else
8391 {
8392 const int buf_size = 20;
8393 char float_buf[buf_size] = {'\0'};
8394 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
8395 1, GET_MODE (elt));
8396 asm_fprintf (f, "%s", float_buf);
8397 }
8398
8399 return true;
8400 }
8401
8402 /* Return the equivalent letter for size. */
8403 static char
8404 sizetochar (int size)
8405 {
8406 switch (size)
8407 {
8408 case 64: return 'd';
8409 case 32: return 's';
8410 case 16: return 'h';
8411 case 8 : return 'b';
8412 default: gcc_unreachable ();
8413 }
8414 }
8415
8416 /* Print operand X to file F in a target specific manner according to CODE.
8417 The acceptable formatting commands given by CODE are:
8418 'c': An integer or symbol address without a preceding #
8419 sign.
8420 'C': Take the duplicated element in a vector constant
8421 and print it in hex.
8422 'D': Take the duplicated element in a vector constant
8423 and print it as an unsigned integer, in decimal.
8424 'e': Print the sign/zero-extend size as a character 8->b,
8425 16->h, 32->w. Can also be used for masks:
8426 0xff->b, 0xffff->h, 0xffffffff->w.
8427 'I': If the operand is a duplicated vector constant,
8428 replace it with the duplicated scalar. If the
8429 operand is then a floating-point constant, replace
8430 it with the integer bit representation. Print the
8431 transformed constant as a signed decimal number.
8432 'p': Prints N such that 2^N == X (X must be power of 2 and
8433 const int).
8434 'P': Print the number of non-zero bits in X (a const_int).
8435 'H': Print the higher numbered register of a pair (TImode)
8436 of regs.
8437 'm': Print a condition (eq, ne, etc).
8438 'M': Same as 'm', but invert condition.
8439 'N': Take the duplicated element in a vector constant
8440 and print the negative of it in decimal.
8441 'b/h/s/d/q': Print a scalar FP/SIMD register name.
8442 'S/T/U/V': Print a FP/SIMD register name for a register list.
8443 The register printed is the FP/SIMD register name
8444 of X + 0/1/2/3 for S/T/U/V.
8445 'R': Print a scalar Integer/FP/SIMD register name + 1.
8446 'X': Print bottom 16 bits of integer constant in hex.
8447 'w/x': Print a general register name or the zero register
8448 (32-bit or 64-bit).
8449 '0': Print a normal operand, if it's a general register,
8450 then we assume DImode.
8451 'k': Print NZCV for conditional compare instructions.
8452 'A': Output address constant representing the first
8453 argument of X, specifying a relocation offset
8454 if appropriate.
8455 'L': Output constant address specified by X
8456 with a relocation offset if appropriate.
8457 'G': Prints address of X, specifying a PC relative
8458 relocation mode if appropriate.
8459 'y': Output address of LDP or STP - this is used for
8460 some LDP/STPs which don't use a PARALLEL in their
8461 pattern (so the mode needs to be adjusted).
8462 'z': Output address of a typical LDP or STP. */
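
/* For instance (illustrative), a pattern template such as
   "add\t%w0, %w1, %w2" prints the 32-bit register names (or wzr for the
   zero register), while "%x0" selects the 64-bit name and "%d0" the
   scalar FP/SIMD form. */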
8463
8464 static void
8465 aarch64_print_operand (FILE *f, rtx x, int code)
8466 {
8467 rtx elt;
8468 switch (code)
8469 {
8470 case 'c':
8471 switch (GET_CODE (x))
8472 {
8473 case CONST_INT:
8474 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
8475 break;
8476
8477 case SYMBOL_REF:
8478 output_addr_const (f, x);
8479 break;
8480
8481 case CONST:
8482 if (GET_CODE (XEXP (x, 0)) == PLUS
8483 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
8484 {
8485 output_addr_const (f, x);
8486 break;
8487 }
8488 /* Fall through. */
8489
8490 default:
8491 output_operand_lossage ("unsupported operand for code '%c'", code);
8492 }
8493 break;
8494
8495 case 'e':
8496 {
8497 x = unwrap_const_vec_duplicate (x);
8498 if (!CONST_INT_P (x))
8499 {
8500 output_operand_lossage ("invalid operand for '%%%c'", code);
8501 return;
8502 }
8503
8504 HOST_WIDE_INT val = INTVAL (x);
8505 if ((val & ~7) == 8 || val == 0xff)
8506 fputc ('b', f);
8507 else if ((val & ~7) == 16 || val == 0xffff)
8508 fputc ('h', f);
8509 else if ((val & ~7) == 32 || val == 0xffffffff)
8510 fputc ('w', f);
8511 else
8512 {
8513 output_operand_lossage ("invalid operand for '%%%c'", code);
8514 return;
8515 }
8516 }
8517 break;
8518
8519 case 'p':
8520 {
8521 int n;
8522
8523 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
8524 {
8525 output_operand_lossage ("invalid operand for '%%%c'", code);
8526 return;
8527 }
8528
8529 asm_fprintf (f, "%d", n);
8530 }
8531 break;
8532
8533 case 'P':
8534 if (!CONST_INT_P (x))
8535 {
8536 output_operand_lossage ("invalid operand for '%%%c'", code);
8537 return;
8538 }
8539
8540 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
8541 break;
8542
8543 case 'H':
8544 if (x == const0_rtx)
8545 {
8546 asm_fprintf (f, "xzr");
8547 break;
8548 }
8549
8550 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
8551 {
8552 output_operand_lossage ("invalid operand for '%%%c'", code);
8553 return;
8554 }
8555
8556 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
8557 break;
8558
8559 case 'I':
8560 {
8561 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
8562 if (CONST_INT_P (x))
8563 asm_fprintf (f, "%wd", INTVAL (x));
8564 else
8565 {
8566 output_operand_lossage ("invalid operand for '%%%c'", code);
8567 return;
8568 }
8569 break;
8570 }
8571
8572 case 'M':
8573 case 'm':
8574 {
8575 int cond_code;
8576 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
8577 if (x == const_true_rtx)
8578 {
8579 if (code == 'M')
8580 fputs ("nv", f);
8581 return;
8582 }
8583
8584 if (!COMPARISON_P (x))
8585 {
8586 output_operand_lossage ("invalid operand for '%%%c'", code);
8587 return;
8588 }
8589
8590 cond_code = aarch64_get_condition_code (x);
8591 gcc_assert (cond_code >= 0);
8592 if (code == 'M')
8593 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
8594 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
8595 fputs (aarch64_sve_condition_codes[cond_code], f);
8596 else
8597 fputs (aarch64_condition_codes[cond_code], f);
8598 }
8599 break;
8600
8601 case 'N':
8602 if (!const_vec_duplicate_p (x, &elt))
8603 {
8604 output_operand_lossage ("invalid vector constant");
8605 return;
8606 }
8607
8608 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8609 asm_fprintf (f, "%wd", -INTVAL (elt));
8610 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8611 && aarch64_print_vector_float_operand (f, x, true))
8612 ;
8613 else
8614 {
8615 output_operand_lossage ("invalid vector constant");
8616 return;
8617 }
8618 break;
8619
8620 case 'b':
8621 case 'h':
8622 case 's':
8623 case 'd':
8624 case 'q':
8625 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8626 {
8627 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8628 return;
8629 }
8630 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
8631 break;
8632
8633 case 'S':
8634 case 'T':
8635 case 'U':
8636 case 'V':
8637 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8638 {
8639 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8640 return;
8641 }
8642 asm_fprintf (f, "%c%d",
8643 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
8644 REGNO (x) - V0_REGNUM + (code - 'S'));
8645 break;
8646
8647 case 'R':
8648 if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
8649 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
8650 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
8651 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
8652 else
8653 output_operand_lossage ("incompatible register operand for '%%%c'",
8654 code);
8655 break;
8656
8657 case 'X':
8658 if (!CONST_INT_P (x))
8659 {
8660 output_operand_lossage ("invalid operand for '%%%c'", code);
8661 return;
8662 }
8663 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
8664 break;
8665
8666 case 'C':
8667 {
8668 /* Print a replicated constant in hex. */
8669 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8670 {
8671 output_operand_lossage ("invalid operand for '%%%c'", code);
8672 return;
8673 }
8674 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8675 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8676 }
8677 break;
8678
8679 case 'D':
8680 {
8681 /* Print a replicated constant in decimal, treating it as
8682 unsigned. */
8683 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8684 {
8685 output_operand_lossage ("invalid operand for '%%%c'", code);
8686 return;
8687 }
8688 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8689 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8690 }
8691 break;
8692
8693 case 'w':
8694 case 'x':
8695 if (x == const0_rtx
8696 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
8697 {
8698 asm_fprintf (f, "%czr", code);
8699 break;
8700 }
8701
8702 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
8703 {
8704 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
8705 break;
8706 }
8707
8708 if (REG_P (x) && REGNO (x) == SP_REGNUM)
8709 {
8710 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
8711 break;
8712 }
8713
8714 /* Fall through */
8715
8716 case 0:
8717 if (x == NULL)
8718 {
8719 output_operand_lossage ("missing operand");
8720 return;
8721 }
8722
8723 switch (GET_CODE (x))
8724 {
8725 case REG:
8726 if (aarch64_sve_data_mode_p (GET_MODE (x)))
8727 {
8728 if (REG_NREGS (x) == 1)
8729 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
8730 else
8731 {
8732 char suffix
8733 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
8734 asm_fprintf (f, "{z%d.%c - z%d.%c}",
8735 REGNO (x) - V0_REGNUM, suffix,
8736 END_REGNO (x) - V0_REGNUM - 1, suffix);
8737 }
8738 }
8739 else
8740 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
8741 break;
8742
8743 case MEM:
8744 output_address (GET_MODE (x), XEXP (x, 0));
8745 break;
8746
8747 case LABEL_REF:
8748 case SYMBOL_REF:
8749 output_addr_const (asm_out_file, x);
8750 break;
8751
8752 case CONST_INT:
8753 asm_fprintf (f, "%wd", INTVAL (x));
8754 break;
8755
8756 case CONST:
8757 if (!VECTOR_MODE_P (GET_MODE (x)))
8758 {
8759 output_addr_const (asm_out_file, x);
8760 break;
8761 }
8762 /* fall through */
8763
8764 case CONST_VECTOR:
8765 if (!const_vec_duplicate_p (x, &elt))
8766 {
8767 output_operand_lossage ("invalid vector constant");
8768 return;
8769 }
8770
8771 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8772 asm_fprintf (f, "%wd", INTVAL (elt));
8773 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8774 && aarch64_print_vector_float_operand (f, x, false))
8775 ;
8776 else
8777 {
8778 output_operand_lossage ("invalid vector constant");
8779 return;
8780 }
8781 break;
8782
8783 case CONST_DOUBLE:
8784 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
8785 be getting CONST_DOUBLEs holding integers. */
8786 gcc_assert (GET_MODE (x) != VOIDmode);
8787 if (aarch64_float_const_zero_rtx_p (x))
8788 {
8789 fputc ('0', f);
8790 break;
8791 }
8792 else if (aarch64_float_const_representable_p (x))
8793 {
8794 #define buf_size 20
8795 char float_buf[buf_size] = {'\0'};
8796 real_to_decimal_for_mode (float_buf,
8797 CONST_DOUBLE_REAL_VALUE (x),
8798 buf_size, buf_size,
8799 1, GET_MODE (x));
8800 asm_fprintf (asm_out_file, "%s", float_buf);
8801 break;
8802 #undef buf_size
8803 }
8804 output_operand_lossage ("invalid constant");
8805 return;
8806 default:
8807 output_operand_lossage ("invalid operand");
8808 return;
8809 }
8810 break;
8811
8812 case 'A':
8813 if (GET_CODE (x) == HIGH)
8814 x = XEXP (x, 0);
8815
8816 switch (aarch64_classify_symbolic_expression (x))
8817 {
8818 case SYMBOL_SMALL_GOT_4G:
8819 asm_fprintf (asm_out_file, ":got:");
8820 break;
8821
8822 case SYMBOL_SMALL_TLSGD:
8823 asm_fprintf (asm_out_file, ":tlsgd:");
8824 break;
8825
8826 case SYMBOL_SMALL_TLSDESC:
8827 asm_fprintf (asm_out_file, ":tlsdesc:");
8828 break;
8829
8830 case SYMBOL_SMALL_TLSIE:
8831 asm_fprintf (asm_out_file, ":gottprel:");
8832 break;
8833
8834 case SYMBOL_TLSLE24:
8835 asm_fprintf (asm_out_file, ":tprel:");
8836 break;
8837
8838 case SYMBOL_TINY_GOT:
8839 gcc_unreachable ();
8840 break;
8841
8842 default:
8843 break;
8844 }
8845 output_addr_const (asm_out_file, x);
8846 break;
8847
8848 case 'L':
8849 switch (aarch64_classify_symbolic_expression (x))
8850 {
8851 case SYMBOL_SMALL_GOT_4G:
8852 asm_fprintf (asm_out_file, ":lo12:");
8853 break;
8854
8855 case SYMBOL_SMALL_TLSGD:
8856 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
8857 break;
8858
8859 case SYMBOL_SMALL_TLSDESC:
8860 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
8861 break;
8862
8863 case SYMBOL_SMALL_TLSIE:
8864 asm_fprintf (asm_out_file, ":gottprel_lo12:");
8865 break;
8866
8867 case SYMBOL_TLSLE12:
8868 asm_fprintf (asm_out_file, ":tprel_lo12:");
8869 break;
8870
8871 case SYMBOL_TLSLE24:
8872 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
8873 break;
8874
8875 case SYMBOL_TINY_GOT:
8876 asm_fprintf (asm_out_file, ":got:");
8877 break;
8878
8879 case SYMBOL_TINY_TLSIE:
8880 asm_fprintf (asm_out_file, ":gottprel:");
8881 break;
8882
8883 default:
8884 break;
8885 }
8886 output_addr_const (asm_out_file, x);
8887 break;
8888
8889 case 'G':
8890 switch (aarch64_classify_symbolic_expression (x))
8891 {
8892 case SYMBOL_TLSLE24:
8893 asm_fprintf (asm_out_file, ":tprel_hi12:");
8894 break;
8895 default:
8896 break;
8897 }
8898 output_addr_const (asm_out_file, x);
8899 break;
8900
8901 case 'k':
8902 {
8903 HOST_WIDE_INT cond_code;
8904
8905 if (!CONST_INT_P (x))
8906 {
8907 output_operand_lossage ("invalid operand for '%%%c'", code);
8908 return;
8909 }
8910
8911 cond_code = INTVAL (x);
8912 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
8913 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
8914 }
8915 break;
8916
8917 case 'y':
8918 case 'z':
8919 {
8920 machine_mode mode = GET_MODE (x);
8921
8922 if (GET_CODE (x) != MEM
8923 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8924 {
8925 output_operand_lossage ("invalid operand for '%%%c'", code);
8926 return;
8927 }
8928
8929 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8930 code == 'y'
8931 ? ADDR_QUERY_LDP_STP_N
8932 : ADDR_QUERY_LDP_STP))
8933 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8934 }
8935 break;
8936
8937 default:
8938 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8939 return;
8940 }
8941 }
8942
8943 /* Print address 'x' of a memory access with mode 'mode'.
8944 'type' is the context required by aarch64_classify_address: a normal
8945 memory access, or an LDP/STP query such as ADDR_QUERY_LDP_STP. */
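/* For example, a constant offset of 16 from x0 is printed as "[x0, 16]",
   a register offset x2 shifted left by 3 as "[x0, x2, lsl 3]", and a
   16-byte pre-increment writeback as "[x0, 16]!".  */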
8946 static bool
8947 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8948 aarch64_addr_query_type type)
8949 {
8950 struct aarch64_address_info addr;
8951 unsigned int size;
8952
8953 /* Check all addresses are Pmode - including ILP32. */
8954 if (GET_MODE (x) != Pmode
8955 && (!CONST_INT_P (x)
8956 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8957 {
8958 output_operand_lossage ("invalid address mode");
8959 return false;
8960 }
8961
8962 if (aarch64_classify_address (&addr, x, mode, true, type))
8963 switch (addr.type)
8964 {
8965 case ADDRESS_REG_IMM:
8966 if (known_eq (addr.const_offset, 0))
8967 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8968 else if (aarch64_sve_data_mode_p (mode))
8969 {
8970 HOST_WIDE_INT vnum
8971 = exact_div (addr.const_offset,
8972 BYTES_PER_SVE_VECTOR).to_constant ();
8973 asm_fprintf (f, "[%s, #%wd, mul vl]",
8974 reg_names[REGNO (addr.base)], vnum);
8975 }
8976 else if (aarch64_sve_pred_mode_p (mode))
8977 {
8978 HOST_WIDE_INT vnum
8979 = exact_div (addr.const_offset,
8980 BYTES_PER_SVE_PRED).to_constant ();
8981 asm_fprintf (f, "[%s, #%wd, mul vl]",
8982 reg_names[REGNO (addr.base)], vnum);
8983 }
8984 else
8985 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8986 INTVAL (addr.offset));
8987 return true;
8988
8989 case ADDRESS_REG_REG:
8990 if (addr.shift == 0)
8991 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8992 reg_names [REGNO (addr.offset)]);
8993 else
8994 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8995 reg_names [REGNO (addr.offset)], addr.shift);
8996 return true;
8997
8998 case ADDRESS_REG_UXTW:
8999 if (addr.shift == 0)
9000 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
9001 REGNO (addr.offset) - R0_REGNUM);
9002 else
9003 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
9004 REGNO (addr.offset) - R0_REGNUM, addr.shift);
9005 return true;
9006
9007 case ADDRESS_REG_SXTW:
9008 if (addr.shift == 0)
9009 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
9010 REGNO (addr.offset) - R0_REGNUM);
9011 else
9012 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
9013 REGNO (addr.offset) - R0_REGNUM, addr.shift);
9014 return true;
9015
9016 case ADDRESS_REG_WB:
9017 /* Writeback is only supported for fixed-width modes. */
9018 size = GET_MODE_SIZE (mode).to_constant ();
9019 switch (GET_CODE (x))
9020 {
9021 case PRE_INC:
9022 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
9023 return true;
9024 case POST_INC:
9025 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
9026 return true;
9027 case PRE_DEC:
9028 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
9029 return true;
9030 case POST_DEC:
9031 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
9032 return true;
9033 case PRE_MODIFY:
9034 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
9035 INTVAL (addr.offset));
9036 return true;
9037 case POST_MODIFY:
9038 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
9039 INTVAL (addr.offset));
9040 return true;
9041 default:
9042 break;
9043 }
9044 break;
9045
9046 case ADDRESS_LO_SUM:
9047 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
9048 output_addr_const (f, addr.offset);
9049 asm_fprintf (f, "]");
9050 return true;
9051
9052 case ADDRESS_SYMBOLIC:
9053 output_addr_const (f, x);
9054 return true;
9055 }
9056
9057 return false;
9058 }
9059
9060 /* Print address 'x' of a memory access with mode 'mode'. */
9061 static void
9062 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
9063 {
9064 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
9065 output_addr_const (f, x);
9066 }
9067
9068 bool
9069 aarch64_label_mentioned_p (rtx x)
9070 {
9071 const char *fmt;
9072 int i;
9073
9074 if (GET_CODE (x) == LABEL_REF)
9075 return true;
9076
9077 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
9078 referencing instruction, but they are constant offsets, not
9079 symbols. */
9080 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
9081 return false;
9082
9083 fmt = GET_RTX_FORMAT (GET_CODE (x));
9084 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
9085 {
9086 if (fmt[i] == 'E')
9087 {
9088 int j;
9089
9090 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
9091 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
9092 return 1;
9093 }
9094 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
9095 return 1;
9096 }
9097
9098 return 0;
9099 }
9100
9101 /* Implement REGNO_REG_CLASS. */
9102
9103 enum reg_class
9104 aarch64_regno_regclass (unsigned regno)
9105 {
9106 if (GP_REGNUM_P (regno))
9107 return GENERAL_REGS;
9108
9109 if (regno == SP_REGNUM)
9110 return STACK_REG;
9111
9112 if (regno == FRAME_POINTER_REGNUM
9113 || regno == ARG_POINTER_REGNUM)
9114 return POINTER_REGS;
9115
9116 if (FP_REGNUM_P (regno))
9117 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
9118 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
9119
9120 if (PR_REGNUM_P (regno))
9121 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
9122
9123 return NO_REGS;
9124 }
9125
9126 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
9127 If OFFSET is out of range, return an offset of an anchor point
9128 that is in range. Return 0 otherwise. */
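/* For example, a 4-byte access at offset 0x12344 is given an anchor at
   0x10000; the residual offset 0x2344 then fits the scaled 12-bit
   LDR/STR immediate field.  */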
9129
9130 static HOST_WIDE_INT
9131 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
9132 machine_mode mode)
9133 {
9134 /* Does it look like we'll need a 16-byte load/store-pair operation? */
9135 if (size > 16)
9136 return (offset + 0x400) & ~0x7f0;
9137
9138 /* For offsets that aren't a multiple of the access size, the limit is
9139 -256...255. */
9140 if (offset & (size - 1))
9141 {
9142 /* BLKmode typically uses LDP of X-registers. */
9143 if (mode == BLKmode)
9144 return (offset + 512) & ~0x3ff;
9145 return (offset + 0x100) & ~0x1ff;
9146 }
9147
9148 /* Small negative offsets are supported. */
9149 if (IN_RANGE (offset, -256, 0))
9150 return 0;
9151
9152 if (mode == TImode || mode == TFmode)
9153 return (offset + 0x100) & ~0x1ff;
9154
9155 /* Otherwise use the 12-bit unsigned offset scaled by the access size. */
9156 return offset & (~0xfff * size);
9157 }
9158
9159 static rtx
9160 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
9161 {
9162 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
9163 where mask is selected by alignment and size of the offset.
9164 We try to pick as large a range for the offset as possible to
9165 maximize the chance of a CSE. However, for aligned addresses
9166 we limit the range to 4k so that structures with different sized
9167 elements are likely to use the same base. We need to be careful
9168 not to split a CONST for some forms of address expression, otherwise
9169 it will generate sub-optimal code. */
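/* For example, a DImode access at x0 + 0x12348 is split into
   y = x0 + 0x10000 followed by y + 0x2348, so that nearby accesses with
   different small offsets can share the anchor base y.  */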
9170
9171 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
9172 {
9173 rtx base = XEXP (x, 0);
9174 rtx offset_rtx = XEXP (x, 1);
9175 HOST_WIDE_INT offset = INTVAL (offset_rtx);
9176
9177 if (GET_CODE (base) == PLUS)
9178 {
9179 rtx op0 = XEXP (base, 0);
9180 rtx op1 = XEXP (base, 1);
9181
9182 /* Force any scaling into a temp for CSE. */
9183 op0 = force_reg (Pmode, op0);
9184 op1 = force_reg (Pmode, op1);
9185
9186 /* Let the pointer register be in op0. */
9187 if (REG_POINTER (op1))
9188 std::swap (op0, op1);
9189
9190 /* If the pointer is virtual or frame related, then we know that
9191 virtual register instantiation or register elimination is going
9192 to apply a second constant. We want the two constants folded
9193 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
9194 if (virt_or_elim_regno_p (REGNO (op0)))
9195 {
9196 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
9197 NULL_RTX, true, OPTAB_DIRECT);
9198 return gen_rtx_PLUS (Pmode, base, op1);
9199 }
9200
9201 /* Otherwise, in order to encourage CSE (and thence loop strength
9202 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
9203 base = expand_binop (Pmode, add_optab, op0, op1,
9204 NULL_RTX, true, OPTAB_DIRECT);
9205 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
9206 }
9207
9208 HOST_WIDE_INT size;
9209 if (GET_MODE_SIZE (mode).is_constant (&size))
9210 {
9211 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
9212 mode);
9213 if (base_offset != 0)
9214 {
9215 base = plus_constant (Pmode, base, base_offset);
9216 base = force_operand (base, NULL_RTX);
9217 return plus_constant (Pmode, base, offset - base_offset);
9218 }
9219 }
9220 }
9221
9222 return x;
9223 }
9224
9225 static reg_class_t
9226 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
9227 reg_class_t rclass,
9228 machine_mode mode,
9229 secondary_reload_info *sri)
9230 {
9231 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
9232 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
9233 comment at the head of aarch64-sve.md for more details about the
9234 big-endian handling. */
9235 if (BYTES_BIG_ENDIAN
9236 && reg_class_subset_p (rclass, FP_REGS)
9237 && !((REG_P (x) && HARD_REGISTER_P (x))
9238 || aarch64_simd_valid_immediate (x, NULL))
9239 && aarch64_sve_data_mode_p (mode))
9240 {
9241 sri->icode = CODE_FOR_aarch64_sve_reload_be;
9242 return NO_REGS;
9243 }
9244
9245 /* If we have to disable direct literal pool loads and stores because the
9246 function is too big, then we need a scratch register. */
9247 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
9248 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
9249 || targetm.vector_mode_supported_p (GET_MODE (x)))
9250 && !aarch64_pcrelative_literal_loads)
9251 {
9252 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
9253 return NO_REGS;
9254 }
9255
9256 /* Without the TARGET_SIMD instructions we cannot move a Q register
9257 to a Q register directly. We need a scratch. */
9258 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
9259 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
9260 && reg_class_subset_p (rclass, FP_REGS))
9261 {
9262 sri->icode = code_for_aarch64_reload_mov (mode);
9263 return NO_REGS;
9264 }
9265
9266 /* A TFmode or TImode memory access should be handled via an FP register
9267 (FP_REGS) because AArch64 has richer addressing modes for LDR/STR
9268 instructions than LDP/STP instructions. */
9269 if (TARGET_FLOAT && rclass == GENERAL_REGS
9270 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
9271 return FP_REGS;
9272
9273 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
9274 return GENERAL_REGS;
9275
9276 return NO_REGS;
9277 }
9278
9279 static bool
9280 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
9281 {
9282 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
9283
9284 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
9285 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
9286 if (frame_pointer_needed)
9287 return to == HARD_FRAME_POINTER_REGNUM;
9288 return true;
9289 }
9290
9291 poly_int64
9292 aarch64_initial_elimination_offset (unsigned from, unsigned to)
9293 {
9294 if (to == HARD_FRAME_POINTER_REGNUM)
9295 {
9296 if (from == ARG_POINTER_REGNUM)
9297 return cfun->machine->frame.hard_fp_offset;
9298
9299 if (from == FRAME_POINTER_REGNUM)
9300 return cfun->machine->frame.hard_fp_offset
9301 - cfun->machine->frame.locals_offset;
9302 }
9303
9304 if (to == STACK_POINTER_REGNUM)
9305 {
9306 if (from == FRAME_POINTER_REGNUM)
9307 return cfun->machine->frame.frame_size
9308 - cfun->machine->frame.locals_offset;
9309 }
9310
9311 return cfun->machine->frame.frame_size;
9312 }
9313
9314 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
9315 previous frame. */
9316
9317 rtx
9318 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
9319 {
9320 if (count != 0)
9321 return const0_rtx;
9322 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
9323 }
9324
9325
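/* For LP64 without BTI the template emitted below is roughly:

	ldr	x17, .+16	// load the target function address
	ldr	x18, .+20	// load the static chain value
	br	x17
	.word	0		// padding
	.dword	0		// patched with the function address
	.dword	0		// patched with the static chain

   When BTI is enabled, a "hint 34" (BTI c) is emitted first and the
   padding word is dropped, so the literals stay at the same offsets.
   aarch64_trampoline_init below fills in the two trailing pointers.  */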
9326 static void
9327 aarch64_asm_trampoline_template (FILE *f)
9328 {
9329 int offset1 = 16;
9330 int offset2 = 20;
9331
9332 if (aarch64_bti_enabled ())
9333 {
9334 asm_fprintf (f, "\thint\t34 // bti c\n");
9335 offset1 -= 4;
9336 offset2 -= 4;
9337 }
9338
9339 if (TARGET_ILP32)
9340 {
9341 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
9342 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
9343 offset1);
9344 }
9345 else
9346 {
9347 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
9348 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
9349 offset2);
9350 }
9351 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
9352
9353 /* The trampoline needs an extra padding instruction. If BTI is enabled,
9354 the padding instruction is replaced by the BTI instruction emitted at
9355 the beginning. */
9356 if (!aarch64_bti_enabled ())
9357 assemble_aligned_integer (4, const0_rtx);
9358
9359 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9360 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9361 }
9362
9363 static void
9364 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
9365 {
9366 rtx fnaddr, mem, a_tramp;
9367 const int tramp_code_sz = 16;
9368
9369 /* We don't need to copy the trailing D-words; we fill those in below. */
9370 emit_block_move (m_tramp, assemble_trampoline_template (),
9371 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
9372 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
9373 fnaddr = XEXP (DECL_RTL (fndecl), 0);
9374 if (GET_MODE (fnaddr) != ptr_mode)
9375 fnaddr = convert_memory_address (ptr_mode, fnaddr);
9376 emit_move_insn (mem, fnaddr);
9377
9378 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
9379 emit_move_insn (mem, chain_value);
9380
9381 /* XXX We should really define a "clear_cache" pattern and use
9382 gen_clear_cache(). */
9383 a_tramp = XEXP (m_tramp, 0);
9384 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
9385 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
9386 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
9387 ptr_mode);
9388 }
9389
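/* Return the maximum number of registers of class REGCLASS needed to
   hold a value of mode MODE.  For example, V4SImode (16 bytes) fits in
   a single vector register, whereas a 16-byte non-vector mode such as
   TImode is counted as two word-sized registers.  */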
9390 static unsigned char
9391 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
9392 {
9393 /* ??? Logically we should only need to provide a value when
9394 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
9395 can hold MODE, but at the moment we need to handle all modes.
9396 Just ignore any runtime parts for registers that can't store them. */
9397 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
9398 unsigned int nregs;
9399 switch (regclass)
9400 {
9401 case TAILCALL_ADDR_REGS:
9402 case POINTER_REGS:
9403 case GENERAL_REGS:
9404 case ALL_REGS:
9405 case POINTER_AND_FP_REGS:
9406 case FP_REGS:
9407 case FP_LO_REGS:
9408 case FP_LO8_REGS:
9409 if (aarch64_sve_data_mode_p (mode)
9410 && constant_multiple_p (GET_MODE_SIZE (mode),
9411 BYTES_PER_SVE_VECTOR, &nregs))
9412 return nregs;
9413 return (aarch64_vector_data_mode_p (mode)
9414 ? CEIL (lowest_size, UNITS_PER_VREG)
9415 : CEIL (lowest_size, UNITS_PER_WORD));
9416 case STACK_REG:
9417 case PR_REGS:
9418 case PR_LO_REGS:
9419 case PR_HI_REGS:
9420 return 1;
9421
9422 case NO_REGS:
9423 return 0;
9424
9425 default:
9426 break;
9427 }
9428 gcc_unreachable ();
9429 }
9430
9431 static reg_class_t
9432 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
9433 {
9434 if (regclass == POINTER_REGS)
9435 return GENERAL_REGS;
9436
9437 if (regclass == STACK_REG)
9438 {
9439 if (REG_P(x)
9440 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
9441 return regclass;
9442
9443 return NO_REGS;
9444 }
9445
9446 /* Register elimination can result in a request for
9447 SP+constant->FP_REGS. We cannot support such operations, which
9448 use SP as the source and an FP_REG as the destination, so reject
9449 them right now. */
9450 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
9451 {
9452 rtx lhs = XEXP (x, 0);
9453
9454 /* Look through a possible SUBREG introduced by ILP32. */
9455 if (GET_CODE (lhs) == SUBREG)
9456 lhs = SUBREG_REG (lhs);
9457
9458 gcc_assert (REG_P (lhs));
9459 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
9460 POINTER_REGS));
9461 return NO_REGS;
9462 }
9463
9464 return regclass;
9465 }
9466
9467 void
9468 aarch64_asm_output_labelref (FILE* f, const char *name)
9469 {
9470 asm_fprintf (f, "%U%s", name);
9471 }
9472
9473 static void
9474 aarch64_elf_asm_constructor (rtx symbol, int priority)
9475 {
9476 if (priority == DEFAULT_INIT_PRIORITY)
9477 default_ctor_section_asm_out_constructor (symbol, priority);
9478 else
9479 {
9480 section *s;
9481 /* While priority is known to be in the range [0, 65535] (so 18 bytes
9482 would be enough), the compiler might not know that. To avoid a
9483 -Wformat-truncation false positive, use a larger size. */
9484 char buf[23];
9485 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
9486 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9487 switch_to_section (s);
9488 assemble_align (POINTER_SIZE);
9489 assemble_aligned_integer (POINTER_BYTES, symbol);
9490 }
9491 }
9492
9493 static void
9494 aarch64_elf_asm_destructor (rtx symbol, int priority)
9495 {
9496 if (priority == DEFAULT_INIT_PRIORITY)
9497 default_dtor_section_asm_out_destructor (symbol, priority);
9498 else
9499 {
9500 section *s;
9501 /* While priority is known to be in the range [0, 65535] (so 18 bytes
9502 would be enough), the compiler might not know that. To avoid a
9503 -Wformat-truncation false positive, use a larger size. */
9504 char buf[23];
9505 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
9506 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9507 switch_to_section (s);
9508 assemble_align (POINTER_SIZE);
9509 assemble_aligned_integer (POINTER_BYTES, symbol);
9510 }
9511 }
9512
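/* For a HImode dispatch table, with the table base in x0, the index in
   w1, and operands 3 and 4 allocated to x3 and x4, the sequence emitted
   below is roughly:

	ldrh	w3, [x0, w1, uxtw #1]	// load the table entry
	adr	x4, .Lrtx<N>		// address of the anchor label
	add	x3, x4, w3, sxth #2	// entry is a scaled offset from it
	br	x3
   .Lrtx<N>:  */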
9513 const char*
9514 aarch64_output_casesi (rtx *operands)
9515 {
9516 char buf[100];
9517 char label[100];
9518 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
9519 int index;
9520 static const char *const patterns[4][2] =
9521 {
9522 {
9523 "ldrb\t%w3, [%0,%w1,uxtw]",
9524 "add\t%3, %4, %w3, sxtb #2"
9525 },
9526 {
9527 "ldrh\t%w3, [%0,%w1,uxtw #1]",
9528 "add\t%3, %4, %w3, sxth #2"
9529 },
9530 {
9531 "ldr\t%w3, [%0,%w1,uxtw #2]",
9532 "add\t%3, %4, %w3, sxtw #2"
9533 },
9534 /* We assume that DImode is only generated when not optimizing and
9535 that we don't really need 64-bit address offsets. That would
9536 imply an object file with 8GB of code in a single function! */
9537 {
9538 "ldr\t%w3, [%0,%w1,uxtw #2]",
9539 "add\t%3, %4, %w3, sxtw #2"
9540 }
9541 };
9542
9543 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
9544
9545 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
9546 index = exact_log2 (GET_MODE_SIZE (mode));
9547
9548 gcc_assert (index >= 0 && index <= 3);
9549
9550 /* Need to implement table size reduction, by changing the code below. */
9551 output_asm_insn (patterns[index][0], operands);
9552 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
9553 snprintf (buf, sizeof (buf),
9554 "adr\t%%4, %s", targetm.strip_name_encoding (label));
9555 output_asm_insn (buf, operands);
9556 output_asm_insn (patterns[index][1], operands);
9557 output_asm_insn ("br\t%3", operands);
9558 assemble_label (asm_out_file, label);
9559 return "";
9560 }
9561
9562
9563 /* Return size in bits of an arithmetic operand which is shifted/scaled and
9564 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
9565 operator. */
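/* For example, a shift of 2 with mask 0x3fc (0xff << 2) describes a UXTB
   operand scaled by 4, so the function below returns 8; a mask that is
   not a shifted 0xff, 0xffff or 0xffffffff yields 0.  */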
9566
9567 int
9568 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
9569 {
9570 if (shift >= 0 && shift <= 3)
9571 {
9572 int size;
9573 for (size = 8; size <= 32; size *= 2)
9574 {
9575 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
9576 if (mask == bits << shift)
9577 return size;
9578 }
9579 }
9580 return 0;
9581 }
9582
9583 /* Constant pools are per-function only when PC-relative
9584 literal loads are enabled or we are in the large memory
9585 model. */
9586
9587 static inline bool
9588 aarch64_can_use_per_function_literal_pools_p (void)
9589 {
9590 return (aarch64_pcrelative_literal_loads
9591 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
9592 }
9593
9594 static bool
9595 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
9596 {
9597 /* We can't use blocks for constants when we're using a per-function
9598 constant pool. */
9599 return !aarch64_can_use_per_function_literal_pools_p ();
9600 }
9601
9602 /* Select appropriate section for constants depending
9603 on where we place literal pools. */
9604
9605 static section *
9606 aarch64_select_rtx_section (machine_mode mode,
9607 rtx x,
9608 unsigned HOST_WIDE_INT align)
9609 {
9610 if (aarch64_can_use_per_function_literal_pools_p ())
9611 return function_section (current_function_decl);
9612
9613 return default_elf_select_rtx_section (mode, x, align);
9614 }
9615
9616 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
9617 void
9618 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
9619 HOST_WIDE_INT offset)
9620 {
9621 /* When using per-function literal pools, we must ensure that any code
9622 section is aligned to the minimal instruction length, lest we get
9623 errors from the assembler re "unaligned instructions". */
9624 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
9625 ASM_OUTPUT_ALIGN (f, 2);
9626 }
9627
9628 /* Costs. */
9629
9630 /* Helper function for rtx cost calculation. Strip a shift expression
9631 from X. Returns the inner operand if successful, or the original
9632 expression on failure. */
9633 static rtx
9634 aarch64_strip_shift (rtx x)
9635 {
9636 rtx op = x;
9637
9638 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
9639 we can convert both to ROR during final output. */
9640 if ((GET_CODE (op) == ASHIFT
9641 || GET_CODE (op) == ASHIFTRT
9642 || GET_CODE (op) == LSHIFTRT
9643 || GET_CODE (op) == ROTATERT
9644 || GET_CODE (op) == ROTATE)
9645 && CONST_INT_P (XEXP (op, 1)))
9646 return XEXP (op, 0);
9647
9648 if (GET_CODE (op) == MULT
9649 && CONST_INT_P (XEXP (op, 1))
9650 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
9651 return XEXP (op, 0);
9652
9653 return x;
9654 }
9655
9656 /* Helper function for rtx cost calculation. Strip an extend
9657 expression from X. Returns the inner operand if successful, or the
9658 original expression on failure. We deal with a number of possible
9659 canonicalization variations here. If STRIP_SHIFT is true, then
9660 we can strip off a shift also. */
9661 static rtx
9662 aarch64_strip_extend (rtx x, bool strip_shift)
9663 {
9664 scalar_int_mode mode;
9665 rtx op = x;
9666
9667 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
9668 return op;
9669
9670 /* Zero and sign extraction of a widened value. */
9671 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
9672 && XEXP (op, 2) == const0_rtx
9673 && GET_CODE (XEXP (op, 0)) == MULT
9674 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
9675 XEXP (op, 1)))
9676 return XEXP (XEXP (op, 0), 0);
9677
9678 /* It can also be represented (for zero-extend) as an AND with an
9679 immediate. */
9680 if (GET_CODE (op) == AND
9681 && GET_CODE (XEXP (op, 0)) == MULT
9682 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
9683 && CONST_INT_P (XEXP (op, 1))
9684 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
9685 INTVAL (XEXP (op, 1))) != 0)
9686 return XEXP (XEXP (op, 0), 0);
9687
9688 /* Now handle extended register, as this may also have an optional
9689 left shift by 1..4. */
9690 if (strip_shift
9691 && GET_CODE (op) == ASHIFT
9692 && CONST_INT_P (XEXP (op, 1))
9693 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
9694 op = XEXP (op, 0);
9695
9696 if (GET_CODE (op) == ZERO_EXTEND
9697 || GET_CODE (op) == SIGN_EXTEND)
9698 op = XEXP (op, 0);
9699
9700 if (op != x)
9701 return op;
9702
9703 return x;
9704 }
9705
9706 /* Return true iff CODE is a shift supported in combination
9707 with arithmetic instructions. */
9708
9709 static bool
9710 aarch64_shift_p (enum rtx_code code)
9711 {
9712 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
9713 }
9714
9715
9716 /* Return true iff X is a cheap shift without a sign extend. */
9717
9718 static bool
9719 aarch64_cheap_mult_shift_p (rtx x)
9720 {
9721 rtx op0, op1;
9722
9723 op0 = XEXP (x, 0);
9724 op1 = XEXP (x, 1);
9725
9726 if (!(aarch64_tune_params.extra_tuning_flags
9727 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
9728 return false;
9729
9730 if (GET_CODE (op0) == SIGN_EXTEND)
9731 return false;
9732
9733 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
9734 && UINTVAL (op1) <= 4)
9735 return true;
9736
9737 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
9738 return false;
9739
9740 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
9741
9742 if (l2 > 0 && l2 <= 4)
9743 return true;
9744
9745 return false;
9746 }
9747
9748 /* Helper function for rtx cost calculation. Calculate the cost of
9749 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
9750 Return the calculated cost of the expression, recursing manually in to
9751 operands where needed. */
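/* For example, (plus (mult X (const_int 8)) Y) is costed as a single
   arithmetic instruction with a shift-by-immediate operand, i.e. as
   "add x0, x1, x2, lsl 3", rather than as a separate multiply.  */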
9752
9753 static int
9754 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
9755 {
9756 rtx op0, op1;
9757 const struct cpu_cost_table *extra_cost
9758 = aarch64_tune_params.insn_extra_cost;
9759 int cost = 0;
9760 bool compound_p = (outer == PLUS || outer == MINUS);
9761 machine_mode mode = GET_MODE (x);
9762
9763 gcc_checking_assert (code == MULT);
9764
9765 op0 = XEXP (x, 0);
9766 op1 = XEXP (x, 1);
9767
9768 if (VECTOR_MODE_P (mode))
9769 mode = GET_MODE_INNER (mode);
9770
9771 /* Integer multiply/fma. */
9772 if (GET_MODE_CLASS (mode) == MODE_INT)
9773 {
9774 /* The multiply will be canonicalized as a shift, cost it as such. */
9775 if (aarch64_shift_p (GET_CODE (x))
9776 || (CONST_INT_P (op1)
9777 && exact_log2 (INTVAL (op1)) > 0))
9778 {
9779 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
9780 || GET_CODE (op0) == SIGN_EXTEND;
9781 if (speed)
9782 {
9783 if (compound_p)
9784 {
9785 /* If the shift is considered cheap,
9786 then don't add any cost. */
9787 if (aarch64_cheap_mult_shift_p (x))
9788 ;
9789 else if (REG_P (op1))
9790 /* ARITH + shift-by-register. */
9791 cost += extra_cost->alu.arith_shift_reg;
9792 else if (is_extend)
9793 /* ARITH + extended register. We don't have a cost field
9794 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
9795 cost += extra_cost->alu.extend_arith;
9796 else
9797 /* ARITH + shift-by-immediate. */
9798 cost += extra_cost->alu.arith_shift;
9799 }
9800 else
9801 /* LSL (immediate). */
9802 cost += extra_cost->alu.shift;
9803
9804 }
9805 /* Strip extends as we will have costed them in the case above. */
9806 if (is_extend)
9807 op0 = aarch64_strip_extend (op0, true);
9808
9809 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
9810
9811 return cost;
9812 }
9813
9814 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
9815 compound and let the below cases handle it. After all, MNEG is a
9816 special-case alias of MSUB. */
9817 if (GET_CODE (op0) == NEG)
9818 {
9819 op0 = XEXP (op0, 0);
9820 compound_p = true;
9821 }
9822
9823 /* Integer multiplies or FMAs have zero/sign extending variants. */
9824 if ((GET_CODE (op0) == ZERO_EXTEND
9825 && GET_CODE (op1) == ZERO_EXTEND)
9826 || (GET_CODE (op0) == SIGN_EXTEND
9827 && GET_CODE (op1) == SIGN_EXTEND))
9828 {
9829 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
9830 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
9831
9832 if (speed)
9833 {
9834 if (compound_p)
9835 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
9836 cost += extra_cost->mult[0].extend_add;
9837 else
9838 /* MUL/SMULL/UMULL. */
9839 cost += extra_cost->mult[0].extend;
9840 }
9841
9842 return cost;
9843 }
9844
9845 /* This is either an integer multiply or a MADD. In both cases
9846 we want to recurse and cost the operands. */
9847 cost += rtx_cost (op0, mode, MULT, 0, speed);
9848 cost += rtx_cost (op1, mode, MULT, 1, speed);
9849
9850 if (speed)
9851 {
9852 if (compound_p)
9853 /* MADD/MSUB. */
9854 cost += extra_cost->mult[mode == DImode].add;
9855 else
9856 /* MUL. */
9857 cost += extra_cost->mult[mode == DImode].simple;
9858 }
9859
9860 return cost;
9861 }
9862 else
9863 {
9864 if (speed)
9865 {
9866 /* Floating-point FMA/FMUL can also support negations of the
9867 operands, unless the rounding mode is upward or downward in
9868 which case FNMUL is different from FMUL with operand negation. */
9869 bool neg0 = GET_CODE (op0) == NEG;
9870 bool neg1 = GET_CODE (op1) == NEG;
9871 if (compound_p || !flag_rounding_math || (neg0 && neg1))
9872 {
9873 if (neg0)
9874 op0 = XEXP (op0, 0);
9875 if (neg1)
9876 op1 = XEXP (op1, 0);
9877 }
9878
9879 if (compound_p)
9880 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9881 cost += extra_cost->fp[mode == DFmode].fma;
9882 else
9883 /* FMUL/FNMUL. */
9884 cost += extra_cost->fp[mode == DFmode].mult;
9885 }
9886
9887 cost += rtx_cost (op0, mode, MULT, 0, speed);
9888 cost += rtx_cost (op1, mode, MULT, 1, speed);
9889 return cost;
9890 }
9891 }
9892
9893 static int
9894 aarch64_address_cost (rtx x,
9895 machine_mode mode,
9896 addr_space_t as ATTRIBUTE_UNUSED,
9897 bool speed)
9898 {
9899 enum rtx_code c = GET_CODE (x);
9900 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
9901 struct aarch64_address_info info;
9902 int cost = 0;
9903 info.shift = 0;
9904
9905 if (!aarch64_classify_address (&info, x, mode, false))
9906 {
9907 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
9908 {
9909 /* This is a CONST or SYMBOL ref which will be split
9910 in a different way depending on the code model in use.
9911 Cost it through the generic infrastructure. */
9912 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
9913 /* Divide through by the cost of one instruction to
9914 bring it to the same units as the address costs. */
9915 cost_symbol_ref /= COSTS_N_INSNS (1);
9916 /* The cost is then the cost of preparing the address,
9917 followed by an immediate (possibly 0) offset. */
9918 return cost_symbol_ref + addr_cost->imm_offset;
9919 }
9920 else
9921 {
9922 /* This is most likely a jump table from a case
9923 statement. */
9924 return addr_cost->register_offset;
9925 }
9926 }
9927
9928 switch (info.type)
9929 {
9930 case ADDRESS_LO_SUM:
9931 case ADDRESS_SYMBOLIC:
9932 case ADDRESS_REG_IMM:
9933 cost += addr_cost->imm_offset;
9934 break;
9935
9936 case ADDRESS_REG_WB:
9937 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9938 cost += addr_cost->pre_modify;
9939 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9940 cost += addr_cost->post_modify;
9941 else
9942 gcc_unreachable ();
9943
9944 break;
9945
9946 case ADDRESS_REG_REG:
9947 cost += addr_cost->register_offset;
9948 break;
9949
9950 case ADDRESS_REG_SXTW:
9951 cost += addr_cost->register_sextend;
9952 break;
9953
9954 case ADDRESS_REG_UXTW:
9955 cost += addr_cost->register_zextend;
9956 break;
9957
9958 default:
9959 gcc_unreachable ();
9960 }
9961
9962
9963 if (info.shift > 0)
9964 {
9965 /* For the sake of calculating the cost of the shifted register
9966 component, we can treat same sized modes in the same way. */
9967 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9968 cost += addr_cost->addr_scale_costs.hi;
9969 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9970 cost += addr_cost->addr_scale_costs.si;
9971 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9972 cost += addr_cost->addr_scale_costs.di;
9973 else
9974 /* We can't tell, or this is a 128-bit vector. */
9975 cost += addr_cost->addr_scale_costs.ti;
9976 }
9977
9978 return cost;
9979 }
9980
9981 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9982 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9983 to be taken. */
9984
9985 int
9986 aarch64_branch_cost (bool speed_p, bool predictable_p)
9987 {
9988 /* When optimizing for speed, use the cost of unpredictable branches. */
9989 const struct cpu_branch_cost *branch_costs =
9990 aarch64_tune_params.branch_costs;
9991
9992 if (!speed_p || predictable_p)
9993 return branch_costs->predictable;
9994 else
9995 return branch_costs->unpredictable;
9996 }
9997
9998 /* Return true if the RTX X in mode MODE is a zero or sign extract
9999 usable in an ADD or SUB (extended register) instruction. */
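/* For example, (sign_extend:DI (reg:SI 1)) is such an operand; it maps
   onto the sign-extending register form "add x0, x2, w1, sxtw".  */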
10000 static bool
10001 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
10002 {
10003 /* Catch add with a sign extract.
10004 This is add_<optab><mode>_multp2. */
10005 if (GET_CODE (x) == SIGN_EXTRACT
10006 || GET_CODE (x) == ZERO_EXTRACT)
10007 {
10008 rtx op0 = XEXP (x, 0);
10009 rtx op1 = XEXP (x, 1);
10010 rtx op2 = XEXP (x, 2);
10011
10012 if (GET_CODE (op0) == MULT
10013 && CONST_INT_P (op1)
10014 && op2 == const0_rtx
10015 && CONST_INT_P (XEXP (op0, 1))
10016 && aarch64_is_extend_from_extract (mode,
10017 XEXP (op0, 1),
10018 op1))
10019 {
10020 return true;
10021 }
10022 }
10023 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
10024 No shift. */
10025 else if (GET_CODE (x) == SIGN_EXTEND
10026 || GET_CODE (x) == ZERO_EXTEND)
10027 return REG_P (XEXP (x, 0));
10028
10029 return false;
10030 }
10031
10032 static bool
10033 aarch64_frint_unspec_p (unsigned int u)
10034 {
10035 switch (u)
10036 {
10037 case UNSPEC_FRINTZ:
10038 case UNSPEC_FRINTP:
10039 case UNSPEC_FRINTM:
10040 case UNSPEC_FRINTA:
10041 case UNSPEC_FRINTN:
10042 case UNSPEC_FRINTX:
10043 case UNSPEC_FRINTI:
10044 return true;
10045
10046 default:
10047 return false;
10048 }
10049 }
10050
10051 /* Return true iff X is an rtx that will match an extr instruction
10052 i.e. as described in the *extr<mode>5_insn family of patterns.
10053 OP0 and OP1 will be set to the operands of the shifts involved
10054 on success and will be NULL_RTX otherwise. */
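/* For example, in DImode (ior (ashift X 16) (lshiftrt Y 48)) matches,
   since the two shift amounts sum to 64; it can be emitted as a single
   EXTR with an extract amount of 48.  */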
10055
10056 static bool
10057 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
10058 {
10059 rtx op0, op1;
10060 scalar_int_mode mode;
10061 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
10062 return false;
10063
10064 *res_op0 = NULL_RTX;
10065 *res_op1 = NULL_RTX;
10066
10067 if (GET_CODE (x) != IOR)
10068 return false;
10069
10070 op0 = XEXP (x, 0);
10071 op1 = XEXP (x, 1);
10072
10073 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
10074 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
10075 {
10076 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
10077 if (GET_CODE (op1) == ASHIFT)
10078 std::swap (op0, op1);
10079
10080 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
10081 return false;
10082
10083 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
10084 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
10085
10086 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
10087 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
10088 {
10089 *res_op0 = XEXP (op0, 0);
10090 *res_op1 = XEXP (op1, 0);
10091 return true;
10092 }
10093 }
10094
10095 return false;
10096 }
10097
10098 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
10099 storing it in *COST. Result is true if the total cost of the operation
10100 has now been calculated. */
10101 static bool
10102 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
10103 {
10104 rtx inner;
10105 rtx comparator;
10106 enum rtx_code cmpcode;
10107
10108 if (COMPARISON_P (op0))
10109 {
10110 inner = XEXP (op0, 0);
10111 comparator = XEXP (op0, 1);
10112 cmpcode = GET_CODE (op0);
10113 }
10114 else
10115 {
10116 inner = op0;
10117 comparator = const0_rtx;
10118 cmpcode = NE;
10119 }
10120
10121 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
10122 {
10123 /* Conditional branch. */
10124 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10125 return true;
10126 else
10127 {
10128 if (cmpcode == NE || cmpcode == EQ)
10129 {
10130 if (comparator == const0_rtx)
10131 {
10132 /* TBZ/TBNZ/CBZ/CBNZ. */
10133 if (GET_CODE (inner) == ZERO_EXTRACT)
10134 /* TBZ/TBNZ. */
10135 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
10136 ZERO_EXTRACT, 0, speed);
10137 else
10138 /* CBZ/CBNZ. */
10139 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
10140
10141 return true;
10142 }
10143 }
10144 else if (cmpcode == LT || cmpcode == GE)
10145 {
10146 /* TBZ/TBNZ. */
10147 if (comparator == const0_rtx)
10148 return true;
10149 }
10150 }
10151 }
10152 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10153 {
10154 /* CCMP. */
10155 if (GET_CODE (op1) == COMPARE)
10156 {
10157 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
10158 if (XEXP (op1, 1) == const0_rtx)
10159 *cost += 1;
10160 if (speed)
10161 {
10162 machine_mode mode = GET_MODE (XEXP (op1, 0));
10163 const struct cpu_cost_table *extra_cost
10164 = aarch64_tune_params.insn_extra_cost;
10165
10166 if (GET_MODE_CLASS (mode) == MODE_INT)
10167 *cost += extra_cost->alu.arith;
10168 else
10169 *cost += extra_cost->fp[mode == DFmode].compare;
10170 }
10171 return true;
10172 }
10173
10174 /* It's a conditional operation based on the status flags,
10175 so it must be some flavor of CSEL. */
10176
10177 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
10178 if (GET_CODE (op1) == NEG
10179 || GET_CODE (op1) == NOT
10180 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
10181 op1 = XEXP (op1, 0);
10182 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
10183 {
10184 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
10185 op1 = XEXP (op1, 0);
10186 op2 = XEXP (op2, 0);
10187 }
10188
10189 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
10190 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
10191 return true;
10192 }
10193
10194 /* We don't know what this is, cost all operands. */
10195 return false;
10196 }
10197
10198 /* Check whether X is a bitfield operation of the form shift + extend that
10199 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
10200 operand to which the bitfield operation is applied. Otherwise return
10201 NULL_RTX. */
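/* For example, (zero_extend:SI (lshiftrt:HI (reg:HI 1) (const_int 3)))
   is such a pattern: the shift and extension together form a single
   unsigned bitfield extract of register 1.  */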
10202
10203 static rtx
10204 aarch64_extend_bitfield_pattern_p (rtx x)
10205 {
10206 rtx_code outer_code = GET_CODE (x);
10207 machine_mode outer_mode = GET_MODE (x);
10208
10209 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
10210 && outer_mode != SImode && outer_mode != DImode)
10211 return NULL_RTX;
10212
10213 rtx inner = XEXP (x, 0);
10214 rtx_code inner_code = GET_CODE (inner);
10215 machine_mode inner_mode = GET_MODE (inner);
10216 rtx op = NULL_RTX;
10217
10218 switch (inner_code)
10219 {
10220 case ASHIFT:
10221 if (CONST_INT_P (XEXP (inner, 1))
10222 && (inner_mode == QImode || inner_mode == HImode))
10223 op = XEXP (inner, 0);
10224 break;
10225 case LSHIFTRT:
10226 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
10227 && (inner_mode == QImode || inner_mode == HImode))
10228 op = XEXP (inner, 0);
10229 break;
10230 case ASHIFTRT:
10231 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
10232 && (inner_mode == QImode || inner_mode == HImode))
10233 op = XEXP (inner, 0);
10234 break;
10235 default:
10236 break;
10237 }
10238
10239 return op;
10240 }
10241
10242 /* Return true if the mask and a shift amount from an RTX of the form
10243 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
10244 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
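/* For example, in SImode a shift amount of 8 with mask 0x00ffff00 is
   accepted: the mask shifted right by 8 is the contiguous value 0xffff,
   and no mask bits are set below the shift amount.  */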
10245
10246 bool
10247 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
10248 rtx shft_amnt)
10249 {
10250 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
10251 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
10252 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
10253 && (INTVAL (mask)
10254 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
10255 }
10256
10257 /* Return true if the masks and a shift amount from an RTX of the form
10258 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
10259 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
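/* For example, with a shift amount of 8 and mask2 = 0xff00, mask1 must
   be the bitwise complement ~0xff00; mask2 + (1 << 8) = 0x10000 is a
   power of two, so the inserted field is contiguous and starts at
   bit 8.  */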
10260
10261 bool
10262 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
10263 unsigned HOST_WIDE_INT mask1,
10264 unsigned HOST_WIDE_INT shft_amnt,
10265 unsigned HOST_WIDE_INT mask2)
10266 {
10267 unsigned HOST_WIDE_INT t;
10268
10269 /* Verify that there is no overlap in what bits are set in the two masks. */
10270 if (mask1 != ~mask2)
10271 return false;
10272
10273 /* Verify that mask2 is not all zeros or ones. */
10274 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
10275 return false;
10276
10277 /* The shift amount should always be less than the mode size. */
10278 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
10279
10280 /* Verify that the mask being shifted is contiguous and would be in the
10281 least significant bits after shifting by shft_amnt. */
10282 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
10283 return (t == (t & -t));
10284 }
10285
10286 /* Calculate the cost of calculating X, storing it in *COST. Result
10287 is true if the total cost of the operation has now been calculated. */
10288 static bool
10289 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
10290 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
10291 {
10292 rtx op0, op1, op2;
10293 const struct cpu_cost_table *extra_cost
10294 = aarch64_tune_params.insn_extra_cost;
10295 int code = GET_CODE (x);
10296 scalar_int_mode int_mode;
10297
10298 /* By default, assume that everything has equivalent cost to the
10299 cheapest instruction. Any additional costs are applied as a delta
10300 above this default. */
10301 *cost = COSTS_N_INSNS (1);
10302
10303 switch (code)
10304 {
10305 case SET:
10306 /* The cost depends entirely on the operands to SET. */
10307 *cost = 0;
10308 op0 = SET_DEST (x);
10309 op1 = SET_SRC (x);
10310
10311 switch (GET_CODE (op0))
10312 {
10313 case MEM:
10314 if (speed)
10315 {
10316 rtx address = XEXP (op0, 0);
10317 if (VECTOR_MODE_P (mode))
10318 *cost += extra_cost->ldst.storev;
10319 else if (GET_MODE_CLASS (mode) == MODE_INT)
10320 *cost += extra_cost->ldst.store;
10321 else if (mode == SFmode)
10322 *cost += extra_cost->ldst.storef;
10323 else if (mode == DFmode)
10324 *cost += extra_cost->ldst.stored;
10325
10326 *cost +=
10327 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10328 0, speed));
10329 }
10330
10331 *cost += rtx_cost (op1, mode, SET, 1, speed);
10332 return true;
10333
10334 case SUBREG:
10335 if (! REG_P (SUBREG_REG (op0)))
10336 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
10337
10338 /* Fall through. */
10339 case REG:
10340 /* The cost is one per vector-register copied. */
10341 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
10342 {
10343 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
10344 *cost = COSTS_N_INSNS (nregs);
10345 }
10346 /* const0_rtx is in general free, but we will use an
10347 instruction to set a register to 0. */
10348 else if (REG_P (op1) || op1 == const0_rtx)
10349 {
10350 /* The cost is 1 per register copied. */
10351 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
10352 *cost = COSTS_N_INSNS (nregs);
10353 }
10354 else
10355 /* Cost is just the cost of the RHS of the set. */
10356 *cost += rtx_cost (op1, mode, SET, 1, speed);
10357 return true;
10358
10359 case ZERO_EXTRACT:
10360 case SIGN_EXTRACT:
10361 /* Bit-field insertion. Strip any redundant widening of
10362 the RHS to meet the width of the target. */
10363 if (GET_CODE (op1) == SUBREG)
10364 op1 = SUBREG_REG (op1);
10365 if ((GET_CODE (op1) == ZERO_EXTEND
10366 || GET_CODE (op1) == SIGN_EXTEND)
10367 && CONST_INT_P (XEXP (op0, 1))
10368 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
10369 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
10370 op1 = XEXP (op1, 0);
10371
10372 if (CONST_INT_P (op1))
10373 {
10374 /* MOV immediate is assumed to always be cheap. */
10375 *cost = COSTS_N_INSNS (1);
10376 }
10377 else
10378 {
10379 /* BFM. */
10380 if (speed)
10381 *cost += extra_cost->alu.bfi;
10382 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
10383 }
10384
10385 return true;
10386
10387 default:
10388 /* We can't make sense of this, assume default cost. */
10389 *cost = COSTS_N_INSNS (1);
10390 return false;
10391 }
10392 return false;
10393
10394 case CONST_INT:
10395 /* If an instruction can incorporate a constant within the
10396 instruction, the instruction's expression avoids calling
10397 rtx_cost() on the constant. If rtx_cost() is called on a
10398 constant, then it is usually because the constant must be
10399 moved into a register by one or more instructions.
10400
10401 The exception is constant 0, which can be expressed
10402 as XZR/WZR and is therefore free. The exception to this is
10403 if we have (set (reg) (const0_rtx)) in which case we must cost
10404 the move. However, we can catch that when we cost the SET, so
10405 we don't need to consider that here. */
10406 if (x == const0_rtx)
10407 *cost = 0;
10408 else
10409 {
10410 /* To an approximation, building any other constant is
10411 proportionally expensive to the number of instructions
10412 required to build that constant. This is true whether we
10413 are compiling for SPEED or otherwise. */
10414 if (!is_a <scalar_int_mode> (mode, &int_mode))
10415 int_mode = word_mode;
10416 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
10417 (NULL_RTX, x, false, int_mode));
10418 }
10419 return true;
10420
10421 case CONST_DOUBLE:
10422
10423 /* First determine number of instructions to do the move
10424 as an integer constant. */
10425 if (!aarch64_float_const_representable_p (x)
10426 && !aarch64_can_const_movi_rtx_p (x, mode)
10427 && aarch64_float_const_rtx_p (x))
10428 {
10429 unsigned HOST_WIDE_INT ival;
10430 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
10431 gcc_assert (succeed);
10432
10433 scalar_int_mode imode = (mode == HFmode
10434 ? SImode
10435 : int_mode_for_mode (mode).require ());
10436 int ncost = aarch64_internal_mov_immediate
10437 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
10438 *cost += COSTS_N_INSNS (ncost);
10439 return true;
10440 }
10441
10442 if (speed)
10443 {
10444 /* mov[df,sf]_aarch64. */
10445 if (aarch64_float_const_representable_p (x))
10446 /* FMOV (scalar immediate). */
10447 *cost += extra_cost->fp[mode == DFmode].fpconst;
10448 else if (!aarch64_float_const_zero_rtx_p (x))
10449 {
10450 /* This will be a load from memory. */
10451 if (mode == DFmode)
10452 *cost += extra_cost->ldst.loadd;
10453 else
10454 *cost += extra_cost->ldst.loadf;
10455 }
10456 else
10457 /* Otherwise this is +0.0. We get this using MOVI d0, #0
10458 or MOV v0.s[0], wzr - neither of which are modeled by the
10459 cost tables. Just use the default cost. */
10460 {
10461 }
10462 }
10463
10464 return true;
10465
10466 case MEM:
10467 if (speed)
10468 {
10469 /* For loads we want the base cost of a load, plus an
10470 approximation for the additional cost of the addressing
10471 mode. */
10472 rtx address = XEXP (x, 0);
10473 if (VECTOR_MODE_P (mode))
10474 *cost += extra_cost->ldst.loadv;
10475 else if (GET_MODE_CLASS (mode) == MODE_INT)
10476 *cost += extra_cost->ldst.load;
10477 else if (mode == SFmode)
10478 *cost += extra_cost->ldst.loadf;
10479 else if (mode == DFmode)
10480 *cost += extra_cost->ldst.loadd;
10481
10482 *cost +=
10483 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10484 0, speed));
10485 }
10486
10487 return true;
10488
10489 case NEG:
10490 op0 = XEXP (x, 0);
10491
10492 if (VECTOR_MODE_P (mode))
10493 {
10494 if (speed)
10495 {
10496 /* FNEG. */
10497 *cost += extra_cost->vect.alu;
10498 }
10499 return false;
10500 }
10501
10502 if (GET_MODE_CLASS (mode) == MODE_INT)
10503 {
10504 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10505 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10506 {
10507 /* CSETM. */
10508 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
10509 return true;
10510 }
10511
10512 /* Cost this as SUB wzr, X. */
10513 op0 = CONST0_RTX (mode);
10514 op1 = XEXP (x, 0);
10515 goto cost_minus;
10516 }
10517
10518 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10519 {
10520 /* Support (neg(fma...)) as a single instruction only if
10521 sign of zeros is unimportant. This matches the decision
10522 making in aarch64.md. */
10523 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
10524 {
10525 /* FNMADD. */
10526 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10527 return true;
10528 }
10529 if (GET_CODE (op0) == MULT)
10530 {
10531 /* FNMUL. */
10532 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10533 return true;
10534 }
10535 if (speed)
10536 /* FNEG. */
10537 *cost += extra_cost->fp[mode == DFmode].neg;
10538 return false;
10539 }
10540
10541 return false;
10542
10543 case CLRSB:
10544 case CLZ:
10545 if (speed)
10546 {
10547 if (VECTOR_MODE_P (mode))
10548 *cost += extra_cost->vect.alu;
10549 else
10550 *cost += extra_cost->alu.clz;
10551 }
10552
10553 return false;
10554
10555 case COMPARE:
10556 op0 = XEXP (x, 0);
10557 op1 = XEXP (x, 1);
10558
10559 if (op1 == const0_rtx
10560 && GET_CODE (op0) == AND)
10561 {
10562 x = op0;
10563 mode = GET_MODE (op0);
10564 goto cost_logic;
10565 }
10566
10567 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
10568 {
10569 /* TODO: A write to the CC flags possibly costs extra; this
10570 needs encoding in the cost tables. */
10571
10572 mode = GET_MODE (op0);
10573 /* ANDS. */
10574 if (GET_CODE (op0) == AND)
10575 {
10576 x = op0;
10577 goto cost_logic;
10578 }
10579
10580 if (GET_CODE (op0) == PLUS)
10581 {
10582 /* ADDS (and CMN alias). */
10583 x = op0;
10584 goto cost_plus;
10585 }
10586
10587 if (GET_CODE (op0) == MINUS)
10588 {
10589 /* SUBS. */
10590 x = op0;
10591 goto cost_minus;
10592 }
10593
10594 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
10595 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
10596 && CONST_INT_P (XEXP (op0, 2)))
10597 {
10598 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
10599 Handle it here directly rather than going to cost_logic
10600 since we know the immediate generated for the TST is valid
10601 so we can avoid creating an intermediate rtx for it only
10602 for costing purposes. */
10603 if (speed)
10604 *cost += extra_cost->alu.logical;
10605
10606 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
10607 ZERO_EXTRACT, 0, speed);
10608 return true;
10609 }
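          /* For example, the block just above would see a compare such as

               (compare:CC_NZ (zero_extract:DI (reg:DI x0)
                                               (const_int 8)
                                               (const_int 0))
                              (const_int 0))

             and cost it as something like "tst x0, #0xff" (register and
             mask chosen purely for illustration): one logical op plus the
             cost of the extracted register.  */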
10610
10611 if (GET_CODE (op1) == NEG)
10612 {
10613 /* CMN. */
10614 if (speed)
10615 *cost += extra_cost->alu.arith;
10616
10617 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
10618 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
10619 return true;
10620 }
10621
10622 /* CMP.
10623
10624 Compare can freely swap the order of operands, and
10625 canonicalization puts the more complex operation first.
10626 But the integer MINUS logic expects the shift/extend
10627 operation in op1. */
10628 if (! (REG_P (op0)
10629 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
10630 {
10631 op0 = XEXP (x, 1);
10632 op1 = XEXP (x, 0);
10633 }
10634 goto cost_minus;
10635 }
10636
10637 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
10638 {
10639 /* FCMP. */
10640 if (speed)
10641 *cost += extra_cost->fp[mode == DFmode].compare;
10642
10643 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
10644 {
10645 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
10646 /* FCMP supports constant 0.0 for no extra cost. */
10647 return true;
10648 }
10649 return false;
10650 }
10651
10652 if (VECTOR_MODE_P (mode))
10653 {
10654 /* Vector compare. */
10655 if (speed)
10656 *cost += extra_cost->vect.alu;
10657
10658 if (aarch64_float_const_zero_rtx_p (op1))
10659 {
10660 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
10661 cost. */
10662 return true;
10663 }
10664 return false;
10665 }
10666 return false;
10667
10668 case MINUS:
10669 {
10670 op0 = XEXP (x, 0);
10671 op1 = XEXP (x, 1);
10672
10673 cost_minus:
10674 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
10675
10676 /* Detect valid immediates. */
10677 if ((GET_MODE_CLASS (mode) == MODE_INT
10678 || (GET_MODE_CLASS (mode) == MODE_CC
10679 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
10680 && CONST_INT_P (op1)
10681 && aarch64_uimm12_shift (INTVAL (op1)))
10682 {
10683 if (speed)
10684 /* SUB(S) (immediate). */
10685 *cost += extra_cost->alu.arith;
10686 return true;
10687 }
10688
10689 /* Look for SUB (extended register). */
10690 if (is_a <scalar_int_mode> (mode, &int_mode)
10691 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
10692 {
10693 if (speed)
10694 *cost += extra_cost->alu.extend_arith;
10695
10696 op1 = aarch64_strip_extend (op1, true);
10697 *cost += rtx_cost (op1, VOIDmode,
10698 (enum rtx_code) GET_CODE (op1), 0, speed);
10699 return true;
10700 }
10701
10702 rtx new_op1 = aarch64_strip_extend (op1, false);
10703
10704 /* Cost this as an FMA-alike operation. */
10705 if ((GET_CODE (new_op1) == MULT
10706 || aarch64_shift_p (GET_CODE (new_op1)))
10707 && code != COMPARE)
10708 {
10709 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
10710 (enum rtx_code) code,
10711 speed);
10712 return true;
10713 }
10714
10715 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
10716
10717 if (speed)
10718 {
10719 if (VECTOR_MODE_P (mode))
10720 {
10721 /* Vector SUB. */
10722 *cost += extra_cost->vect.alu;
10723 }
10724 else if (GET_MODE_CLASS (mode) == MODE_INT)
10725 {
10726 /* SUB(S). */
10727 *cost += extra_cost->alu.arith;
10728 }
10729 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10730 {
10731 /* FSUB. */
10732 *cost += extra_cost->fp[mode == DFmode].addsub;
10733 }
10734 }
10735 return true;
10736 }
10737
10738 case PLUS:
10739 {
10740 rtx new_op0;
10741
10742 op0 = XEXP (x, 0);
10743 op1 = XEXP (x, 1);
10744
10745 cost_plus:
10746 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10747 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10748 {
10749 /* CSINC. */
10750 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
10751 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10752 return true;
10753 }
10754
10755 if (GET_MODE_CLASS (mode) == MODE_INT
10756 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
10757 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
10758 {
10759 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
10760
10761 if (speed)
10762 /* ADD (immediate). */
10763 *cost += extra_cost->alu.arith;
10764 return true;
10765 }
10766
10767 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10768
10769 /* Look for ADD (extended register). */
10770 if (is_a <scalar_int_mode> (mode, &int_mode)
10771 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
10772 {
10773 if (speed)
10774 *cost += extra_cost->alu.extend_arith;
10775
10776 op0 = aarch64_strip_extend (op0, true);
10777 *cost += rtx_cost (op0, VOIDmode,
10778 (enum rtx_code) GET_CODE (op0), 0, speed);
10779 return true;
10780 }
10781
10782 /* Strip any extend, leave shifts behind as we will
10783 cost them through mult_cost. */
10784 new_op0 = aarch64_strip_extend (op0, false);
10785
10786 if (GET_CODE (new_op0) == MULT
10787 || aarch64_shift_p (GET_CODE (new_op0)))
10788 {
10789 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
10790 speed);
10791 return true;
10792 }
10793
10794 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
10795
10796 if (speed)
10797 {
10798 if (VECTOR_MODE_P (mode))
10799 {
10800 /* Vector ADD. */
10801 *cost += extra_cost->vect.alu;
10802 }
10803 else if (GET_MODE_CLASS (mode) == MODE_INT)
10804 {
10805 /* ADD. */
10806 *cost += extra_cost->alu.arith;
10807 }
10808 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10809 {
10810 /* FADD. */
10811 *cost += extra_cost->fp[mode == DFmode].addsub;
10812 }
10813 }
10814 return true;
10815 }
10816
10817 case BSWAP:
10818 *cost = COSTS_N_INSNS (1);
10819
10820 if (speed)
10821 {
10822 if (VECTOR_MODE_P (mode))
10823 *cost += extra_cost->vect.alu;
10824 else
10825 *cost += extra_cost->alu.rev;
10826 }
10827 return false;
10828
10829 case IOR:
10830 if (aarch_rev16_p (x))
10831 {
10832 *cost = COSTS_N_INSNS (1);
10833
10834 if (speed)
10835 {
10836 if (VECTOR_MODE_P (mode))
10837 *cost += extra_cost->vect.alu;
10838 else
10839 *cost += extra_cost->alu.rev;
10840 }
10841 return true;
10842 }
10843
10844 if (aarch64_extr_rtx_p (x, &op0, &op1))
10845 {
10846 *cost += rtx_cost (op0, mode, IOR, 0, speed);
10847 *cost += rtx_cost (op1, mode, IOR, 1, speed);
10848 if (speed)
10849 *cost += extra_cost->alu.shift;
10850
10851 return true;
10852 }
10853 /* Fall through. */
10854 case XOR:
10855 case AND:
10856 cost_logic:
10857 op0 = XEXP (x, 0);
10858 op1 = XEXP (x, 1);
10859
10860 if (VECTOR_MODE_P (mode))
10861 {
10862 if (speed)
10863 *cost += extra_cost->vect.alu;
10864 return true;
10865 }
10866
10867 if (code == AND
10868 && GET_CODE (op0) == MULT
10869 && CONST_INT_P (XEXP (op0, 1))
10870 && CONST_INT_P (op1)
10871 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
10872 INTVAL (op1)) != 0)
10873 {
10874 /* This is a UBFM/SBFM. */
10875 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
10876 if (speed)
10877 *cost += extra_cost->alu.bfx;
10878 return true;
10879 }
10880
10881 if (is_int_mode (mode, &int_mode))
10882 {
10883 if (CONST_INT_P (op1))
10884 {
10885 /* We have a mask + shift version of a UBFIZ
10886 i.e. the *andim_ashift<mode>_bfiz pattern. */
10887 if (GET_CODE (op0) == ASHIFT
10888 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
10889 XEXP (op0, 1)))
10890 {
10891 *cost += rtx_cost (XEXP (op0, 0), int_mode,
10892 (enum rtx_code) code, 0, speed);
10893 if (speed)
10894 *cost += extra_cost->alu.bfx;
10895
10896 return true;
10897 }
10898 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
10899 {
10900 /* We may get the immediate for free; this is not
10901 modelled. */
10902 *cost += rtx_cost (op0, int_mode,
10903 (enum rtx_code) code, 0, speed);
10904 if (speed)
10905 *cost += extra_cost->alu.logical;
10906
10907 return true;
10908 }
10909 }
10910 else
10911 {
10912 rtx new_op0 = op0;
10913
10914 /* Handle ORN, EON, or BIC. */
10915 if (GET_CODE (op0) == NOT)
10916 op0 = XEXP (op0, 0);
10917
10918 new_op0 = aarch64_strip_shift (op0);
10919
10920 /* If we had a shift on op0 then this is a logical-shift-
10921 by-register/immediate operation. Otherwise, this is just
10922 a logical operation. */
10923 if (speed)
10924 {
10925 if (new_op0 != op0)
10926 {
10927 /* Shift by immediate. */
10928 if (CONST_INT_P (XEXP (op0, 1)))
10929 *cost += extra_cost->alu.log_shift;
10930 else
10931 *cost += extra_cost->alu.log_shift_reg;
10932 }
10933 else
10934 *cost += extra_cost->alu.logical;
10935 }
10936
10937 /* In both cases we want to cost both operands. */
10938 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
10939 0, speed);
10940 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
10941 1, speed);
10942
10943 return true;
10944 }
10945 }
10946 return false;
10947
10948 case NOT:
10949 x = XEXP (x, 0);
10950 op0 = aarch64_strip_shift (x);
10951
10952 if (VECTOR_MODE_P (mode))
10953 {
10954 /* Vector NOT. */
10955 *cost += extra_cost->vect.alu;
10956 return false;
10957 }
10958
10959 /* MVN-shifted-reg. */
10960 if (op0 != x)
10961 {
10962 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10963
10964 if (speed)
10965 *cost += extra_cost->alu.log_shift;
10966
10967 return true;
10968 }
10969 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10970 Handle the second form here taking care that 'a' in the above can
10971 be a shift. */
10972 else if (GET_CODE (op0) == XOR)
10973 {
10974 rtx newop0 = XEXP (op0, 0);
10975 rtx newop1 = XEXP (op0, 1);
10976 rtx op0_stripped = aarch64_strip_shift (newop0);
10977
10978 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10979 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10980
10981 if (speed)
10982 {
10983 if (op0_stripped != newop0)
10984 *cost += extra_cost->alu.log_shift;
10985 else
10986 *cost += extra_cost->alu.logical;
10987 }
10988
10989 return true;
10990 }
10991 /* MVN. */
10992 if (speed)
10993 *cost += extra_cost->alu.logical;
10994
10995 return false;
10996
10997 case ZERO_EXTEND:
10998
10999 op0 = XEXP (x, 0);
11000 /* If a value is written in SI mode, then zero extended to DI
11001 mode, the operation will in general be free as a write to
11002 a 'w' register implicitly zeroes the upper bits of an 'x'
11003 register. However, if this is
11004
11005 (set (reg) (zero_extend (reg)))
11006
11007 we must cost the explicit register move. */
11008 if (mode == DImode
11009 && GET_MODE (op0) == SImode
11010 && outer == SET)
11011 {
11012 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
11013
11014 /* If OP_COST is non-zero, then the cost of the zero extend
11015 is effectively the cost of the inner operation. Otherwise
11016 we have a MOV instruction and we take the cost from the MOV
11017 itself. This is true independently of whether we are
11018 optimizing for space or time. */
11019 if (op_cost)
11020 *cost = op_cost;
11021
11022 return true;
11023 }
11024 else if (MEM_P (op0))
11025 {
11026 /* All loads can zero extend to any size for free. */
11027 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
11028 return true;
11029 }
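      /* For example (illustrative only), for

           uint64_t f (uint32_t *p) { return *p; }

         the zero extension is folded into the load itself (an
         "ldr w0, [x0]" already clears bits 63:32), so no separate extend
         is costed.  */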
11030
11031 op0 = aarch64_extend_bitfield_pattern_p (x);
11032 if (op0)
11033 {
11034 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
11035 if (speed)
11036 *cost += extra_cost->alu.bfx;
11037 return true;
11038 }
11039
11040 if (speed)
11041 {
11042 if (VECTOR_MODE_P (mode))
11043 {
11044 /* UMOV. */
11045 *cost += extra_cost->vect.alu;
11046 }
11047 else
11048 {
11049 /* We generate an AND instead of UXTB/UXTH. */
11050 *cost += extra_cost->alu.logical;
11051 }
11052 }
11053 return false;
11054
11055 case SIGN_EXTEND:
11056 if (MEM_P (XEXP (x, 0)))
11057 {
11058 /* LDRSH. */
11059 if (speed)
11060 {
11061 rtx address = XEXP (XEXP (x, 0), 0);
11062 *cost += extra_cost->ldst.load_sign_extend;
11063
11064 *cost +=
11065 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11066 0, speed));
11067 }
11068 return true;
11069 }
11070
11071 op0 = aarch64_extend_bitfield_pattern_p (x);
11072 if (op0)
11073 {
11074 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
11075 if (speed)
11076 *cost += extra_cost->alu.bfx;
11077 return true;
11078 }
11079
11080 if (speed)
11081 {
11082 if (VECTOR_MODE_P (mode))
11083 *cost += extra_cost->vect.alu;
11084 else
11085 *cost += extra_cost->alu.extend;
11086 }
11087 return false;
11088
11089 case ASHIFT:
11090 op0 = XEXP (x, 0);
11091 op1 = XEXP (x, 1);
11092
11093 if (CONST_INT_P (op1))
11094 {
11095 if (speed)
11096 {
11097 if (VECTOR_MODE_P (mode))
11098 {
11099 /* Vector shift (immediate). */
11100 *cost += extra_cost->vect.alu;
11101 }
11102 else
11103 {
11104 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
11105 aliases. */
11106 *cost += extra_cost->alu.shift;
11107 }
11108 }
11109
11110 /* We can incorporate zero/sign extend for free. */
11111 if (GET_CODE (op0) == ZERO_EXTEND
11112 || GET_CODE (op0) == SIGN_EXTEND)
11113 op0 = XEXP (op0, 0);
11114
11115 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
11116 return true;
11117 }
11118 else
11119 {
11120 if (VECTOR_MODE_P (mode))
11121 {
11122 if (speed)
11123 /* Vector shift (register). */
11124 *cost += extra_cost->vect.alu;
11125 }
11126 else
11127 {
11128 if (speed)
11129 /* LSLV. */
11130 *cost += extra_cost->alu.shift_reg;
11131
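              /* The test below matches shift amounts that have already been
                 reduced modulo the register width, e.g. "x << (n & 63)" in
                 DImode.  The variable shift instructions (LSLV and friends)
                 only use the low bits of the shift-amount register, so the
                 AND needs no extra instruction and only OP0 is costed.  */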
11132 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11133 && CONST_INT_P (XEXP (op1, 1))
11134 && known_eq (INTVAL (XEXP (op1, 1)),
11135 GET_MODE_BITSIZE (mode) - 1))
11136 {
11137 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11138 /* We already demanded XEXP (op1, 0) to be REG_P, so
11139 don't recurse into it. */
11140 return true;
11141 }
11142 }
11143 return false; /* All arguments need to be in registers. */
11144 }
11145
11146 case ROTATE:
11147 case ROTATERT:
11148 case LSHIFTRT:
11149 case ASHIFTRT:
11150 op0 = XEXP (x, 0);
11151 op1 = XEXP (x, 1);
11152
11153 if (CONST_INT_P (op1))
11154 {
11155 /* ASR (immediate) and friends. */
11156 if (speed)
11157 {
11158 if (VECTOR_MODE_P (mode))
11159 *cost += extra_cost->vect.alu;
11160 else
11161 *cost += extra_cost->alu.shift;
11162 }
11163
11164 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
11165 return true;
11166 }
11167 else
11168 {
11169 if (VECTOR_MODE_P (mode))
11170 {
11171 if (speed)
11172 /* Vector shift (register). */
11173 *cost += extra_cost->vect.alu;
11174 }
11175 else
11176 {
11177 if (speed)
11178 /* ASR (register) and friends. */
11179 *cost += extra_cost->alu.shift_reg;
11180
11181 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11182 && CONST_INT_P (XEXP (op1, 1))
11183 && known_eq (INTVAL (XEXP (op1, 1)),
11184 GET_MODE_BITSIZE (mode) - 1))
11185 {
11186 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11187 /* We already demanded XEXP (op1, 0) to be REG_P, so
11188 don't recurse into it. */
11189 return true;
11190 }
11191 }
11192 return false; /* All arguments need to be in registers. */
11193 }
11194
11195 case SYMBOL_REF:
11196
11197 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
11198 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
11199 {
11200 /* LDR. */
11201 if (speed)
11202 *cost += extra_cost->ldst.load;
11203 }
11204 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
11205 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
11206 {
11207 /* ADRP, followed by ADD. */
11208 *cost += COSTS_N_INSNS (1);
11209 if (speed)
11210 *cost += 2 * extra_cost->alu.arith;
11211 }
11212 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
11213 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11214 {
11215 /* ADR. */
11216 if (speed)
11217 *cost += extra_cost->alu.arith;
11218 }
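        /* For example (registers illustrative), the small code model forms
           a symbol address as

             adrp  x0, sym
             add   x0, x0, :lo12:sym

           whereas the tiny model can use a single pc-relative
           "adr x0, sym".  */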
11219
11220 if (flag_pic)
11221 {
11222 /* One extra load instruction, after accessing the GOT. */
11223 *cost += COSTS_N_INSNS (1);
11224 if (speed)
11225 *cost += extra_cost->ldst.load;
11226 }
11227 return true;
11228
11229 case HIGH:
11230 case LO_SUM:
11231 /* ADRP/ADD (immediate). */
11232 if (speed)
11233 *cost += extra_cost->alu.arith;
11234 return true;
11235
11236 case ZERO_EXTRACT:
11237 case SIGN_EXTRACT:
11238 /* UBFX/SBFX. */
11239 if (speed)
11240 {
11241 if (VECTOR_MODE_P (mode))
11242 *cost += extra_cost->vect.alu;
11243 else
11244 *cost += extra_cost->alu.bfx;
11245 }
11246
11247 /* We can trust that the immediates used will be correct (there
11248 are no by-register forms), so we need only cost op0. */
11249 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
11250 return true;
11251
11252 case MULT:
11253 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
11254 /* aarch64_rtx_mult_cost always handles recursion to its
11255 operands. */
11256 return true;
11257
11258 case MOD:
11259 /* We can expand signed mod by a power of 2 using a NEGS, two parallel
11260 ANDs and a CSNEG. Assume here that the cost of a CSNEG is the same as
11261 that of an unconditional negate. This case should only ever be reached
11262 through the set_smod_pow2_cheap check in expmed.c. */
11263 if (CONST_INT_P (XEXP (x, 1))
11264 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
11265 && (mode == SImode || mode == DImode))
11266 {
11267 /* We expand to 4 instructions. Reset the baseline. */
11268 *cost = COSTS_N_INSNS (4);
11269
11270 if (speed)
11271 *cost += 2 * extra_cost->alu.logical
11272 + 2 * extra_cost->alu.arith;
11273
11274 return true;
11275 }
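      /* As a sketch of the expansion mentioned above (register choices are
         illustrative), "x % 4" in SImode becomes something like:

           negs   w1, w0
           and    w0, w0, 3
           and    w1, w1, 3
           csneg  w0, w0, w1, mi

         i.e. two logical and two arithmetic instructions, matching the
         costs added above.  */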
11276
11277 /* Fall-through. */
11278 case UMOD:
11279 if (speed)
11280 {
11281 /* Slightly prefer UMOD over SMOD. */
11282 if (VECTOR_MODE_P (mode))
11283 *cost += extra_cost->vect.alu;
11284 else if (GET_MODE_CLASS (mode) == MODE_INT)
11285 *cost += (extra_cost->mult[mode == DImode].add
11286 + extra_cost->mult[mode == DImode].idiv
11287 + (code == MOD ? 1 : 0));
11288 }
11289 return false; /* All arguments need to be in registers. */
11290
11291 case DIV:
11292 case UDIV:
11293 case SQRT:
11294 if (speed)
11295 {
11296 if (VECTOR_MODE_P (mode))
11297 *cost += extra_cost->vect.alu;
11298 else if (GET_MODE_CLASS (mode) == MODE_INT)
11299 /* There is no integer SQRT, so only DIV and UDIV can get
11300 here. */
11301 *cost += (extra_cost->mult[mode == DImode].idiv
11302 /* Slightly prefer UDIV over SDIV. */
11303 + (code == DIV ? 1 : 0));
11304 else
11305 *cost += extra_cost->fp[mode == DFmode].div;
11306 }
11307 return false; /* All arguments need to be in registers. */
11308
11309 case IF_THEN_ELSE:
11310 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
11311 XEXP (x, 2), cost, speed);
11312
11313 case EQ:
11314 case NE:
11315 case GT:
11316 case GTU:
11317 case LT:
11318 case LTU:
11319 case GE:
11320 case GEU:
11321 case LE:
11322 case LEU:
11323
11324 return false; /* All arguments must be in registers. */
11325
11326 case FMA:
11327 op0 = XEXP (x, 0);
11328 op1 = XEXP (x, 1);
11329 op2 = XEXP (x, 2);
11330
11331 if (speed)
11332 {
11333 if (VECTOR_MODE_P (mode))
11334 *cost += extra_cost->vect.alu;
11335 else
11336 *cost += extra_cost->fp[mode == DFmode].fma;
11337 }
11338
11339 /* FMSUB, FNMADD, and FNMSUB are free. */
11340 if (GET_CODE (op0) == NEG)
11341 op0 = XEXP (op0, 0);
11342
11343 if (GET_CODE (op2) == NEG)
11344 op2 = XEXP (op2, 0);
11345
11346 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
11347 and the by-element operand as operand 0. */
11348 if (GET_CODE (op1) == NEG)
11349 op1 = XEXP (op1, 0);
11350
11351 /* Catch vector-by-element operations. The by-element operand can
11352 either be (vec_duplicate (vec_select (x))) or just
11353 (vec_select (x)), depending on whether we are multiplying by
11354 a vector or a scalar.
11355
11356 Canonicalization is not very good in these cases, FMA4 will put the
11357 by-element operand as operand 0, FNMA4 will have it as operand 1. */
11358 if (GET_CODE (op0) == VEC_DUPLICATE)
11359 op0 = XEXP (op0, 0);
11360 else if (GET_CODE (op1) == VEC_DUPLICATE)
11361 op1 = XEXP (op1, 0);
11362
11363 if (GET_CODE (op0) == VEC_SELECT)
11364 op0 = XEXP (op0, 0);
11365 else if (GET_CODE (op1) == VEC_SELECT)
11366 op1 = XEXP (op1, 0);
11367
11368 /* If the remaining parameters are not registers,
11369 get the cost to put them into registers. */
11370 *cost += rtx_cost (op0, mode, FMA, 0, speed);
11371 *cost += rtx_cost (op1, mode, FMA, 1, speed);
11372 *cost += rtx_cost (op2, mode, FMA, 2, speed);
11373 return true;
11374
11375 case FLOAT:
11376 case UNSIGNED_FLOAT:
11377 if (speed)
11378 *cost += extra_cost->fp[mode == DFmode].fromint;
11379 return false;
11380
11381 case FLOAT_EXTEND:
11382 if (speed)
11383 {
11384 if (VECTOR_MODE_P (mode))
11385 {
11386 /* Vector widening conversion. */
11387 *cost += extra_cost->vect.alu;
11388 }
11389 else
11390 *cost += extra_cost->fp[mode == DFmode].widen;
11391 }
11392 return false;
11393
11394 case FLOAT_TRUNCATE:
11395 if (speed)
11396 {
11397 if (VECTOR_MODE_P (mode))
11398 {
11399 /* Vector conversion. */
11400 *cost += extra_cost->vect.alu;
11401 }
11402 else
11403 *cost += extra_cost->fp[mode == DFmode].narrow;
11404 }
11405 return false;
11406
11407 case FIX:
11408 case UNSIGNED_FIX:
11409 x = XEXP (x, 0);
11410 /* Strip the rounding part. They will all be implemented
11411 by the fcvt* family of instructions anyway. */
11412 if (GET_CODE (x) == UNSPEC)
11413 {
11414 unsigned int uns_code = XINT (x, 1);
11415
11416 if (uns_code == UNSPEC_FRINTA
11417 || uns_code == UNSPEC_FRINTM
11418 || uns_code == UNSPEC_FRINTN
11419 || uns_code == UNSPEC_FRINTP
11420 || uns_code == UNSPEC_FRINTZ)
11421 x = XVECEXP (x, 0, 0);
11422 }
11423
11424 if (speed)
11425 {
11426 if (VECTOR_MODE_P (mode))
11427 *cost += extra_cost->vect.alu;
11428 else
11429 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
11430 }
11431
11432 /* We can combine fmul by a power of 2 followed by a fcvt into a single
11433 fixed-point fcvt. */
11434 if (GET_CODE (x) == MULT
11435 && ((VECTOR_MODE_P (mode)
11436 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
11437 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
11438 {
11439 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
11440 0, speed);
11441 return true;
11442 }
11443
11444 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
11445 return true;
11446
11447 case ABS:
11448 if (VECTOR_MODE_P (mode))
11449 {
11450 /* ABS (vector). */
11451 if (speed)
11452 *cost += extra_cost->vect.alu;
11453 }
11454 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11455 {
11456 op0 = XEXP (x, 0);
11457
11458 /* FABD, which is analogous to FADD. */
11459 if (GET_CODE (op0) == MINUS)
11460 {
11461 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
11462 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
11463 if (speed)
11464 *cost += extra_cost->fp[mode == DFmode].addsub;
11465
11466 return true;
11467 }
11468 /* Simple FABS is analogous to FNEG. */
11469 if (speed)
11470 *cost += extra_cost->fp[mode == DFmode].neg;
11471 }
11472 else
11473 {
11474 /* Integer ABS will either be split into
11475 two arithmetic instructions, or will be an ABS
11476 (scalar), which we don't model. */
11477 *cost = COSTS_N_INSNS (2);
11478 if (speed)
11479 *cost += 2 * extra_cost->alu.arith;
11480 }
11481 return false;
11482
11483 case SMAX:
11484 case SMIN:
11485 if (speed)
11486 {
11487 if (VECTOR_MODE_P (mode))
11488 *cost += extra_cost->vect.alu;
11489 else
11490 {
11491 /* FMAXNM/FMINNM/FMAX/FMIN.
11492 TODO: This may not be accurate for all implementations, but
11493 we do not model this in the cost tables. */
11494 *cost += extra_cost->fp[mode == DFmode].addsub;
11495 }
11496 }
11497 return false;
11498
11499 case UNSPEC:
11500 /* The floating point round to integer frint* instructions. */
11501 if (aarch64_frint_unspec_p (XINT (x, 1)))
11502 {
11503 if (speed)
11504 *cost += extra_cost->fp[mode == DFmode].roundint;
11505
11506 return false;
11507 }
11508
11509 if (XINT (x, 1) == UNSPEC_RBIT)
11510 {
11511 if (speed)
11512 *cost += extra_cost->alu.rev;
11513
11514 return false;
11515 }
11516 break;
11517
11518 case TRUNCATE:
11519
11520 /* Decompose <su>muldi3_highpart. */
11521 if (/* (truncate:DI */
11522 mode == DImode
11523 /* (lshiftrt:TI */
11524 && GET_MODE (XEXP (x, 0)) == TImode
11525 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
11526 /* (mult:TI */
11527 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
11528 /* (ANY_EXTEND:TI (reg:DI))
11529 (ANY_EXTEND:TI (reg:DI))) */
11530 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
11531 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
11532 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
11533 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
11534 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
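          /* E.g. (registers illustrative) -(a * b + c) can become a single
             "fnmadd d0, d0, d1, d2" and -(a * b) a single
             "fnmul d0, d0, d1", so the NEG itself adds no cost here.  */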
11535 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
11536 /* (const_int 64) */
11537 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
11538 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
11539 {
11540 /* UMULH/SMULH. */
11541 if (speed)
11542 *cost += extra_cost->mult[mode == DImode].extend;
11543 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
11544 mode, MULT, 0, speed);
11545 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
11546 mode, MULT, 1, speed);
11547 return true;
11548 }
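      /* For instance (an illustrative example), the high half of an
         unsigned 64x64->128-bit multiply,

           uint64_t hi (uint64_t a, uint64_t b)
           { return ((unsigned __int128) a * b) >> 64; }

         has this shape and becomes a single "umulh x0, x0, x1".  */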
11549
11550 /* Fall through. */
11551 default:
11552 break;
11553 }
11554
11555 if (dump_file
11556 && flag_aarch64_verbose_cost)
11557 fprintf (dump_file,
11558 "\nFailed to cost RTX. Assuming default cost.\n");
11559
11560 return true;
11561 }
11562
11563 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
11564 calculated for X. This cost is stored in *COST. Returns true
11565 if the total cost of X was calculated. */
11566 static bool
11567 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
11568 int param, int *cost, bool speed)
11569 {
11570 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
11571
11572 if (dump_file
11573 && flag_aarch64_verbose_cost)
11574 {
11575 print_rtl_single (dump_file, x);
11576 fprintf (dump_file, "\n%s cost: %d (%s)\n",
11577 speed ? "Hot" : "Cold",
11578 *cost, result ? "final" : "partial");
11579 }
11580
11581 return result;
11582 }
11583
11584 static int
11585 aarch64_register_move_cost (machine_mode mode,
11586 reg_class_t from_i, reg_class_t to_i)
11587 {
11588 enum reg_class from = (enum reg_class) from_i;
11589 enum reg_class to = (enum reg_class) to_i;
11590 const struct cpu_regmove_cost *regmove_cost
11591 = aarch64_tune_params.regmove_cost;
11592
11593 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
11594 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
11595 to = GENERAL_REGS;
11596
11597 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
11598 from = GENERAL_REGS;
11599
11600 /* Moving between GPR and stack cost is the same as GP2GP. */
11601 if ((from == GENERAL_REGS && to == STACK_REG)
11602 || (to == GENERAL_REGS && from == STACK_REG))
11603 return regmove_cost->GP2GP;
11604
11605 /* To/From the stack register, we move via the gprs. */
11606 if (to == STACK_REG || from == STACK_REG)
11607 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
11608 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
11609
11610 if (known_eq (GET_MODE_SIZE (mode), 16))
11611 {
11612 /* 128-bit operations on general registers require 2 instructions. */
11613 if (from == GENERAL_REGS && to == GENERAL_REGS)
11614 return regmove_cost->GP2GP * 2;
11615 else if (from == GENERAL_REGS)
11616 return regmove_cost->GP2FP * 2;
11617 else if (to == GENERAL_REGS)
11618 return regmove_cost->FP2GP * 2;
11619
11620 /* When AdvSIMD instructions are disabled it is not possible to move
11621 a 128-bit value directly between Q registers. This is handled in
11622 secondary reload. A general register is used as a scratch to move
11623 the upper DI value and the lower DI value is moved directly,
11624 hence the cost is the sum of three moves. */
11625 if (! TARGET_SIMD)
11626 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
11627
11628 return regmove_cost->FP2FP;
11629 }
11630
11631 if (from == GENERAL_REGS && to == GENERAL_REGS)
11632 return regmove_cost->GP2GP;
11633 else if (from == GENERAL_REGS)
11634 return regmove_cost->GP2FP;
11635 else if (to == GENERAL_REGS)
11636 return regmove_cost->FP2GP;
11637
11638 return regmove_cost->FP2FP;
11639 }
11640
11641 static int
11642 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
11643 reg_class_t rclass ATTRIBUTE_UNUSED,
11644 bool in ATTRIBUTE_UNUSED)
11645 {
11646 return aarch64_tune_params.memmov_cost;
11647 }
11648
11649 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
11650 to optimize 1.0/sqrt. */
11651
11652 static bool
11653 use_rsqrt_p (machine_mode mode)
11654 {
11655 return (!flag_trapping_math
11656 && flag_unsafe_math_optimizations
11657 && ((aarch64_tune_params.approx_modes->recip_sqrt
11658 & AARCH64_APPROX_MODE (mode))
11659 || flag_mrecip_low_precision_sqrt));
11660 }
11661
11662 /* Function to decide when to use the approximate reciprocal square root
11663 builtin. */
11664
11665 static tree
11666 aarch64_builtin_reciprocal (tree fndecl)
11667 {
11668 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
11669
11670 if (!use_rsqrt_p (mode))
11671 return NULL_TREE;
11672 return aarch64_builtin_rsqrt (DECL_MD_FUNCTION_CODE (fndecl));
11673 }
11674
11675 /* Emit instruction sequence to compute either the approximate square root
11676 or its approximate reciprocal, depending on the flag RECP, and return
11677 whether the sequence was emitted or not. */
11678
11679 bool
11680 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
11681 {
11682 machine_mode mode = GET_MODE (dst);
11683
11684 if (GET_MODE_INNER (mode) == HFmode)
11685 {
11686 gcc_assert (!recp);
11687 return false;
11688 }
11689
11690 if (!recp)
11691 {
11692 if (!(flag_mlow_precision_sqrt
11693 || (aarch64_tune_params.approx_modes->sqrt
11694 & AARCH64_APPROX_MODE (mode))))
11695 return false;
11696
11697 if (flag_finite_math_only
11698 || flag_trapping_math
11699 || !flag_unsafe_math_optimizations
11700 || optimize_function_for_size_p (cfun))
11701 return false;
11702 }
11703 else
11704 /* Caller assumes we cannot fail. */
11705 gcc_assert (use_rsqrt_p (mode));
11706
11707 machine_mode mmsk = mode_for_int_vector (mode).require ();
11708 rtx xmsk = gen_reg_rtx (mmsk);
11709 if (!recp)
11710 /* When calculating the approximate square root, compare the
11711 argument with 0.0 and create a mask. */
11712 emit_insn (gen_rtx_SET (xmsk,
11713 gen_rtx_NEG (mmsk,
11714 gen_rtx_EQ (mmsk, src,
11715 CONST0_RTX (mode)))));
11716
11717 /* Estimate the approximate reciprocal square root. */
11718 rtx xdst = gen_reg_rtx (mode);
11719 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
11720
11721 /* Iterate over the series twice for SF and thrice for DF. */
11722 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11723
11724 /* Optionally do one fewer iteration for faster performance
11725 at the cost of some accuracy. */
11726 if ((recp && flag_mrecip_low_precision_sqrt)
11727 || (!recp && flag_mlow_precision_sqrt))
11728 iterations--;
11729
11730 /* Iterate over the series to calculate the approximate reciprocal square
11731 root. */
11732 rtx x1 = gen_reg_rtx (mode);
11733 while (iterations--)
11734 {
11735 rtx x2 = gen_reg_rtx (mode);
11736 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
11737
11738 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
11739
11740 if (iterations > 0)
11741 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
11742 }
11743
11744 if (!recp)
11745 {
11746 /* Qualify the approximate reciprocal square root when the argument is
11747 0.0 by squashing the intermediate result to 0.0. */
11748 rtx xtmp = gen_reg_rtx (mmsk);
11749 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
11750 gen_rtx_SUBREG (mmsk, xdst, 0)));
11751 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
11752
11753 /* Calculate the approximate square root. */
11754 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
11755 }
11756
11757 /* Finalize the approximation. */
11758 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
11759
11760 return true;
11761 }
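/* A sketch of the math behind the sequence above: FRSQRTE gives an initial
   estimate x0 ~ 1/sqrt(s), and each FRSQRTS step computes (3 - s*x*x) / 2,
   so

     x_{n+1} = x_n * (3 - s * x_n * x_n) / 2

   is one Newton-Raphson step for 1/sqrt(s).  For the non-reciprocal case
   the estimate is finally multiplied by s (since sqrt(s) = s / sqrt(s)),
   with the mask computed at the start forcing sqrt(0.0) to 0.0.  */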
11762
11763 /* Emit the instruction sequence to compute the approximation for the division
11764 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
11765
11766 bool
11767 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
11768 {
11769 machine_mode mode = GET_MODE (quo);
11770
11771 if (GET_MODE_INNER (mode) == HFmode)
11772 return false;
11773
11774 bool use_approx_division_p = (flag_mlow_precision_div
11775 || (aarch64_tune_params.approx_modes->division
11776 & AARCH64_APPROX_MODE (mode)));
11777
11778 if (!flag_finite_math_only
11779 || flag_trapping_math
11780 || !flag_unsafe_math_optimizations
11781 || optimize_function_for_size_p (cfun)
11782 || !use_approx_division_p)
11783 return false;
11784
11785 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
11786 return false;
11787
11788 /* Estimate the approximate reciprocal. */
11789 rtx xrcp = gen_reg_rtx (mode);
11790 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
11791
11792 /* Iterate over the series twice for SF and thrice for DF. */
11793 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11794
11795 /* Optionally do one fewer iteration for faster performance,
11796 at the cost of some accuracy. */
11797 if (flag_mlow_precision_div)
11798 iterations--;
11799
11800 /* Iterate over the series to calculate the approximate reciprocal. */
11801 rtx xtmp = gen_reg_rtx (mode);
11802 while (iterations--)
11803 {
11804 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
11805
11806 if (iterations > 0)
11807 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
11808 }
11809
11810 if (num != CONST1_RTX (mode))
11811 {
11812 /* As the approximate reciprocal of DEN is already calculated, only
11813 calculate the approximate division when NUM is not 1.0. */
11814 rtx xnum = force_reg (mode, num);
11815 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
11816 }
11817
11818 /* Finalize the approximation. */
11819 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
11820 return true;
11821 }
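/* A sketch of the math behind the sequence above: FRECPE gives an initial
   estimate x0 ~ 1/d, and each FRECPS step computes 2 - d*x, so

     x_{n+1} = x_n * (2 - d * x_n)

   is one Newton-Raphson step for 1/d.  The quotient is then obtained as
   num * (1/d), with that final multiply skipped when NUM is 1.0.  */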
11822
11823 /* Return the number of instructions that can be issued per cycle. */
11824 static int
11825 aarch64_sched_issue_rate (void)
11826 {
11827 return aarch64_tune_params.issue_rate;
11828 }
11829
11830 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
11831 static int
11832 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
11833 {
11834 if (DEBUG_INSN_P (insn))
11835 return more;
11836
11837 rtx_code code = GET_CODE (PATTERN (insn));
11838 if (code == USE || code == CLOBBER)
11839 return more;
11840
11841 if (get_attr_type (insn) == TYPE_NO_INSN)
11842 return more;
11843
11844 return more - 1;
11845 }
11846
11847 static int
11848 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
11849 {
11850 int issue_rate = aarch64_sched_issue_rate ();
11851
11852 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
11853 }
11854
11855
11856 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
11857 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
11858 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
11859
11860 static int
11861 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
11862 int ready_index)
11863 {
11864 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
11865 }
11866
11867
11868 /* Vectorizer cost model target hooks. */
11869
11870 /* Implement targetm.vectorize.builtin_vectorization_cost. */
11871 static int
11872 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
11873 tree vectype,
11874 int misalign ATTRIBUTE_UNUSED)
11875 {
11876 unsigned elements;
11877 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
11878 bool fp = false;
11879
11880 if (vectype != NULL)
11881 fp = FLOAT_TYPE_P (vectype);
11882
11883 switch (type_of_cost)
11884 {
11885 case scalar_stmt:
11886 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
11887
11888 case scalar_load:
11889 return costs->scalar_load_cost;
11890
11891 case scalar_store:
11892 return costs->scalar_store_cost;
11893
11894 case vector_stmt:
11895 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11896
11897 case vector_load:
11898 return costs->vec_align_load_cost;
11899
11900 case vector_store:
11901 return costs->vec_store_cost;
11902
11903 case vec_to_scalar:
11904 return costs->vec_to_scalar_cost;
11905
11906 case scalar_to_vec:
11907 return costs->scalar_to_vec_cost;
11908
11909 case unaligned_load:
11910 case vector_gather_load:
11911 return costs->vec_unalign_load_cost;
11912
11913 case unaligned_store:
11914 case vector_scatter_store:
11915 return costs->vec_unalign_store_cost;
11916
11917 case cond_branch_taken:
11918 return costs->cond_taken_branch_cost;
11919
11920 case cond_branch_not_taken:
11921 return costs->cond_not_taken_branch_cost;
11922
11923 case vec_perm:
11924 return costs->vec_permute_cost;
11925
11926 case vec_promote_demote:
11927 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11928
11929 case vec_construct:
11930 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
11931 return elements / 2 + 1;
11932
11933 default:
11934 gcc_unreachable ();
11935 }
11936 }
11937
11938 /* Implement targetm.vectorize.add_stmt_cost. */
11939 static unsigned
11940 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
11941 struct _stmt_vec_info *stmt_info, int misalign,
11942 enum vect_cost_model_location where)
11943 {
11944 unsigned *cost = (unsigned *) data;
11945 unsigned retval = 0;
11946
11947 if (flag_vect_cost_model)
11948 {
11949 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
11950 int stmt_cost =
11951 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
11952
11953 /* Statements in an inner loop relative to the loop being
11954 vectorized are weighted more heavily. The value here is
11955 arbitrary and could potentially be improved with analysis. */
11956 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
11957 count *= 50; /* FIXME */
11958
11959 retval = (unsigned) (count * stmt_cost);
11960 cost[where] += retval;
11961 }
11962
11963 return retval;
11964 }
11965
11966 static void initialize_aarch64_code_model (struct gcc_options *);
11967
11968 /* Parse the TO_PARSE string and put the architecture struct that it
11969 selects into RES and the architectural features into ISA_FLAGS.
11970 Return an aarch64_parse_opt_result describing the parse result.
11971 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11972 When the TO_PARSE string contains an invalid extension,
11973 a copy of the string is created and stored to INVALID_EXTENSION. */
11974
11975 static enum aarch64_parse_opt_result
11976 aarch64_parse_arch (const char *to_parse, const struct processor **res,
11977 uint64_t *isa_flags, std::string *invalid_extension)
11978 {
11979 const char *ext;
11980 const struct processor *arch;
11981 size_t len;
11982
11983 ext = strchr (to_parse, '+');
11984
11985 if (ext != NULL)
11986 len = ext - to_parse;
11987 else
11988 len = strlen (to_parse);
11989
11990 if (len == 0)
11991 return AARCH64_PARSE_MISSING_ARG;
11992
11993
11994 /* Loop through the list of supported ARCHes to find a match. */
11995 for (arch = all_architectures; arch->name != NULL; arch++)
11996 {
11997 if (strlen (arch->name) == len
11998 && strncmp (arch->name, to_parse, len) == 0)
11999 {
12000 uint64_t isa_temp = arch->flags;
12001
12002 if (ext != NULL)
12003 {
12004 /* TO_PARSE string contains at least one extension. */
12005 enum aarch64_parse_opt_result ext_res
12006 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
12007
12008 if (ext_res != AARCH64_PARSE_OK)
12009 return ext_res;
12010 }
12011 /* Extension parsing was successful. Confirm the result
12012 arch and ISA flags. */
12013 *res = arch;
12014 *isa_flags = isa_temp;
12015 return AARCH64_PARSE_OK;
12016 }
12017 }
12018
12019 /* ARCH name not found in list. */
12020 return AARCH64_PARSE_INVALID_ARG;
12021 }
12022
12023 /* Parse the TO_PARSE string and put the result tuning in RES and the
12024 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
12025 describing the parse result. If there is an error parsing, RES and
12026 ISA_FLAGS are left unchanged.
12027 When the TO_PARSE string contains an invalid extension,
12028 a copy of the string is created and stored to INVALID_EXTENSION. */
12029
12030 static enum aarch64_parse_opt_result
12031 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
12032 uint64_t *isa_flags, std::string *invalid_extension)
12033 {
12034 const char *ext;
12035 const struct processor *cpu;
12036 size_t len;
12037
12038 ext = strchr (to_parse, '+');
12039
12040 if (ext != NULL)
12041 len = ext - to_parse;
12042 else
12043 len = strlen (to_parse);
12044
12045 if (len == 0)
12046 return AARCH64_PARSE_MISSING_ARG;
12047
12048
12049 /* Loop through the list of supported CPUs to find a match. */
12050 for (cpu = all_cores; cpu->name != NULL; cpu++)
12051 {
12052 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
12053 {
12054 uint64_t isa_temp = cpu->flags;
12055
12056
12057 if (ext != NULL)
12058 {
12059 /* TO_PARSE string contains at least one extension. */
12060 enum aarch64_parse_opt_result ext_res
12061 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
12062
12063 if (ext_res != AARCH64_PARSE_OK)
12064 return ext_res;
12065 }
12066 /* Extension parsing was successful. Confirm the result
12067 cpu and ISA flags. */
12068 *res = cpu;
12069 *isa_flags = isa_temp;
12070 return AARCH64_PARSE_OK;
12071 }
12072 }
12073
12074 /* CPU name not found in list. */
12075 return AARCH64_PARSE_INVALID_ARG;
12076 }
12077
12078 /* Parse the TO_PARSE string and put the cpu it selects into RES.
12079 Return an aarch64_parse_opt_result describing the parse result.
12080 If the parsing fails, RES is not changed. */
12081
12082 static enum aarch64_parse_opt_result
12083 aarch64_parse_tune (const char *to_parse, const struct processor **res)
12084 {
12085 const struct processor *cpu;
12086
12087 /* Loop through the list of supported CPUs to find a match. */
12088 for (cpu = all_cores; cpu->name != NULL; cpu++)
12089 {
12090 if (strcmp (cpu->name, to_parse) == 0)
12091 {
12092 *res = cpu;
12093 return AARCH64_PARSE_OK;
12094 }
12095 }
12096
12097 /* CPU name not found in list. */
12098 return AARCH64_PARSE_INVALID_ARG;
12099 }
12100
12101 /* Parse TOKEN, which has length LENGTH, to see if it is an option
12102 described in FLAG. If it is, return the index bit for that flag.
12103 If not, issue an error (printing OPTION_NAME) and return zero. */
12104
12105 static unsigned int
12106 aarch64_parse_one_option_token (const char *token,
12107 size_t length,
12108 const struct aarch64_flag_desc *flag,
12109 const char *option_name)
12110 {
12111 for (; flag->name != NULL; flag++)
12112 {
12113 if (length == strlen (flag->name)
12114 && !strncmp (flag->name, token, length))
12115 return flag->flag;
12116 }
12117
12118 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
12119 return 0;
12120 }
12121
12122 /* Parse OPTION which is a comma-separated list of flags to enable.
12123 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
12124 default state we inherit from the CPU tuning structures. OPTION_NAME
12125 gives the top-level option we are parsing in the -moverride string,
12126 for use in error messages. */
12127
12128 static unsigned int
12129 aarch64_parse_boolean_options (const char *option,
12130 const struct aarch64_flag_desc *flags,
12131 unsigned int initial_state,
12132 const char *option_name)
12133 {
12134 const char separator = '.';
12135 const char* specs = option;
12136 const char* ntoken = option;
12137 unsigned int found_flags = initial_state;
12138
12139 while ((ntoken = strchr (specs, separator)))
12140 {
12141 size_t token_length = ntoken - specs;
12142 unsigned token_ops = aarch64_parse_one_option_token (specs,
12143 token_length,
12144 flags,
12145 option_name);
12146 /* If we find "none" (or, for simplicity's sake, an error) anywhere
12147 in the token stream, reset the supported operations. So:
12148
12149 adrp+add.cmp+branch.none.adrp+add
12150
12151 would have the result of turning on only adrp+add fusion. */
12152 if (!token_ops)
12153 found_flags = 0;
12154
12155 found_flags |= token_ops;
12156 specs = ++ntoken;
12157 }
12158
12159 /* The string ended with a trailing separator; diagnose it. */
12160 if (!(*specs))
12161 {
12162 error ("%s string ill-formed\n", option_name);
12163 return 0;
12164 }
12165
12166 /* We still have one more token to parse. */
12167 size_t token_length = strlen (specs);
12168 unsigned token_ops = aarch64_parse_one_option_token (specs,
12169 token_length,
12170 flags,
12171 option_name);
12172 if (!token_ops)
12173 found_flags = 0;
12174
12175 found_flags |= token_ops;
12176 return found_flags;
12177 }
12178
12179 /* Support for overriding instruction fusion. */
12180
12181 static void
12182 aarch64_parse_fuse_string (const char *fuse_string,
12183 struct tune_params *tune)
12184 {
12185 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
12186 aarch64_fusible_pairs,
12187 tune->fusible_ops,
12188 "fuse=");
12189 }
12190
12191 /* Support for overriding other tuning flags. */
12192
12193 static void
12194 aarch64_parse_tune_string (const char *tune_string,
12195 struct tune_params *tune)
12196 {
12197 tune->extra_tuning_flags
12198 = aarch64_parse_boolean_options (tune_string,
12199 aarch64_tuning_flags,
12200 tune->extra_tuning_flags,
12201 "tune=");
12202 }
12203
12204 /* Parse the sve_width tuning moverride string in TUNE_STRING.
12205 Accept the valid SVE vector widths allowed by
12206 aarch64_sve_vector_bits_enum and use it to override sve_width
12207 in TUNE. */
12208
12209 static void
12210 aarch64_parse_sve_width_string (const char *tune_string,
12211 struct tune_params *tune)
12212 {
12213 int width = -1;
12214
12215 int n = sscanf (tune_string, "%d", &width);
12216 if (n == EOF)
12217 {
12218 error ("invalid format for sve_width");
12219 return;
12220 }
12221 switch (width)
12222 {
12223 case SVE_128:
12224 case SVE_256:
12225 case SVE_512:
12226 case SVE_1024:
12227 case SVE_2048:
12228 break;
12229 default:
12230 error ("invalid sve_width value: %d", width);
12231 }
12232 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
12233 }
12234
12235 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
12236 we understand. If it is, extract the option string and hand it off to
12237 the appropriate function. */
12238
12239 void
12240 aarch64_parse_one_override_token (const char* token,
12241 size_t length,
12242 struct tune_params *tune)
12243 {
12244 const struct aarch64_tuning_override_function *fn
12245 = aarch64_tuning_override_functions;
12246
12247 const char *option_part = strchr (token, '=');
12248 if (!option_part)
12249 {
12250 error ("tuning string missing in option (%s)", token);
12251 return;
12252 }
12253
12254 /* Get the length of the option name. */
12255 length = option_part - token;
12256 /* Skip the '=' to get to the option string. */
12257 option_part++;
12258
12259 for (; fn->name != NULL; fn++)
12260 {
12261 if (!strncmp (fn->name, token, length))
12262 {
12263 fn->parse_override (option_part, tune);
12264 return;
12265 }
12266 }
12267
12268 error ("unknown tuning option (%s)", token);
12269 return;
12270 }
12271
12272 /* Set the default TLS offset size and clamp it to what the selected code model supports. */
12273
12274 static void
12275 initialize_aarch64_tls_size (struct gcc_options *opts)
12276 {
12277 if (aarch64_tls_size == 0)
12278 aarch64_tls_size = 24;
12279
12280 switch (opts->x_aarch64_cmodel_var)
12281 {
12282 case AARCH64_CMODEL_TINY:
12283 /* Both the default and maximum TLS size allowed under tiny are 1M, which
12284 needs two instructions to address, so we clamp the size to 24 bits. */
12285 if (aarch64_tls_size > 24)
12286 aarch64_tls_size = 24;
12287 break;
12288 case AARCH64_CMODEL_SMALL:
12289 /* The maximum TLS size allowed under small is 4G. */
12290 if (aarch64_tls_size > 32)
12291 aarch64_tls_size = 32;
12292 break;
12293 case AARCH64_CMODEL_LARGE:
12294 /* The maximum TLS size allowed under large is 16E.
12295 FIXME: 16E needs a 64-bit offset; we only support a 48-bit offset now. */
12296 if (aarch64_tls_size > 48)
12297 aarch64_tls_size = 48;
12298 break;
12299 default:
12300 gcc_unreachable ();
12301 }
12302
12303 return;
12304 }
12305
12306 /* Parse STRING looking for options in the format:
12307 string :: option:string
12308 option :: name=substring
12309 name :: {a-z}
12310 substring :: defined by option. */
12311
12312 static void
12313 aarch64_parse_override_string (const char* input_string,
12314 struct tune_params* tune)
12315 {
12316 const char separator = ':';
12317 size_t string_length = strlen (input_string) + 1;
12318 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
12319 char *string = string_root;
12320 strncpy (string, input_string, string_length);
12321 string[string_length - 1] = '\0';
12322
12323 char* ntoken = string;
12324
12325 while ((ntoken = strchr (string, separator)))
12326 {
12327 size_t token_length = ntoken - string;
12328 /* Make this substring look like a string. */
12329 *ntoken = '\0';
12330 aarch64_parse_one_override_token (string, token_length, tune);
12331 string = ++ntoken;
12332 }
12333
12334 /* One last option to parse. */
12335 aarch64_parse_one_override_token (string, strlen (string), tune);
12336 free (string_root);
12337 }
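/* As an example of the format described above (the particular values are
   illustrative), an option such as

     -moverride=fuse=adrp+add.cmp+branch:sve_width=256

   is split at ':' into "fuse=adrp+add.cmp+branch" and "sve_width=256"; the
   first is handed to aarch64_parse_fuse_string, where the '.'-separated
   names select fusion pairs, and the second to
   aarch64_parse_sve_width_string.  */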
12338
12339
12340 static void
12341 aarch64_override_options_after_change_1 (struct gcc_options *opts)
12342 {
12343 if (accepted_branch_protection_string)
12344 {
12345 opts->x_aarch64_branch_protection_string
12346 = xstrdup (accepted_branch_protection_string);
12347 }
12348
12349 /* PR 70044: We have to be careful about being called multiple times for the
12350 same function. This means all changes should be repeatable. */
12351
12352 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
12353 Disable the frame pointer flag so the mid-end will not use a frame
12354 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
12355 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
12356 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
12357 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
12358 if (opts->x_flag_omit_frame_pointer == 0)
12359 opts->x_flag_omit_frame_pointer = 2;
12360
12361 /* If not optimizing for size, set the default
12362 alignment to what the target wants. */
12363 if (!opts->x_optimize_size)
12364 {
12365 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
12366 opts->x_str_align_loops = aarch64_tune_params.loop_align;
12367 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
12368 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
12369 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
12370 opts->x_str_align_functions = aarch64_tune_params.function_align;
12371 }
12372
12373 /* We default to no pc-relative literal loads. */
12374
12375 aarch64_pcrelative_literal_loads = false;
12376
12377 /* If -mpc-relative-literal-loads is set on the command line, this
12378 implies that the user asked for PC relative literal loads. */
12379 if (opts->x_pcrelative_literal_loads == 1)
12380 aarch64_pcrelative_literal_loads = true;
12381
12382 /* In the tiny memory model it makes no sense to disallow PC relative
12383 literal pool loads. */
12384 if (aarch64_cmodel == AARCH64_CMODEL_TINY
12385 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12386 aarch64_pcrelative_literal_loads = true;
12387
12388 /* When enabling the lower precision Newton series for the square root, also
12389 enable it for the reciprocal square root, since the latter is an
12390 intermediary step for the former. */
12391 if (flag_mlow_precision_sqrt)
12392 flag_mrecip_low_precision_sqrt = true;
12393 }
12394
12395 /* 'Unpack' the internal tuning structs and update the options
12396 in OPTS. The caller must have set up selected_tune and selected_arch
12397 as all the other target-specific codegen decisions are
12398 derived from them. */
12399
12400 void
12401 aarch64_override_options_internal (struct gcc_options *opts)
12402 {
12403 aarch64_tune_flags = selected_tune->flags;
12404 aarch64_tune = selected_tune->sched_core;
12405 /* Make a copy of the tuning parameters attached to the core, which
12406 we may later overwrite. */
12407 aarch64_tune_params = *(selected_tune->tune);
12408 aarch64_architecture_version = selected_arch->architecture_version;
12409
12410 if (opts->x_aarch64_override_tune_string)
12411 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
12412 &aarch64_tune_params);
12413
12414 /* This target defaults to strict volatile bitfields. */
12415 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
12416 opts->x_flag_strict_volatile_bitfields = 1;
12417
12418 if (aarch64_stack_protector_guard == SSP_GLOBAL
12419 && opts->x_aarch64_stack_protector_guard_offset_str)
12420 {
12421 error ("incompatible options %<-mstack-protector-guard=global%> and "
12422 "%<-mstack-protector-guard-offset=%s%>",
12423 aarch64_stack_protector_guard_offset_str);
12424 }
12425
12426 if (aarch64_stack_protector_guard == SSP_SYSREG
12427 && !(opts->x_aarch64_stack_protector_guard_offset_str
12428 && opts->x_aarch64_stack_protector_guard_reg_str))
12429 {
12430 error ("both %<-mstack-protector-guard-offset%> and "
12431 "%<-mstack-protector-guard-reg%> must be used "
12432 "with %<-mstack-protector-guard=sysreg%>");
12433 }
12434
12435 if (opts->x_aarch64_stack_protector_guard_reg_str)
12436 {
12437 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
12438 error ("specify a system register with a small string length");
12439 }
12440
12441 if (opts->x_aarch64_stack_protector_guard_offset_str)
12442 {
12443 char *end;
12444 const char *str = aarch64_stack_protector_guard_offset_str;
12445 errno = 0;
12446 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
12447 if (!*str || *end || errno)
12448 error ("%qs is not a valid offset in %qs", str,
12449 "-mstack-protector-guard-offset=");
12450 aarch64_stack_protector_guard_offset = offs;
12451 }
12452
12453 initialize_aarch64_code_model (opts);
12454 initialize_aarch64_tls_size (opts);
12455
12456 int queue_depth = 0;
12457 switch (aarch64_tune_params.autoprefetcher_model)
12458 {
12459 case tune_params::AUTOPREFETCHER_OFF:
12460 queue_depth = -1;
12461 break;
12462 case tune_params::AUTOPREFETCHER_WEAK:
12463 queue_depth = 0;
12464 break;
12465 case tune_params::AUTOPREFETCHER_STRONG:
12466 queue_depth = max_insn_queue_index + 1;
12467 break;
12468 default:
12469 gcc_unreachable ();
12470 }
12471
12472 /* We don't mind passing in global_options_set here as we don't use
12473 the *options_set structs anyway. */
12474 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
12475 queue_depth,
12476 opts->x_param_values,
12477 global_options_set.x_param_values);
12478
12479 /* Set up parameters to be used in the prefetching algorithm. Do not
12480 override the defaults unless we are tuning for a core we have
12481 researched values for. */
12482 if (aarch64_tune_params.prefetch->num_slots > 0)
12483 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
12484 aarch64_tune_params.prefetch->num_slots,
12485 opts->x_param_values,
12486 global_options_set.x_param_values);
12487 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
12488 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
12489 aarch64_tune_params.prefetch->l1_cache_size,
12490 opts->x_param_values,
12491 global_options_set.x_param_values);
12492 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
12493 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
12494 aarch64_tune_params.prefetch->l1_cache_line_size,
12495 opts->x_param_values,
12496 global_options_set.x_param_values);
12497 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
12498 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
12499 aarch64_tune_params.prefetch->l2_cache_size,
12500 opts->x_param_values,
12501 global_options_set.x_param_values);
12502 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
12503 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
12504 0,
12505 opts->x_param_values,
12506 global_options_set.x_param_values);
12507 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
12508 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
12509 aarch64_tune_params.prefetch->minimum_stride,
12510 opts->x_param_values,
12511 global_options_set.x_param_values);
12512
12513 /* Use the alternative scheduling-pressure algorithm by default. */
12514 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
12515 opts->x_param_values,
12516 global_options_set.x_param_values);
12517
12518 /* If the user hasn't changed it via configure then set the default to 64 KB
12519 for the backend. */
12520 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
12521 DEFAULT_STK_CLASH_GUARD_SIZE == 0
12522 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
12523 opts->x_param_values,
12524 global_options_set.x_param_values);
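/* The parameter is a power-of-two exponent, so the value 16 used above
   selects a guard of 2^16 bytes = 64 KB, matching the comment.  */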
12525
12526 /* Validate the guard size. */
12527 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
12528
12529 /* Enforce that interval is the same size as size so the mid-end does the
12530 right thing. */
12531 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
12532 guard_size,
12533 opts->x_param_values,
12534 global_options_set.x_param_values);
12535
12536 /* The maybe_set calls won't update the value if the user has explicitly set
12537 one, which means we need to validate that the probing interval and guard size
12538 are equal. */
12539 int probe_interval
12540 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
12541 if (guard_size != probe_interval)
12542 error ("stack clash guard size %<%d%> must be equal to probing interval "
12543 "%<%d%>", guard_size, probe_interval);
12544
12545 /* Enable software prefetching at the specified optimization level for
12546 CPUs that have prefetch tuning parameters, unless optimizing for
12547 size. */
12548 if (opts->x_flag_prefetch_loop_arrays < 0
12549 && !opts->x_optimize_size
12550 && aarch64_tune_params.prefetch->default_opt_level >= 0
12551 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
12552 opts->x_flag_prefetch_loop_arrays = 1;
12553
12554 if (opts->x_aarch64_arch_string == NULL)
12555 opts->x_aarch64_arch_string = selected_arch->name;
12556 if (opts->x_aarch64_cpu_string == NULL)
12557 opts->x_aarch64_cpu_string = selected_cpu->name;
12558 if (opts->x_aarch64_tune_string == NULL)
12559 opts->x_aarch64_tune_string = selected_tune->name;
12560
12561 aarch64_override_options_after_change_1 (opts);
12562 }
12563
12564 /* Print a hint with a suggestion for a core or architecture name that
12565 most closely resembles what the user passed in STR. ARCH is true if
12566 the user is asking for an architecture name. ARCH is false if the user
12567 is asking for a core name. */
12568
12569 static void
12570 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
12571 {
12572 auto_vec<const char *> candidates;
12573 const struct processor *entry = arch ? all_architectures : all_cores;
12574 for (; entry->name != NULL; entry++)
12575 candidates.safe_push (entry->name);
12576
12577 #ifdef HAVE_LOCAL_CPU_DETECT
12578 /* Also add "native" as a possible value. */
12579 if (arch)
12580 candidates.safe_push ("native");
12581 #endif
12582
12583 char *s;
12584 const char *hint = candidates_list_and_hint (str, s, candidates);
12585 if (hint)
12586 inform (input_location, "valid arguments are: %s;"
12587 " did you mean %qs?", s, hint);
12588 else
12589 inform (input_location, "valid arguments are: %s", s);
12590
12591 XDELETEVEC (s);
12592 }
12593
12594 /* Print a hint with a suggestion for a core name that most closely resembles
12595 what the user passed in STR. */
12596
12597 inline static void
12598 aarch64_print_hint_for_core (const char *str)
12599 {
12600 aarch64_print_hint_for_core_or_arch (str, false);
12601 }
12602
12603 /* Print a hint with a suggestion for an architecture name that most closely
12604 resembles what the user passed in STR. */
12605
12606 inline static void
12607 aarch64_print_hint_for_arch (const char *str)
12608 {
12609 aarch64_print_hint_for_core_or_arch (str, true);
12610 }
12611
12612
12613 /* Print a hint with a suggestion for an extension name
12614 that most closely resembles what the user passed in STR. */
12615
12616 void
12617 aarch64_print_hint_for_extensions (const std::string &str)
12618 {
12619 auto_vec<const char *> candidates;
12620 aarch64_get_all_extension_candidates (&candidates);
12621 char *s;
12622 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
12623 if (hint)
12624 inform (input_location, "valid arguments are: %s;"
12625 " did you mean %qs?", s, hint);
12626 else
12627 inform (input_location, "valid arguments are: %s", s);
12628
12629 XDELETEVEC (s);
12630 }
12631
12632 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
12633 specified in STR and throw errors if appropriate. Put the results, if
12634 they are valid, in RES and ISA_FLAGS. Return whether the option is
12635 valid. */
12636
12637 static bool
12638 aarch64_validate_mcpu (const char *str, const struct processor **res,
12639 uint64_t *isa_flags)
12640 {
12641 std::string invalid_extension;
12642 enum aarch64_parse_opt_result parse_res
12643 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
12644
12645 if (parse_res == AARCH64_PARSE_OK)
12646 return true;
12647
12648 switch (parse_res)
12649 {
12650 case AARCH64_PARSE_MISSING_ARG:
12651 error ("missing cpu name in %<-mcpu=%s%>", str);
12652 break;
12653 case AARCH64_PARSE_INVALID_ARG:
12654 error ("unknown value %qs for %<-mcpu%>", str);
12655 aarch64_print_hint_for_core (str);
12656 break;
12657 case AARCH64_PARSE_INVALID_FEATURE:
12658 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
12659 invalid_extension.c_str (), str);
12660 aarch64_print_hint_for_extensions (invalid_extension);
12661 break;
12662 default:
12663 gcc_unreachable ();
12664 }
12665
12666 return false;
12667 }
12668
12669 /* Parses CONST_STR for branch protection features specified in
12670 aarch64_branch_protect_types, and sets any global variables required. Returns
12671 the parsing result and assigns LAST_STR to the last processed token from
12672 CONST_STR so that it can be used for error reporting. */
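/* Illustrative example: "pac-ret+leaf+bti" is split on '+' into "pac-ret",
   "leaf" and "bti"; "pac-ret" matches a top-level type, "leaf" is then
   consumed as one of its subtypes and "bti" matches a further top-level
   type.  An unrecognised token such as "foo" yields
   AARCH64_PARSE_INVALID_ARG with the offending token left in *LAST_STR.  */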
12673
12674 static enum
12675 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
12676 char** last_str)
12677 {
12678 char *str_root = xstrdup (const_str);
12679 char* token_save = NULL;
12680 char *str = strtok_r (str_root, "+", &token_save);
12681 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
12682 if (!str)
12683 res = AARCH64_PARSE_MISSING_ARG;
12684 else
12685 {
12686 char *next_str = strtok_r (NULL, "+", &token_save);
12687 /* Reset the branch protection features to their defaults. */
12688 aarch64_handle_no_branch_protection (NULL, NULL);
12689
12690 while (str && res == AARCH64_PARSE_OK)
12691 {
12692 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
12693 bool found = false;
12694 /* Search for this type. */
12695 while (type && type->name && !found && res == AARCH64_PARSE_OK)
12696 {
12697 if (strcmp (str, type->name) == 0)
12698 {
12699 found = true;
12700 res = type->handler (str, next_str);
12701 str = next_str;
12702 next_str = strtok_r (NULL, "+", &token_save);
12703 }
12704 else
12705 type++;
12706 }
12707 if (found && res == AARCH64_PARSE_OK)
12708 {
12709 bool found_subtype = true;
12710 /* Loop through each token until we find one that isn't a
12711 subtype. */
12712 while (found_subtype)
12713 {
12714 found_subtype = false;
12715 const aarch64_branch_protect_type *subtype = type->subtypes;
12716 /* Search for the subtype. */
12717 while (str && subtype && subtype->name && !found_subtype
12718 && res == AARCH64_PARSE_OK)
12719 {
12720 if (strcmp (str, subtype->name) == 0)
12721 {
12722 found_subtype = true;
12723 res = subtype->handler (str, next_str);
12724 str = next_str;
12725 next_str = strtok_r (NULL, "+", &token_save);
12726 }
12727 else
12728 subtype++;
12729 }
12730 }
12731 }
12732 else if (!found)
12733 res = AARCH64_PARSE_INVALID_ARG;
12734 }
12735 }
12736 /* Copy the last processed token into the argument to pass it back.
12737 Used by option and attribute validation to print the offending token. */
12738 if (last_str)
12739 {
12740 if (str) strcpy (*last_str, str);
12741 else *last_str = NULL;
12742 }
12743 if (res == AARCH64_PARSE_OK)
12744 {
12745 /* If needed, alloc the accepted string then copy in const_str.
12746 Used by aarch64_override_options_after_change_1. */
12747 if (!accepted_branch_protection_string)
12748 accepted_branch_protection_string = (char *) xmalloc (
12749 BRANCH_PROTECT_STR_MAX
12750 + 1);
12751 strncpy (accepted_branch_protection_string, const_str,
12752 BRANCH_PROTECT_STR_MAX + 1);
12753 /* Forcibly null-terminate. */
12754 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
12755 }
12756 return res;
12757 }
12758
12759 static bool
12760 aarch64_validate_mbranch_protection (const char *const_str)
12761 {
12762 char *str = (char *) xmalloc (strlen (const_str) + 1);
12763 enum aarch64_parse_opt_result res =
12764 aarch64_parse_branch_protection (const_str, &str);
12765 if (res == AARCH64_PARSE_INVALID_ARG)
12766 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
12767 else if (res == AARCH64_PARSE_MISSING_ARG)
12768 error ("missing argument for %<-mbranch-protection=%>");
12769 free (str);
12770 return res == AARCH64_PARSE_OK;
12771 }
12772
12773 /* Validate a command-line -march option. Parse the arch and extensions
12774 (if any) specified in STR and throw errors if appropriate. Put the
12775 results, if they are valid, in RES and ISA_FLAGS. Return whether the
12776 option is valid. */
12777
12778 static bool
12779 aarch64_validate_march (const char *str, const struct processor **res,
12780 uint64_t *isa_flags)
12781 {
12782 std::string invalid_extension;
12783 enum aarch64_parse_opt_result parse_res
12784 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
12785
12786 if (parse_res == AARCH64_PARSE_OK)
12787 return true;
12788
12789 switch (parse_res)
12790 {
12791 case AARCH64_PARSE_MISSING_ARG:
12792 error ("missing arch name in %<-march=%s%>", str);
12793 break;
12794 case AARCH64_PARSE_INVALID_ARG:
12795 error ("unknown value %qs for %<-march%>", str);
12796 aarch64_print_hint_for_arch (str);
12797 break;
12798 case AARCH64_PARSE_INVALID_FEATURE:
12799 error ("invalid feature modifier %qs in %<-march=%s%>",
12800 invalid_extension.c_str (), str);
12801 aarch64_print_hint_for_extensions (invalid_extension);
12802 break;
12803 default:
12804 gcc_unreachable ();
12805 }
12806
12807 return false;
12808 }
12809
12810 /* Validate a command-line -mtune option. Parse the cpu
12811 specified in STR and throw errors if appropriate. Put the
12812 result, if it is valid, in RES. Return whether the option is
12813 valid. */
12814
12815 static bool
12816 aarch64_validate_mtune (const char *str, const struct processor **res)
12817 {
12818 enum aarch64_parse_opt_result parse_res
12819 = aarch64_parse_tune (str, res);
12820
12821 if (parse_res == AARCH64_PARSE_OK)
12822 return true;
12823
12824 switch (parse_res)
12825 {
12826 case AARCH64_PARSE_MISSING_ARG:
12827 error ("missing cpu name in %<-mtune=%s%>", str);
12828 break;
12829 case AARCH64_PARSE_INVALID_ARG:
12830 error ("unknown value %qs for %<-mtune%>", str);
12831 aarch64_print_hint_for_core (str);
12832 break;
12833 default:
12834 gcc_unreachable ();
12835 }
12836 return false;
12837 }
12838
12839 /* Return the CPU corresponding to the enum CPU.
12840 If it doesn't specify a cpu, return the default. */
12841
12842 static const struct processor *
12843 aarch64_get_tune_cpu (enum aarch64_processor cpu)
12844 {
12845 if (cpu != aarch64_none)
12846 return &all_cores[cpu];
12847
12848 /* The & 0x3f is to extract the bottom 6 bits that encode the
12849 default cpu as selected by the --with-cpu GCC configure option
12850 in config.gcc.
12851 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
12852 flags mechanism should be reworked to make it more sane. */
12853 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12854 }
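/* Packing assumed here and in aarch64_override_options: bits [5:0] of
   TARGET_CPU_DEFAULT hold the aarch64_processor value of the
   configure-time CPU, while the remaining upper bits (extracted there
   with ">> 6") hold that CPU's default ISA flags.  */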
12855
12856 /* Return the architecture corresponding to the enum ARCH.
12857 If it doesn't specify a valid architecture, return the default. */
12858
12859 static const struct processor *
12860 aarch64_get_arch (enum aarch64_arch arch)
12861 {
12862 if (arch != aarch64_no_arch)
12863 return &all_architectures[arch];
12864
12865 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12866
12867 return &all_architectures[cpu->arch];
12868 }
12869
12870 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
12871
12872 static poly_uint16
12873 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
12874 {
12875 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
12876 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
12877 deciding which .md file patterns to use and when deciding whether
12878 something is a legitimate address or constant. */
12879 if (value == SVE_SCALABLE || value == SVE_128)
12880 return poly_uint16 (2, 2);
12881 else
12882 return (int) value / 64;
12883 }
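/* Worked example: -msve-vector-bits=256 gives 256 / 64 = 4, i.e. a fixed
   vector of four 64-bit granules, whereas -msve-vector-bits=scalable (and,
   per the comment above, =128) gives the runtime-variable
   poly_uint16 (2, 2).  */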
12884
12885 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
12886 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
12887 tuning structs. In particular it must set selected_tune and
12888 aarch64_isa_flags that define the available ISA features and tuning
12889 decisions. It must also set selected_arch as this will be used to
12890 output the .arch asm tags for each function. */
12891
12892 static void
12893 aarch64_override_options (void)
12894 {
12895 uint64_t cpu_isa = 0;
12896 uint64_t arch_isa = 0;
12897 aarch64_isa_flags = 0;
12898
12899 bool valid_cpu = true;
12900 bool valid_tune = true;
12901 bool valid_arch = true;
12902
12903 selected_cpu = NULL;
12904 selected_arch = NULL;
12905 selected_tune = NULL;
12906
12907 if (aarch64_branch_protection_string)
12908 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
12909
12910 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
12911 If either of -march or -mtune is given, they override their
12912 respective component of -mcpu. */
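/* For example, -mcpu=cortex-a57 behaves like -march=armv8-a
   -mtune=cortex-a57, and adding an explicit -march=armv8.2-a or
   -mtune=cortex-a72 on the command line replaces the corresponding half
   of that expansion.  */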
12913 if (aarch64_cpu_string)
12914 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
12915 &cpu_isa);
12916
12917 if (aarch64_arch_string)
12918 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
12919 &arch_isa);
12920
12921 if (aarch64_tune_string)
12922 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
12923
12924 #ifdef SUBTARGET_OVERRIDE_OPTIONS
12925 SUBTARGET_OVERRIDE_OPTIONS;
12926 #endif
12927
12928 /* If the user did not specify a processor, choose the default
12929 one for them. This will be the CPU set during configuration using
12930 --with-cpu, otherwise it is "generic". */
12931 if (!selected_cpu)
12932 {
12933 if (selected_arch)
12934 {
12935 selected_cpu = &all_cores[selected_arch->ident];
12936 aarch64_isa_flags = arch_isa;
12937 explicit_arch = selected_arch->arch;
12938 }
12939 else
12940 {
12941 /* Get default configure-time CPU. */
12942 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
12943 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
12944 }
12945
12946 if (selected_tune)
12947 explicit_tune_core = selected_tune->ident;
12948 }
12949 /* If both -mcpu and -march are specified check that they are architecturally
12950 compatible, warn if they're not and prefer the -march ISA flags. */
12951 else if (selected_arch)
12952 {
12953 if (selected_arch->arch != selected_cpu->arch)
12954 {
12955 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
12956 all_architectures[selected_cpu->arch].name,
12957 selected_arch->name);
12958 }
12959 aarch64_isa_flags = arch_isa;
12960 explicit_arch = selected_arch->arch;
12961 explicit_tune_core = selected_tune ? selected_tune->ident
12962 : selected_cpu->ident;
12963 }
12964 else
12965 {
12966 /* -mcpu but no -march. */
12967 aarch64_isa_flags = cpu_isa;
12968 explicit_tune_core = selected_tune ? selected_tune->ident
12969 : selected_cpu->ident;
12970 gcc_assert (selected_cpu);
12971 selected_arch = &all_architectures[selected_cpu->arch];
12972 explicit_arch = selected_arch->arch;
12973 }
12974
12975 /* Set the arch as well, as we will need it when outputting
12976 the .arch directive in assembly. */
12977 if (!selected_arch)
12978 {
12979 gcc_assert (selected_cpu);
12980 selected_arch = &all_architectures[selected_cpu->arch];
12981 }
12982
12983 if (!selected_tune)
12984 selected_tune = selected_cpu;
12985
12986 if (aarch64_enable_bti == 2)
12987 {
12988 #ifdef TARGET_ENABLE_BTI
12989 aarch64_enable_bti = 1;
12990 #else
12991 aarch64_enable_bti = 0;
12992 #endif
12993 }
12994
12995 /* Return address signing is currently not supported for ILP32 targets. For
12996 LP64 targets use the configured option in the absence of a command-line
12997 option for -mbranch-protection. */
12998 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
12999 {
13000 #ifdef TARGET_ENABLE_PAC_RET
13001 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
13002 #else
13003 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
13004 #endif
13005 }
13006
13007 #ifndef HAVE_AS_MABI_OPTION
13008 /* The compiler may have been configured with 2.23.* binutils, which does
13009 not have support for ILP32. */
13010 if (TARGET_ILP32)
13011 error ("assembler does not support %<-mabi=ilp32%>");
13012 #endif
13013
13014 /* Convert -msve-vector-bits to a VG count. */
13015 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
13016
13017 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
13018 sorry ("return address signing is only supported for %<-mabi=lp64%>");
13019
13020 /* Make sure we properly set up the explicit options. */
13021 if ((aarch64_cpu_string && valid_cpu)
13022 || (aarch64_tune_string && valid_tune))
13023 gcc_assert (explicit_tune_core != aarch64_none);
13024
13025 if ((aarch64_cpu_string && valid_cpu)
13026 || (aarch64_arch_string && valid_arch))
13027 gcc_assert (explicit_arch != aarch64_no_arch);
13028
13029 /* The pass to insert speculation tracking runs before
13030 shrink-wrapping and the latter does not know how to update the
13031 tracking status. So disable it in this case. */
13032 if (aarch64_track_speculation)
13033 flag_shrink_wrap = 0;
13034
13035 aarch64_override_options_internal (&global_options);
13036
13037 /* Save these options as the default ones in case we push and pop them later
13038 while processing functions with potential target attributes. */
13039 target_option_default_node = target_option_current_node
13040 = build_target_option_node (&global_options);
13041 }
13042
13043 /* Implement targetm.override_options_after_change. */
13044
13045 static void
13046 aarch64_override_options_after_change (void)
13047 {
13048 aarch64_override_options_after_change_1 (&global_options);
13049 }
13050
13051 static struct machine_function *
13052 aarch64_init_machine_status (void)
13053 {
13054 struct machine_function *machine;
13055 machine = ggc_cleared_alloc<machine_function> ();
13056 return machine;
13057 }
13058
13059 void
13060 aarch64_init_expanders (void)
13061 {
13062 init_machine_status = aarch64_init_machine_status;
13063 }
13064
13065 /* Work out the code model to use from OPTS, adjusting for -fpic/-fPIC and diagnosing unsupported combinations. */
13066 static void
13067 initialize_aarch64_code_model (struct gcc_options *opts)
13068 {
13069 if (opts->x_flag_pic)
13070 {
13071 switch (opts->x_aarch64_cmodel_var)
13072 {
13073 case AARCH64_CMODEL_TINY:
13074 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
13075 break;
13076 case AARCH64_CMODEL_SMALL:
13077 #ifdef HAVE_AS_SMALL_PIC_RELOCS
13078 aarch64_cmodel = (flag_pic == 2
13079 ? AARCH64_CMODEL_SMALL_PIC
13080 : AARCH64_CMODEL_SMALL_SPIC);
13081 #else
13082 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
13083 #endif
13084 break;
13085 case AARCH64_CMODEL_LARGE:
13086 sorry ("code model %qs with %<-f%s%>", "large",
13087 opts->x_flag_pic > 1 ? "PIC" : "pic");
13088 break;
13089 default:
13090 gcc_unreachable ();
13091 }
13092 }
13093 else
13094 aarch64_cmodel = opts->x_aarch64_cmodel_var;
13095 }
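/* Sketch of the resulting mapping: with the default -mcmodel=small,
   -fPIC (flag_pic == 2) selects AARCH64_CMODEL_SMALL_PIC while -fpic
   selects AARCH64_CMODEL_SMALL_SPIC when the assembler has the small GOT
   relocations; -mcmodel=tiny becomes TINY_PIC under either flag, and
   -mcmodel=large with PIC is rejected via sorry ().  */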
13096
13097 /* Implement TARGET_OPTION_SAVE. */
13098
13099 static void
13100 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
13101 {
13102 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
13103 ptr->x_aarch64_branch_protection_string
13104 = opts->x_aarch64_branch_protection_string;
13105 }
13106
13107 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
13108 using the information saved in PTR. */
13109
13110 static void
13111 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
13112 {
13113 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
13114 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13115 opts->x_explicit_arch = ptr->x_explicit_arch;
13116 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
13117 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
13118 opts->x_aarch64_branch_protection_string
13119 = ptr->x_aarch64_branch_protection_string;
13120 if (opts->x_aarch64_branch_protection_string)
13121 {
13122 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
13123 NULL);
13124 }
13125
13126 aarch64_override_options_internal (opts);
13127 }
13128
13129 /* Implement TARGET_OPTION_PRINT. */
13130
13131 static void
13132 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
13133 {
13134 const struct processor *cpu
13135 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13136 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
13137 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
13138 std::string extension
13139 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
13140
13141 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
13142 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
13143 arch->name, extension.c_str ());
13144 }
13145
13146 static GTY(()) tree aarch64_previous_fndecl;
13147
13148 void
13149 aarch64_reset_previous_fndecl (void)
13150 {
13151 aarch64_previous_fndecl = NULL;
13152 }
13153
13154 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
13155 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
13156 make sure optab availability predicates are recomputed when necessary. */
13157
13158 void
13159 aarch64_save_restore_target_globals (tree new_tree)
13160 {
13161 if (TREE_TARGET_GLOBALS (new_tree))
13162 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
13163 else if (new_tree == target_option_default_node)
13164 restore_target_globals (&default_target_globals);
13165 else
13166 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
13167 }
13168
13169 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
13170 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
13171 of the function, if such exists. This function may be called multiple
13172 times on a single function so use aarch64_previous_fndecl to avoid
13173 setting up identical state. */
13174
13175 static void
13176 aarch64_set_current_function (tree fndecl)
13177 {
13178 if (!fndecl || fndecl == aarch64_previous_fndecl)
13179 return;
13180
13181 tree old_tree = (aarch64_previous_fndecl
13182 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
13183 : NULL_TREE);
13184
13185 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13186
13187 /* If the current function has no attributes but the previous one did,
13188 use the default node. */
13189 if (!new_tree && old_tree)
13190 new_tree = target_option_default_node;
13191
13192 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
13193 the default have been handled by aarch64_save_restore_target_globals from
13194 aarch64_pragma_target_parse. */
13195 if (old_tree == new_tree)
13196 return;
13197
13198 aarch64_previous_fndecl = fndecl;
13199
13200 /* First set the target options. */
13201 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
13202
13203 aarch64_save_restore_target_globals (new_tree);
13204 }
13205
13206 /* Enum describing the various ways we can handle attributes.
13207 In many cases we can reuse the generic option handling machinery. */
13208
13209 enum aarch64_attr_opt_type
13210 {
13211 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
13212 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
13213 aarch64_attr_enum, /* Attribute sets an enum variable. */
13214 aarch64_attr_custom /* Attribute requires a custom handling function. */
13215 };
13216
13217 /* All the information needed to handle a target attribute.
13218 NAME is the name of the attribute.
13219 ATTR_TYPE specifies the type of behavior of the attribute as described
13220 in the definition of enum aarch64_attr_opt_type.
13221 ALLOW_NEG is true if the attribute supports a "no-" form.
13222 HANDLER is the function that takes the attribute string as an argument.
13223 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
13224 OPT_NUM is the enum specifying the option that the attribute modifies.
13225 This is needed for attributes that mirror the behavior of a command-line
13226 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
13227 aarch64_attr_enum. */
13228
13229 struct aarch64_attribute_info
13230 {
13231 const char *name;
13232 enum aarch64_attr_opt_type attr_type;
13233 bool allow_neg;
13234 bool (*handler) (const char *);
13235 enum opt_code opt_num;
13236 };
13237
13238 /* Handle the ARCH_STR argument to the arch= target attribute. */
13239
13240 static bool
13241 aarch64_handle_attr_arch (const char *str)
13242 {
13243 const struct processor *tmp_arch = NULL;
13244 std::string invalid_extension;
13245 enum aarch64_parse_opt_result parse_res
13246 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
13247
13248 if (parse_res == AARCH64_PARSE_OK)
13249 {
13250 gcc_assert (tmp_arch);
13251 selected_arch = tmp_arch;
13252 explicit_arch = selected_arch->arch;
13253 return true;
13254 }
13255
13256 switch (parse_res)
13257 {
13258 case AARCH64_PARSE_MISSING_ARG:
13259 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
13260 break;
13261 case AARCH64_PARSE_INVALID_ARG:
13262 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
13263 aarch64_print_hint_for_arch (str);
13264 break;
13265 case AARCH64_PARSE_INVALID_FEATURE:
13266 error ("invalid feature modifier %s of value (\"%s\") in "
13267 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13268 aarch64_print_hint_for_extensions (invalid_extension);
13269 break;
13270 default:
13271 gcc_unreachable ();
13272 }
13273
13274 return false;
13275 }
13276
13277 /* Handle the argument CPU_STR to the cpu= target attribute. */
13278
13279 static bool
13280 aarch64_handle_attr_cpu (const char *str)
13281 {
13282 const struct processor *tmp_cpu = NULL;
13283 std::string invalid_extension;
13284 enum aarch64_parse_opt_result parse_res
13285 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
13286
13287 if (parse_res == AARCH64_PARSE_OK)
13288 {
13289 gcc_assert (tmp_cpu);
13290 selected_tune = tmp_cpu;
13291 explicit_tune_core = selected_tune->ident;
13292
13293 selected_arch = &all_architectures[tmp_cpu->arch];
13294 explicit_arch = selected_arch->arch;
13295 return true;
13296 }
13297
13298 switch (parse_res)
13299 {
13300 case AARCH64_PARSE_MISSING_ARG:
13301 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
13302 break;
13303 case AARCH64_PARSE_INVALID_ARG:
13304 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
13305 aarch64_print_hint_for_core (str);
13306 break;
13307 case AARCH64_PARSE_INVALID_FEATURE:
13308 error ("invalid feature modifier %s of value (\"%s\") in "
13309 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13310 aarch64_print_hint_for_extensions (invalid_extension);
13311 break;
13312 default:
13313 gcc_unreachable ();
13314 }
13315
13316 return false;
13317 }
13318
13319 /* Handle the argument STR to the branch-protection= attribute. */
13320
13321 static bool
13322 aarch64_handle_attr_branch_protection (const char* str)
13323 {
13324 char *err_str = (char *) xmalloc (strlen (str) + 1);
13325 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
13326 &err_str);
13327 bool success = false;
13328 switch (res)
13329 {
13330 case AARCH64_PARSE_MISSING_ARG:
13331 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
13332 " attribute");
13333 break;
13334 case AARCH64_PARSE_INVALID_ARG:
13335 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
13336 "=\")%> pragma or attribute", err_str);
13337 break;
13338 case AARCH64_PARSE_OK:
13339 success = true;
13340 /* Fall through. */
13341 case AARCH64_PARSE_INVALID_FEATURE:
13342 break;
13343 default:
13344 gcc_unreachable ();
13345 }
13346 free (err_str);
13347 return success;
13348 }
13349
13350 /* Handle the argument STR to the tune= target attribute. */
13351
13352 static bool
13353 aarch64_handle_attr_tune (const char *str)
13354 {
13355 const struct processor *tmp_tune = NULL;
13356 enum aarch64_parse_opt_result parse_res
13357 = aarch64_parse_tune (str, &tmp_tune);
13358
13359 if (parse_res == AARCH64_PARSE_OK)
13360 {
13361 gcc_assert (tmp_tune);
13362 selected_tune = tmp_tune;
13363 explicit_tune_core = selected_tune->ident;
13364 return true;
13365 }
13366
13367 switch (parse_res)
13368 {
13369 case AARCH64_PARSE_INVALID_ARG:
13370 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
13371 aarch64_print_hint_for_core (str);
13372 break;
13373 default:
13374 gcc_unreachable ();
13375 }
13376
13377 return false;
13378 }
13379
13380 /* Parse an architecture extensions target attribute string specified in STR.
13381 For example "+fp+nosimd". Show any errors if needed. Return TRUE
13382 if successful. Update aarch64_isa_flags to reflect the ISA features
13383 modified. */
13384
13385 static bool
13386 aarch64_handle_attr_isa_flags (char *str)
13387 {
13388 enum aarch64_parse_opt_result parse_res;
13389 uint64_t isa_flags = aarch64_isa_flags;
13390
13391 /* We allow "+nothing" in the beginning to clear out all architectural
13392 features if the user wants to handpick specific features. */
13393 if (strncmp ("+nothing", str, 8) == 0)
13394 {
13395 isa_flags = 0;
13396 str += 8;
13397 }
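/* For example (illustrative), __attribute__ ((target ("+nothing+simd")))
   discards the command-line ISA flags here and then re-enables only the
   features implied by "+simd" via aarch64_parse_extension below.  */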
13398
13399 std::string invalid_extension;
13400 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
13401
13402 if (parse_res == AARCH64_PARSE_OK)
13403 {
13404 aarch64_isa_flags = isa_flags;
13405 return true;
13406 }
13407
13408 switch (parse_res)
13409 {
13410 case AARCH64_PARSE_MISSING_ARG:
13411 error ("missing value in %<target()%> pragma or attribute");
13412 break;
13413
13414 case AARCH64_PARSE_INVALID_FEATURE:
13415 error ("invalid feature modifier %s of value (\"%s\") in "
13416 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13417 break;
13418
13419 default:
13420 gcc_unreachable ();
13421 }
13422
13423 return false;
13424 }
13425
13426 /* The target attributes that we support. On top of these we also support just
13427 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
13428 handled explicitly in aarch64_process_one_target_attr. */
13429
13430 static const struct aarch64_attribute_info aarch64_attributes[] =
13431 {
13432 { "general-regs-only", aarch64_attr_mask, false, NULL,
13433 OPT_mgeneral_regs_only },
13434 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
13435 OPT_mfix_cortex_a53_835769 },
13436 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
13437 OPT_mfix_cortex_a53_843419 },
13438 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
13439 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
13440 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
13441 OPT_momit_leaf_frame_pointer },
13442 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
13443 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
13444 OPT_march_ },
13445 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
13446 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
13447 OPT_mtune_ },
13448 { "branch-protection", aarch64_attr_custom, false,
13449 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
13450 { "sign-return-address", aarch64_attr_enum, false, NULL,
13451 OPT_msign_return_address_ },
13452 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
13453 };
13454
13455 /* Parse ARG_STR which contains the definition of one target attribute.
13456 Show appropriate errors if any or return true if the attribute is valid. */
13457
13458 static bool
13459 aarch64_process_one_target_attr (char *arg_str)
13460 {
13461 bool invert = false;
13462
13463 size_t len = strlen (arg_str);
13464
13465 if (len == 0)
13466 {
13467 error ("malformed %<target()%> pragma or attribute");
13468 return false;
13469 }
13470
13471 char *str_to_check = (char *) alloca (len + 1);
13472 strcpy (str_to_check, arg_str);
13473
13474 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
13475 It is easier to detect and handle it explicitly here rather than going
13476 through the machinery for the rest of the target attributes in this
13477 function. */
13478 if (*str_to_check == '+')
13479 return aarch64_handle_attr_isa_flags (str_to_check);
13480
13481 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
13482 {
13483 invert = true;
13484 str_to_check += 3;
13485 }
13486 char *arg = strchr (str_to_check, '=');
13487
13488 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
13489 and point ARG to "foo". */
13490 if (arg)
13491 {
13492 *arg = '\0';
13493 arg++;
13494 }
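/* To illustrate: "no-strict-align" sets INVERT and leaves "strict-align"
   to be looked up below, while "cmodel=small" is split at the '=' so that
   STR_TO_CHECK is "cmodel" and ARG points at "small".  */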
13495 const struct aarch64_attribute_info *p_attr;
13496 bool found = false;
13497 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
13498 {
13499 /* If the names don't match up, or the user has given an argument
13500 to an attribute that doesn't accept one, or didn't give an argument
13501 to an attribute that expects one, fail to match. */
13502 if (strcmp (str_to_check, p_attr->name) != 0)
13503 continue;
13504
13505 found = true;
13506 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
13507 || p_attr->attr_type == aarch64_attr_enum;
13508
13509 if (attr_need_arg_p ^ (arg != NULL))
13510 {
13511 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
13512 return false;
13513 }
13514
13515 /* If the name matches but the attribute does not allow "no-" versions
13516 then we can't match. */
13517 if (invert && !p_attr->allow_neg)
13518 {
13519 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
13520 return false;
13521 }
13522
13523 switch (p_attr->attr_type)
13524 {
13525 /* Has a custom handler registered.
13526 For example, cpu=, arch=, tune=. */
13527 case aarch64_attr_custom:
13528 gcc_assert (p_attr->handler);
13529 if (!p_attr->handler (arg))
13530 return false;
13531 break;
13532
13533 /* Either set or unset a boolean option. */
13534 case aarch64_attr_bool:
13535 {
13536 struct cl_decoded_option decoded;
13537
13538 generate_option (p_attr->opt_num, NULL, !invert,
13539 CL_TARGET, &decoded);
13540 aarch64_handle_option (&global_options, &global_options_set,
13541 &decoded, input_location);
13542 break;
13543 }
13544 /* Set or unset a bit in the target_flags. aarch64_handle_option
13545 should know what mask to apply given the option number. */
13546 case aarch64_attr_mask:
13547 {
13548 struct cl_decoded_option decoded;
13549 /* We only need to specify the option number.
13550 aarch64_handle_option will know which mask to apply. */
13551 decoded.opt_index = p_attr->opt_num;
13552 decoded.value = !invert;
13553 aarch64_handle_option (&global_options, &global_options_set,
13554 &decoded, input_location);
13555 break;
13556 }
13557 /* Use the option setting machinery to set an option to an enum. */
13558 case aarch64_attr_enum:
13559 {
13560 gcc_assert (arg);
13561 bool valid;
13562 int value;
13563 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
13564 &value, CL_TARGET);
13565 if (valid)
13566 {
13567 set_option (&global_options, NULL, p_attr->opt_num, value,
13568 NULL, DK_UNSPECIFIED, input_location,
13569 global_dc);
13570 }
13571 else
13572 {
13573 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
13574 }
13575 break;
13576 }
13577 default:
13578 gcc_unreachable ();
13579 }
13580 }
13581
13582 /* If we reached here we either have found an attribute and validated
13583 it or didn't match any. If we matched an attribute but its arguments
13584 were malformed we will have returned false already. */
13585 return found;
13586 }
13587
13588 /* Count how many times the character C appears in
13589 NULL-terminated string STR. */
13590
13591 static unsigned int
13592 num_occurences_in_str (char c, char *str)
13593 {
13594 unsigned int res = 0;
13595 while (*str != '\0')
13596 {
13597 if (*str == c)
13598 res++;
13599
13600 str++;
13601 }
13602
13603 return res;
13604 }
13605
13606 /* Parse the tree in ARGS that contains the target attribute information
13607 and update the global target options space. */
13608
13609 bool
13610 aarch64_process_target_attr (tree args)
13611 {
13612 if (TREE_CODE (args) == TREE_LIST)
13613 {
13614 do
13615 {
13616 tree head = TREE_VALUE (args);
13617 if (head)
13618 {
13619 if (!aarch64_process_target_attr (head))
13620 return false;
13621 }
13622 args = TREE_CHAIN (args);
13623 } while (args);
13624
13625 return true;
13626 }
13627
13628 if (TREE_CODE (args) != STRING_CST)
13629 {
13630 error ("attribute %<target%> argument not a string");
13631 return false;
13632 }
13633
13634 size_t len = strlen (TREE_STRING_POINTER (args));
13635 char *str_to_check = (char *) alloca (len + 1);
13636 strcpy (str_to_check, TREE_STRING_POINTER (args));
13637
13638 if (len == 0)
13639 {
13640 error ("malformed %<target()%> pragma or attribute");
13641 return false;
13642 }
13643
13644 /* Used to catch empty tokens between commas, i.e.
13645 attribute ((target ("attr1,,attr2"))). */
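/* strtok_r treats consecutive separators as one, so "attr1,,attr2"
   produces only two tokens despite containing two commas; the
   num_attrs != num_commas + 1 check further down catches exactly this
   case.  */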
13646 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
13647
13648 /* Handle multiple target attributes separated by ','. */
13649 char *token = strtok_r (str_to_check, ",", &str_to_check);
13650
13651 unsigned int num_attrs = 0;
13652 while (token)
13653 {
13654 num_attrs++;
13655 if (!aarch64_process_one_target_attr (token))
13656 {
13657 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
13658 return false;
13659 }
13660
13661 token = strtok_r (NULL, ",", &str_to_check);
13662 }
13663
13664 if (num_attrs != num_commas + 1)
13665 {
13666 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
13667 return false;
13668 }
13669
13670 return true;
13671 }
13672
13673 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
13674 process attribute ((target ("..."))). */
13675
13676 static bool
13677 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
13678 {
13679 struct cl_target_option cur_target;
13680 bool ret;
13681 tree old_optimize;
13682 tree new_target, new_optimize;
13683 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13684
13685 /* If what we're processing is the current pragma string then the
13686 target option node is already stored in target_option_current_node
13687 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
13688 having to re-parse the string. This is especially useful to keep
13689 arm_neon.h compile times down since that header contains a lot
13690 of intrinsics enclosed in pragmas. */
13691 if (!existing_target && args == current_target_pragma)
13692 {
13693 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
13694 return true;
13695 }
13696 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13697
13698 old_optimize = build_optimization_node (&global_options);
13699 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13700
13701 /* If the function changed the optimization levels as well as setting
13702 target options, start with the optimizations specified. */
13703 if (func_optimize && func_optimize != old_optimize)
13704 cl_optimization_restore (&global_options,
13705 TREE_OPTIMIZATION (func_optimize));
13706
13707 /* Save the current target options to restore at the end. */
13708 cl_target_option_save (&cur_target, &global_options);
13709
13710 /* If fndecl already has some target attributes applied to it, unpack
13711 them so that we add this attribute on top of them, rather than
13712 overwriting them. */
13713 if (existing_target)
13714 {
13715 struct cl_target_option *existing_options
13716 = TREE_TARGET_OPTION (existing_target);
13717
13718 if (existing_options)
13719 cl_target_option_restore (&global_options, existing_options);
13720 }
13721 else
13722 cl_target_option_restore (&global_options,
13723 TREE_TARGET_OPTION (target_option_current_node));
13724
13725 ret = aarch64_process_target_attr (args);
13726
13727 /* Set up any additional state. */
13728 if (ret)
13729 {
13730 aarch64_override_options_internal (&global_options);
13731 /* Initialize SIMD builtins if we haven't already.
13732 Set current_target_pragma to NULL for the duration so that
13733 the builtin initialization code doesn't try to tag the functions
13734 being built with the attributes specified by any current pragma, thus
13735 going into an infinite recursion. */
13736 if (TARGET_SIMD)
13737 {
13738 tree saved_current_target_pragma = current_target_pragma;
13739 current_target_pragma = NULL;
13740 aarch64_init_simd_builtins ();
13741 current_target_pragma = saved_current_target_pragma;
13742 }
13743 new_target = build_target_option_node (&global_options);
13744 }
13745 else
13746 new_target = NULL;
13747
13748 new_optimize = build_optimization_node (&global_options);
13749
13750 if (fndecl && ret)
13751 {
13752 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
13753
13754 if (old_optimize != new_optimize)
13755 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
13756 }
13757
13758 cl_target_option_restore (&global_options, &cur_target);
13759
13760 if (old_optimize != new_optimize)
13761 cl_optimization_restore (&global_options,
13762 TREE_OPTIMIZATION (old_optimize));
13763 return ret;
13764 }
13765
13766 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
13767 tri-bool options (yes, no, don't care) and the default value is
13768 DEF, determine whether inlining should be allowed. */
13769
13770 static bool
13771 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
13772 int dont_care, int def)
13773 {
13774 /* If the callee doesn't care, always allow inlining. */
13775 if (callee == dont_care)
13776 return true;
13777
13778 /* If the caller doesn't care, always allow inlining. */
13779 if (caller == dont_care)
13780 return true;
13781
13782 /* Otherwise, allow inlining if the callee and caller values
13783 agree, or if the callee is using the default value. */
13784 return (callee == caller || callee == def);
13785 }
13786
13787 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
13788 to inline CALLEE into CALLER based on target-specific info.
13789 Make sure that the caller and callee have compatible architectural
13790 features. Then go through the other possible target attributes
13791 and see if they can block inlining. Try not to reject always_inline
13792 callees unless they are incompatible architecturally. */
13793
13794 static bool
13795 aarch64_can_inline_p (tree caller, tree callee)
13796 {
13797 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
13798 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
13799
13800 struct cl_target_option *caller_opts
13801 = TREE_TARGET_OPTION (caller_tree ? caller_tree
13802 : target_option_default_node);
13803
13804 struct cl_target_option *callee_opts
13805 = TREE_TARGET_OPTION (callee_tree ? callee_tree
13806 : target_option_default_node);
13807
13808 /* Callee's ISA flags should be a subset of the caller's. */
13809 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
13810 != callee_opts->x_aarch64_isa_flags)
13811 return false;
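/* For example, a callee declared with __attribute__ ((target ("+sve")))
   cannot be inlined into a caller compiled with only +simd, since the
   callee's ISA bits are not a subset of the caller's.  */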
13812
13813 /* Allow non-strict aligned functions to be inlined into strict
13814 aligned ones. */
13815 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
13816 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
13817 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
13818 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
13819 return false;
13820
13821 bool always_inline = lookup_attribute ("always_inline",
13822 DECL_ATTRIBUTES (callee));
13823
13824 /* If the architectural features match up and the callee is always_inline
13825 then the other attributes don't matter. */
13826 if (always_inline)
13827 return true;
13828
13829 if (caller_opts->x_aarch64_cmodel_var
13830 != callee_opts->x_aarch64_cmodel_var)
13831 return false;
13832
13833 if (caller_opts->x_aarch64_tls_dialect
13834 != callee_opts->x_aarch64_tls_dialect)
13835 return false;
13836
13837 /* Honour explicit requests to workaround errata. */
13838 if (!aarch64_tribools_ok_for_inlining_p (
13839 caller_opts->x_aarch64_fix_a53_err835769,
13840 callee_opts->x_aarch64_fix_a53_err835769,
13841 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
13842 return false;
13843
13844 if (!aarch64_tribools_ok_for_inlining_p (
13845 caller_opts->x_aarch64_fix_a53_err843419,
13846 callee_opts->x_aarch64_fix_a53_err843419,
13847 2, TARGET_FIX_ERR_A53_843419))
13848 return false;
13849
13850 /* If the user explicitly specified -momit-leaf-frame-pointer for the
13851 caller and callee and they don't match up, reject inlining. */
13852 if (!aarch64_tribools_ok_for_inlining_p (
13853 caller_opts->x_flag_omit_leaf_frame_pointer,
13854 callee_opts->x_flag_omit_leaf_frame_pointer,
13855 2, 1))
13856 return false;
13857
13858 /* If the callee has specific tuning overrides, respect them. */
13859 if (callee_opts->x_aarch64_override_tune_string != NULL
13860 && caller_opts->x_aarch64_override_tune_string == NULL)
13861 return false;
13862
13863 /* If the user specified tuning override strings for the
13864 caller and callee and they don't match up, reject inlining.
13865 We just do a string compare here, we don't analyze the meaning
13866 of the string, as it would be too costly for little gain. */
13867 if (callee_opts->x_aarch64_override_tune_string
13868 && caller_opts->x_aarch64_override_tune_string
13869 && (strcmp (callee_opts->x_aarch64_override_tune_string,
13870 caller_opts->x_aarch64_override_tune_string) != 0))
13871 return false;
13872
13873 return true;
13874 }
13875
13876 /* Return true if SYMBOL_REF X binds locally. */
13877
13878 static bool
13879 aarch64_symbol_binds_local_p (const_rtx x)
13880 {
13881 return (SYMBOL_REF_DECL (x)
13882 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
13883 : SYMBOL_REF_LOCAL_P (x));
13884 }
13885
13886 /* Return true if SYMBOL_REF X is thread local.  */
13887 static bool
13888 aarch64_tls_symbol_p (rtx x)
13889 {
13890 if (! TARGET_HAVE_TLS)
13891 return false;
13892
13893 if (GET_CODE (x) != SYMBOL_REF)
13894 return false;
13895
13896 return SYMBOL_REF_TLS_MODEL (x) != 0;
13897 }
13898
13899 /* Classify a TLS symbol into one of the TLS kinds. */
13900 enum aarch64_symbol_type
13901 aarch64_classify_tls_symbol (rtx x)
13902 {
13903 enum tls_model tls_kind = tls_symbolic_operand_type (x);
13904
13905 switch (tls_kind)
13906 {
13907 case TLS_MODEL_GLOBAL_DYNAMIC:
13908 case TLS_MODEL_LOCAL_DYNAMIC:
13909 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
13910
13911 case TLS_MODEL_INITIAL_EXEC:
13912 switch (aarch64_cmodel)
13913 {
13914 case AARCH64_CMODEL_TINY:
13915 case AARCH64_CMODEL_TINY_PIC:
13916 return SYMBOL_TINY_TLSIE;
13917 default:
13918 return SYMBOL_SMALL_TLSIE;
13919 }
13920
13921 case TLS_MODEL_LOCAL_EXEC:
13922 if (aarch64_tls_size == 12)
13923 return SYMBOL_TLSLE12;
13924 else if (aarch64_tls_size == 24)
13925 return SYMBOL_TLSLE24;
13926 else if (aarch64_tls_size == 32)
13927 return SYMBOL_TLSLE32;
13928 else if (aarch64_tls_size == 48)
13929 return SYMBOL_TLSLE48;
13930 else
13931 gcc_unreachable ();
13932
13933 case TLS_MODEL_EMULATED:
13934 case TLS_MODEL_NONE:
13935 return SYMBOL_FORCE_TO_MEM;
13936
13937 default:
13938 gcc_unreachable ();
13939 }
13940 }
13941
13942 /* Return the correct method for accessing X + OFFSET, where X is either
13943 a SYMBOL_REF or LABEL_REF. */
13944
13945 enum aarch64_symbol_type
13946 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
13947 {
13948 if (GET_CODE (x) == LABEL_REF)
13949 {
13950 switch (aarch64_cmodel)
13951 {
13952 case AARCH64_CMODEL_LARGE:
13953 return SYMBOL_FORCE_TO_MEM;
13954
13955 case AARCH64_CMODEL_TINY_PIC:
13956 case AARCH64_CMODEL_TINY:
13957 return SYMBOL_TINY_ABSOLUTE;
13958
13959 case AARCH64_CMODEL_SMALL_SPIC:
13960 case AARCH64_CMODEL_SMALL_PIC:
13961 case AARCH64_CMODEL_SMALL:
13962 return SYMBOL_SMALL_ABSOLUTE;
13963
13964 default:
13965 gcc_unreachable ();
13966 }
13967 }
13968
13969 if (GET_CODE (x) == SYMBOL_REF)
13970 {
13971 if (aarch64_tls_symbol_p (x))
13972 return aarch64_classify_tls_symbol (x);
13973
13974 switch (aarch64_cmodel)
13975 {
13976 case AARCH64_CMODEL_TINY:
13977 /* When we retrieve symbol + offset address, we have to make sure
13978 the offset does not cause overflow of the final address. But
13979 we have no way of knowing the address of symbol at compile time
13980 so we can't accurately say if the distance between the PC and
13981 symbol + offset is outside the addressible range of +/-1M in the
13982 TINY code model. So we rely on images not being greater than
13983 1M and cap the offset at 1M and anything beyond 1M will have to
13984 be loaded using an alternative mechanism. Furthermore if the
13985 symbol is a weak reference to something that isn't known to
13986 resolve to a symbol in this module, then force to memory. */
13987 if ((SYMBOL_REF_WEAK (x)
13988 && !aarch64_symbol_binds_local_p (x))
13989 || !IN_RANGE (offset, -1048575, 1048575))
13990 return SYMBOL_FORCE_TO_MEM;
13991 return SYMBOL_TINY_ABSOLUTE;
13992
13993 case AARCH64_CMODEL_SMALL:
13994 /* Same reasoning as the tiny code model, but the offset cap here is
13995 4G. */
13996 if ((SYMBOL_REF_WEAK (x)
13997 && !aarch64_symbol_binds_local_p (x))
13998 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
13999 HOST_WIDE_INT_C (4294967264)))
14000 return SYMBOL_FORCE_TO_MEM;
14001 return SYMBOL_SMALL_ABSOLUTE;
14002
14003 case AARCH64_CMODEL_TINY_PIC:
14004 if (!aarch64_symbol_binds_local_p (x))
14005 return SYMBOL_TINY_GOT;
14006 return SYMBOL_TINY_ABSOLUTE;
14007
14008 case AARCH64_CMODEL_SMALL_SPIC:
14009 case AARCH64_CMODEL_SMALL_PIC:
14010 if (!aarch64_symbol_binds_local_p (x))
14011 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
14012 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
14013 return SYMBOL_SMALL_ABSOLUTE;
14014
14015 case AARCH64_CMODEL_LARGE:
14016 /* This is alright even in PIC code as the constant
14017 pool reference is always PC relative and within
14018 the same translation unit. */
14019 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
14020 return SYMBOL_SMALL_ABSOLUTE;
14021 else
14022 return SYMBOL_FORCE_TO_MEM;
14023
14024 default:
14025 gcc_unreachable ();
14026 }
14027 }
14028
14029 /* By default push everything into the constant pool. */
14030 return SYMBOL_FORCE_TO_MEM;
14031 }
14032
14033 bool
14034 aarch64_constant_address_p (rtx x)
14035 {
14036 return (CONSTANT_P (x) && memory_address_p (DImode, x));
14037 }
14038
14039 bool
14040 aarch64_legitimate_pic_operand_p (rtx x)
14041 {
14042 if (GET_CODE (x) == SYMBOL_REF
14043 || (GET_CODE (x) == CONST
14044 && GET_CODE (XEXP (x, 0)) == PLUS
14045 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
14046 return false;
14047
14048 return true;
14049 }
14050
14051 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
14052 that should be rematerialized rather than spilled. */
14053
14054 static bool
14055 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
14056 {
14057 /* Support CSE and rematerialization of common constants. */
14058 if (CONST_INT_P (x)
14059 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
14060 || GET_CODE (x) == CONST_VECTOR)
14061 return true;
14062
14063 /* Do not allow vector struct mode constants for Advanced SIMD.
14064 We could support 0 and -1 easily, but they need support in
14065 aarch64-simd.md. */
14066 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14067 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
14068 return false;
14069
14070 /* Only accept variable-length vector constants if they can be
14071 handled directly.
14072
14073 ??? It would be possible to handle rematerialization of other
14074 constants via secondary reloads. */
14075 if (vec_flags & VEC_ANY_SVE)
14076 return aarch64_simd_valid_immediate (x, NULL);
14077
14078 if (GET_CODE (x) == HIGH)
14079 x = XEXP (x, 0);
14080
14081 /* Accept polynomial constants that can be calculated by using the
14082 destination of a move as the sole temporary. Constants that
14083 require a second temporary cannot be rematerialized (they can't be
14084 forced to memory and also aren't legitimate constants). */
14085 poly_int64 offset;
14086 if (poly_int_rtx_p (x, &offset))
14087 return aarch64_offset_temporaries (false, offset) <= 1;
14088
14089 /* If an offset is being added to something else, we need to allow the
14090 base to be moved into the destination register, meaning that there
14091 are no free temporaries for the offset. */
14092 x = strip_offset (x, &offset);
14093 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
14094 return false;
14095
14096 /* Do not allow const (plus (anchor_symbol, const_int)). */
14097 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
14098 return false;
14099
14100 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
14101 so spilling them is better than rematerialization. */
14102 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
14103 return true;
14104
14105 /* Label references are always constant. */
14106 if (GET_CODE (x) == LABEL_REF)
14107 return true;
14108
14109 return false;
14110 }
14111
14112 rtx
14113 aarch64_load_tp (rtx target)
14114 {
14115 if (!target
14116 || GET_MODE (target) != Pmode
14117 || !register_operand (target, Pmode))
14118 target = gen_reg_rtx (Pmode);
14119
14120 /* Can return in any reg. */
14121 emit_insn (gen_aarch64_load_tp_hard (target));
14122 return target;
14123 }
14124
14125 /* On AAPCS systems, this is the "struct __va_list". */
14126 static GTY(()) tree va_list_type;
14127
14128 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
14129 Return the type to use as __builtin_va_list.
14130
14131 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
14132
14133 struct __va_list
14134 {
14135 void *__stack;
14136 void *__gr_top;
14137 void *__vr_top;
14138 int __gr_offs;
14139 int __vr_offs;
14140 }; */
14141
14142 static tree
14143 aarch64_build_builtin_va_list (void)
14144 {
14145 tree va_list_name;
14146 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14147
14148 /* Create the type. */
14149 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
14150 /* Give it the required name. */
14151 va_list_name = build_decl (BUILTINS_LOCATION,
14152 TYPE_DECL,
14153 get_identifier ("__va_list"),
14154 va_list_type);
14155 DECL_ARTIFICIAL (va_list_name) = 1;
14156 TYPE_NAME (va_list_type) = va_list_name;
14157 TYPE_STUB_DECL (va_list_type) = va_list_name;
14158
14159 /* Create the fields. */
14160 f_stack = build_decl (BUILTINS_LOCATION,
14161 FIELD_DECL, get_identifier ("__stack"),
14162 ptr_type_node);
14163 f_grtop = build_decl (BUILTINS_LOCATION,
14164 FIELD_DECL, get_identifier ("__gr_top"),
14165 ptr_type_node);
14166 f_vrtop = build_decl (BUILTINS_LOCATION,
14167 FIELD_DECL, get_identifier ("__vr_top"),
14168 ptr_type_node);
14169 f_groff = build_decl (BUILTINS_LOCATION,
14170 FIELD_DECL, get_identifier ("__gr_offs"),
14171 integer_type_node);
14172 f_vroff = build_decl (BUILTINS_LOCATION,
14173 FIELD_DECL, get_identifier ("__vr_offs"),
14174 integer_type_node);
14175
14176 /* Tell tree-stdarg pass about our internal offset fields.
14177 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
14178 purposes, to identify whether the code is updating the va_list internal
14179 offset fields in an irregular way. */
14180 va_list_gpr_counter_field = f_groff;
14181 va_list_fpr_counter_field = f_vroff;
14182
14183 DECL_ARTIFICIAL (f_stack) = 1;
14184 DECL_ARTIFICIAL (f_grtop) = 1;
14185 DECL_ARTIFICIAL (f_vrtop) = 1;
14186 DECL_ARTIFICIAL (f_groff) = 1;
14187 DECL_ARTIFICIAL (f_vroff) = 1;
14188
14189 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
14190 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
14191 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
14192 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
14193 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
14194
14195 TYPE_FIELDS (va_list_type) = f_stack;
14196 DECL_CHAIN (f_stack) = f_grtop;
14197 DECL_CHAIN (f_grtop) = f_vrtop;
14198 DECL_CHAIN (f_vrtop) = f_groff;
14199 DECL_CHAIN (f_groff) = f_vroff;
14200
14201 /* Compute its layout. */
14202 layout_type (va_list_type);
14203
14204 return va_list_type;
14205 }
14206
14207 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
14208 static void
14209 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
14210 {
14211 const CUMULATIVE_ARGS *cum;
14212 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14213 tree stack, grtop, vrtop, groff, vroff;
14214 tree t;
14215 int gr_save_area_size = cfun->va_list_gpr_size;
14216 int vr_save_area_size = cfun->va_list_fpr_size;
14217 int vr_offset;
14218
14219 cum = &crtl->args.info;
14220 if (cfun->va_list_gpr_size)
14221 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
14222 cfun->va_list_gpr_size);
14223 if (cfun->va_list_fpr_size)
14224 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
14225 * UNITS_PER_VREG, cfun->va_list_fpr_size);
14226
14227 if (!TARGET_FLOAT)
14228 {
14229 gcc_assert (cum->aapcs_nvrn == 0);
14230 vr_save_area_size = 0;
14231 }
14232
14233 f_stack = TYPE_FIELDS (va_list_type_node);
14234 f_grtop = DECL_CHAIN (f_stack);
14235 f_vrtop = DECL_CHAIN (f_grtop);
14236 f_groff = DECL_CHAIN (f_vrtop);
14237 f_vroff = DECL_CHAIN (f_groff);
14238
14239 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
14240 NULL_TREE);
14241 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
14242 NULL_TREE);
14243 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
14244 NULL_TREE);
14245 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
14246 NULL_TREE);
14247 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
14248 NULL_TREE);
14249
14250 /* Emit code to initialize STACK, which points to the next varargs stack
14251 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
14252 by named arguments. STACK is 8-byte aligned. */
14253 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
14254 if (cum->aapcs_stack_size > 0)
14255 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
14256 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
14257 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14258
14259 /* Emit code to initialize GRTOP, the top of the GR save area.
14260 virtual_incoming_args_rtx should have been 16 byte aligned. */
14261 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
14262 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
14263 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14264
14265 /* Emit code to initialize VRTOP, the top of the VR save area.
14266 This address is gr_save_area_bytes below GRTOP, rounded
14267 down to the next 16-byte boundary. */
14268 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
14269 vr_offset = ROUND_UP (gr_save_area_size,
14270 STACK_BOUNDARY / BITS_PER_UNIT);
14271
14272 if (vr_offset)
14273 t = fold_build_pointer_plus_hwi (t, -vr_offset);
14274 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
14275 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14276
14277 /* Emit code to initialize GROFF, the offset from GRTOP of the
14278 next GPR argument. */
14279 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
14280 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
14281 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14282
14283 /* Likewise emit code to initialize VROFF, the offset from VRTOP
14284 of the next VR argument. */
14285 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
14286 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
14287 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14288 }
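
/* As an illustration (assuming the usual AAPCS64 values of eight X argument
   registers of 8 bytes and eight V argument registers of 16 bytes, and that
   the tree-stdarg pass has not limited the save areas): for a callee such as

     int f (int n, ...);

   N occupies X0, so the expansion above leaves __gr_offs == -56 (seven
   unconsumed X registers), __vr_offs == -128 (eight unconsumed V registers),
   __gr_top and __vr_top pointing just past the respective register save
   areas, and __stack pointing at the first stack-passed anonymous
   argument.  */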
14289
14290 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
14291
14292 static tree
14293 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
14294 gimple_seq *post_p ATTRIBUTE_UNUSED)
14295 {
14296 tree addr;
14297 bool indirect_p;
14298 bool is_ha; /* is HFA or HVA. */
14299 bool dw_align; /* double-word align. */
14300 machine_mode ag_mode = VOIDmode;
14301 int nregs;
14302 machine_mode mode;
14303
14304 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14305 tree stack, f_top, f_off, off, arg, roundup, on_stack;
14306 HOST_WIDE_INT size, rsize, adjust, align;
14307 tree t, u, cond1, cond2;
14308
14309 indirect_p = pass_va_arg_by_reference (type);
14310 if (indirect_p)
14311 type = build_pointer_type (type);
14312
14313 mode = TYPE_MODE (type);
14314
14315 f_stack = TYPE_FIELDS (va_list_type_node);
14316 f_grtop = DECL_CHAIN (f_stack);
14317 f_vrtop = DECL_CHAIN (f_grtop);
14318 f_groff = DECL_CHAIN (f_vrtop);
14319 f_vroff = DECL_CHAIN (f_groff);
14320
14321 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
14322 f_stack, NULL_TREE);
14323 size = int_size_in_bytes (type);
14324
14325 bool abi_break;
14326 align
14327 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
14328
14329 dw_align = false;
14330 adjust = 0;
14331 if (aarch64_vfp_is_call_or_return_candidate (mode,
14332 type,
14333 &ag_mode,
14334 &nregs,
14335 &is_ha))
14336 {
14337 /* No frontends can create types with variable-sized modes, so we
14338 shouldn't be asked to pass or return them. */
14339 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
14340
14341 /* TYPE passed in fp/simd registers. */
14342 if (!TARGET_FLOAT)
14343 aarch64_err_no_fpadvsimd (mode);
14344
14345 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
14346 unshare_expr (valist), f_vrtop, NULL_TREE);
14347 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
14348 unshare_expr (valist), f_vroff, NULL_TREE);
14349
14350 rsize = nregs * UNITS_PER_VREG;
14351
14352 if (is_ha)
14353 {
14354 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
14355 adjust = UNITS_PER_VREG - ag_size;
14356 }
14357 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14358 && size < UNITS_PER_VREG)
14359 {
14360 adjust = UNITS_PER_VREG - size;
14361 }
14362 }
14363 else
14364 {
14365 /* TYPE passed in general registers. */
14366 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
14367 unshare_expr (valist), f_grtop, NULL_TREE);
14368 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
14369 unshare_expr (valist), f_groff, NULL_TREE);
14370 rsize = ROUND_UP (size, UNITS_PER_WORD);
14371 nregs = rsize / UNITS_PER_WORD;
14372
14373 if (align > 8)
14374 {
14375 if (abi_break && warn_psabi)
14376 inform (input_location, "parameter passing for argument of type "
14377 "%qT changed in GCC 9.1", type);
14378 dw_align = true;
14379 }
14380
14381 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14382 && size < UNITS_PER_WORD)
14383 {
14384 adjust = UNITS_PER_WORD - size;
14385 }
14386 }
14387
14388 /* Get a local temporary for the field value. */
14389 off = get_initialized_tmp_var (f_off, pre_p, NULL);
14390
14391 /* Emit code to branch if off >= 0. */
14392 t = build2 (GE_EXPR, boolean_type_node, off,
14393 build_int_cst (TREE_TYPE (off), 0));
14394 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
14395
14396 if (dw_align)
14397 {
14398 /* Emit: offs = (offs + 15) & -16. */
14399 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14400 build_int_cst (TREE_TYPE (off), 15));
14401 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
14402 build_int_cst (TREE_TYPE (off), -16));
14403 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
14404 }
14405 else
14406 roundup = NULL;
14407
14408 /* Update ap.__[g|v]r_offs */
14409 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14410 build_int_cst (TREE_TYPE (off), rsize));
14411 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
14412
14413 /* String up. */
14414 if (roundup)
14415 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14416
14417 /* [cond2] if (ap.__[g|v]r_offs > 0) */
14418 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
14419 build_int_cst (TREE_TYPE (f_off), 0));
14420 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
14421
14422 /* String up: make sure the assignment happens before the use. */
14423 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
14424 COND_EXPR_ELSE (cond1) = t;
14425
14426 /* Prepare the trees handling the argument that is passed on the stack;
14427 the top-level node will be stored in ON_STACK. */
14428 arg = get_initialized_tmp_var (stack, pre_p, NULL);
14429 if (align > 8)
14430 {
14431 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
14432 t = fold_build_pointer_plus_hwi (arg, 15);
14433 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14434 build_int_cst (TREE_TYPE (t), -16));
14435 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
14436 }
14437 else
14438 roundup = NULL;
14439 /* Advance ap.__stack */
14440 t = fold_build_pointer_plus_hwi (arg, size + 7);
14441 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14442 build_int_cst (TREE_TYPE (t), -8));
14443 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
14444 /* String up roundup and advance. */
14445 if (roundup)
14446 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14447 /* String up with arg */
14448 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
14449 /* Big-endianness related address adjustment. */
14450 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14451 && size < UNITS_PER_WORD)
14452 {
14453 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
14454 size_int (UNITS_PER_WORD - size));
14455 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
14456 }
14457
14458 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
14459 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
14460
14461 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
14462 t = off;
14463 if (adjust)
14464 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
14465 build_int_cst (TREE_TYPE (off), adjust));
14466
14467 t = fold_convert (sizetype, t);
14468 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
14469
14470 if (is_ha)
14471 {
14472 /* type ha; // treat as "struct {ftype field[n];}"
14473 ... [computing offs]
14474 for (i = 0; i < nregs; ++i, offs += 16)
14475 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
14476 return ha; */
14477 int i;
14478 tree tmp_ha, field_t, field_ptr_t;
14479
14480 /* Declare a local variable. */
14481 tmp_ha = create_tmp_var_raw (type, "ha");
14482 gimple_add_tmp_var (tmp_ha);
14483
14484 /* Establish the base type. */
14485 switch (ag_mode)
14486 {
14487 case E_SFmode:
14488 field_t = float_type_node;
14489 field_ptr_t = float_ptr_type_node;
14490 break;
14491 case E_DFmode:
14492 field_t = double_type_node;
14493 field_ptr_t = double_ptr_type_node;
14494 break;
14495 case E_TFmode:
14496 field_t = long_double_type_node;
14497 field_ptr_t = long_double_ptr_type_node;
14498 break;
14499 case E_HFmode:
14500 field_t = aarch64_fp16_type_node;
14501 field_ptr_t = aarch64_fp16_ptr_type_node;
14502 break;
14503 case E_V2SImode:
14504 case E_V4SImode:
14505 {
14506 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
14507 field_t = build_vector_type_for_mode (innertype, ag_mode);
14508 field_ptr_t = build_pointer_type (field_t);
14509 }
14510 break;
14511 default:
14512 gcc_assert (0);
14513 }
14514
14515 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
14516 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
14517 addr = t;
14518 t = fold_convert (field_ptr_t, addr);
14519 t = build2 (MODIFY_EXPR, field_t,
14520 build1 (INDIRECT_REF, field_t, tmp_ha),
14521 build1 (INDIRECT_REF, field_t, t));
14522
14523 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
14524 for (i = 1; i < nregs; ++i)
14525 {
14526 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
14527 u = fold_convert (field_ptr_t, addr);
14528 u = build2 (MODIFY_EXPR, field_t,
14529 build2 (MEM_REF, field_t, tmp_ha,
14530 build_int_cst (field_ptr_t,
14531 (i *
14532 int_size_in_bytes (field_t)))),
14533 build1 (INDIRECT_REF, field_t, u));
14534 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
14535 }
14536
14537 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
14538 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
14539 }
14540
14541 COND_EXPR_ELSE (cond2) = t;
14542 addr = fold_convert (build_pointer_type (type), cond1);
14543 addr = build_va_arg_indirect_ref (addr);
14544
14545 if (indirect_p)
14546 addr = build_va_arg_indirect_ref (addr);
14547
14548 return addr;
14549 }
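
/* A worked example of the stack-path arithmetic above, assuming a
   hypothetical 12-byte argument type with 16-byte alignment:

     arg        = ap.__stack;              // say 0x...08
     arg        = (arg + 15) & -16;        // dw_align round-up -> 0x...10
     ap.__stack = (arg + 12 + 7) & -8;     // advance __stack   -> 0x...20

   and the argument itself is then read from ARG (plus the big-endian
   padding adjustment, where applicable).  */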
14550
14551 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
14552
14553 static void
14554 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
14555 const function_arg_info &arg,
14556 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
14557 {
14558 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
14559 CUMULATIVE_ARGS local_cum;
14560 int gr_saved = cfun->va_list_gpr_size;
14561 int vr_saved = cfun->va_list_fpr_size;
14562
14563 /* The caller has advanced CUM up to, but not beyond, the last named
14564 argument. Advance a local copy of CUM past the last "real" named
14565 argument, to find out how many registers are left over. */
14566 local_cum = *cum;
14567 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
14568
14569 /* Find out how many registers we need to save.
14570 Honor tree-stdarg analysis results. */
14571 if (cfun->va_list_gpr_size)
14572 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
14573 cfun->va_list_gpr_size / UNITS_PER_WORD);
14574 if (cfun->va_list_fpr_size)
14575 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
14576 cfun->va_list_fpr_size / UNITS_PER_VREG);
14577
14578 if (!TARGET_FLOAT)
14579 {
14580 gcc_assert (local_cum.aapcs_nvrn == 0);
14581 vr_saved = 0;
14582 }
14583
14584 if (!no_rtl)
14585 {
14586 if (gr_saved > 0)
14587 {
14588 rtx ptr, mem;
14589
14590 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
14591 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
14592 - gr_saved * UNITS_PER_WORD);
14593 mem = gen_frame_mem (BLKmode, ptr);
14594 set_mem_alias_set (mem, get_varargs_alias_set ());
14595
14596 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
14597 mem, gr_saved);
14598 }
14599 if (vr_saved > 0)
14600 {
14601 /* We can't use move_block_from_reg, because it will use
14602 the wrong mode, storing D regs only. */
14603 machine_mode mode = TImode;
14604 int off, i, vr_start;
14605
14606 /* Set OFF to the offset from virtual_incoming_args_rtx of
14607 the first vector register. The VR save area lies below
14608 the GR one, and is aligned to 16 bytes. */
14609 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
14610 STACK_BOUNDARY / BITS_PER_UNIT);
14611 off -= vr_saved * UNITS_PER_VREG;
14612
14613 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
14614 for (i = 0; i < vr_saved; ++i)
14615 {
14616 rtx ptr, mem;
14617
14618 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
14619 mem = gen_frame_mem (mode, ptr);
14620 set_mem_alias_set (mem, get_varargs_alias_set ());
14621 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
14622 off += UNITS_PER_VREG;
14623 }
14624 }
14625 }
14626
14627 /* We don't save the size into *PRETEND_SIZE because we want to avoid
14628 any complication of having crtl->args.pretend_args_size changed. */
14629 cfun->machine->frame.saved_varargs_size
14630 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
14631 STACK_BOUNDARY / BITS_PER_UNIT)
14632 + vr_saved * UNITS_PER_VREG);
14633 }
14634
14635 static void
14636 aarch64_conditional_register_usage (void)
14637 {
14638 int i;
14639 if (!TARGET_FLOAT)
14640 {
14641 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
14642 {
14643 fixed_regs[i] = 1;
14644 call_used_regs[i] = 1;
14645 }
14646 }
14647 if (!TARGET_SVE)
14648 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
14649 {
14650 fixed_regs[i] = 1;
14651 call_used_regs[i] = 1;
14652 }
14653
14654 /* When tracking speculation, we need a couple of call-clobbered registers
14655 to track the speculation state. It would be nice to just use
14656 IP0 and IP1, but currently there are numerous places that just
14657 assume these registers are free for other uses (eg pointer
14658 authentication). */
14659 if (aarch64_track_speculation)
14660 {
14661 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
14662 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
14663 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14664 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14665 }
14666 }
14667
14668 /* Walk down the type tree of TYPE counting consecutive base elements.
14669 If *MODEP is VOIDmode, then set it to the first valid floating point
14670 type. If a non-floating point type is found, or if a floating point
14671 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
14672 otherwise return the count in the sub-tree. */
14673 static int
14674 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
14675 {
14676 machine_mode mode;
14677 HOST_WIDE_INT size;
14678
14679 switch (TREE_CODE (type))
14680 {
14681 case REAL_TYPE:
14682 mode = TYPE_MODE (type);
14683 if (mode != DFmode && mode != SFmode
14684 && mode != TFmode && mode != HFmode)
14685 return -1;
14686
14687 if (*modep == VOIDmode)
14688 *modep = mode;
14689
14690 if (*modep == mode)
14691 return 1;
14692
14693 break;
14694
14695 case COMPLEX_TYPE:
14696 mode = TYPE_MODE (TREE_TYPE (type));
14697 if (mode != DFmode && mode != SFmode
14698 && mode != TFmode && mode != HFmode)
14699 return -1;
14700
14701 if (*modep == VOIDmode)
14702 *modep = mode;
14703
14704 if (*modep == mode)
14705 return 2;
14706
14707 break;
14708
14709 case VECTOR_TYPE:
14710 /* Use V2SImode and V4SImode as representatives of all 64-bit
14711 and 128-bit vector types. */
14712 size = int_size_in_bytes (type);
14713 switch (size)
14714 {
14715 case 8:
14716 mode = V2SImode;
14717 break;
14718 case 16:
14719 mode = V4SImode;
14720 break;
14721 default:
14722 return -1;
14723 }
14724
14725 if (*modep == VOIDmode)
14726 *modep = mode;
14727
14728 /* Vector modes are considered to be opaque: two vectors are
14729 equivalent for the purposes of being homogeneous aggregates
14730 if they are the same size. */
14731 if (*modep == mode)
14732 return 1;
14733
14734 break;
14735
14736 case ARRAY_TYPE:
14737 {
14738 int count;
14739 tree index = TYPE_DOMAIN (type);
14740
14741 /* Can't handle incomplete types or sizes that are not
14742 fixed. */
14743 if (!COMPLETE_TYPE_P (type)
14744 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14745 return -1;
14746
14747 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
14748 if (count == -1
14749 || !index
14750 || !TYPE_MAX_VALUE (index)
14751 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
14752 || !TYPE_MIN_VALUE (index)
14753 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
14754 || count < 0)
14755 return -1;
14756
14757 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
14758 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
14759
14760 /* There must be no padding. */
14761 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14762 count * GET_MODE_BITSIZE (*modep)))
14763 return -1;
14764
14765 return count;
14766 }
14767
14768 case RECORD_TYPE:
14769 {
14770 int count = 0;
14771 int sub_count;
14772 tree field;
14773
14774 /* Can't handle incomplete types or sizes that are not
14775 fixed. */
14776 if (!COMPLETE_TYPE_P (type)
14777 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14778 return -1;
14779
14780 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14781 {
14782 if (TREE_CODE (field) != FIELD_DECL)
14783 continue;
14784
14785 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14786 if (sub_count < 0)
14787 return -1;
14788 count += sub_count;
14789 }
14790
14791 /* There must be no padding. */
14792 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14793 count * GET_MODE_BITSIZE (*modep)))
14794 return -1;
14795
14796 return count;
14797 }
14798
14799 case UNION_TYPE:
14800 case QUAL_UNION_TYPE:
14801 {
14802 /* These aren't very interesting except in a degenerate case. */
14803 int count = 0;
14804 int sub_count;
14805 tree field;
14806
14807 /* Can't handle incomplete types or sizes that are not
14808 fixed. */
14809 if (!COMPLETE_TYPE_P (type)
14810 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14811 return -1;
14812
14813 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14814 {
14815 if (TREE_CODE (field) != FIELD_DECL)
14816 continue;
14817
14818 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14819 if (sub_count < 0)
14820 return -1;
14821 count = count > sub_count ? count : sub_count;
14822 }
14823
14824 /* There must be no padding. */
14825 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14826 count * GET_MODE_BITSIZE (*modep)))
14827 return -1;
14828
14829 return count;
14830 }
14831
14832 default:
14833 break;
14834 }
14835
14836 return -1;
14837 }
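
/* For example, given

     struct s { float x, y, z; };

   the function above returns 3 with *MODEP set to SFmode, whereas

     struct t { float f; double d; };

   returns -1 because the element modes differ.  */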
14838
14839 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
14840 type as described in AAPCS64 \S 4.1.2.
14841
14842 See the comment above aarch64_composite_type_p for the notes on MODE. */
14843
14844 static bool
14845 aarch64_short_vector_p (const_tree type,
14846 machine_mode mode)
14847 {
14848 poly_int64 size = -1;
14849
14850 if (type && TREE_CODE (type) == VECTOR_TYPE)
14851 size = int_size_in_bytes (type);
14852 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
14853 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
14854 size = GET_MODE_SIZE (mode);
14855
14856 return known_eq (size, 8) || known_eq (size, 16);
14857 }
14858
14859 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
14860 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
14861 array types. The C99 floating-point complex types are also considered
14862 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
14863 types, which are GCC extensions and out of the scope of AAPCS64, are
14864 treated as composite types here as well.
14865
14866 Note that MODE itself is not sufficient in determining whether a type
14867 is such a composite type or not. This is because
14868 stor-layout.c:compute_record_mode may have already changed the MODE
14869 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
14870 structure with only one field may have its MODE set to the mode of the
14871 field. Also an integer mode whose size matches the size of the
14872 RECORD_TYPE type may be used to substitute the original mode
14873 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
14874 solely relied on. */
14875
14876 static bool
14877 aarch64_composite_type_p (const_tree type,
14878 machine_mode mode)
14879 {
14880 if (aarch64_short_vector_p (type, mode))
14881 return false;
14882
14883 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
14884 return true;
14885
14886 if (mode == BLKmode
14887 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
14888 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
14889 return true;
14890
14891 return false;
14892 }
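
/* For example, structures, unions, arrays and _Complex double are all
   composite types here, whereas a lone double or a 128-bit short vector
   such as int32x4_t is not.  */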
14893
14894 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
14895 shall be passed or returned in simd/fp register(s) (providing these
14896 parameter passing registers are available).
14897
14898 Upon successful return, *COUNT returns the number of needed registers,
14899 *BASE_MODE returns the mode of the individual register and, when IS_HA
14900 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
14901 floating-point aggregate or a homogeneous short-vector aggregate. */
14902
14903 static bool
14904 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
14905 const_tree type,
14906 machine_mode *base_mode,
14907 int *count,
14908 bool *is_ha)
14909 {
14910 machine_mode new_mode = VOIDmode;
14911 bool composite_p = aarch64_composite_type_p (type, mode);
14912
14913 if (is_ha != NULL) *is_ha = false;
14914
14915 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
14916 || aarch64_short_vector_p (type, mode))
14917 {
14918 *count = 1;
14919 new_mode = mode;
14920 }
14921 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
14922 {
14923 if (is_ha != NULL) *is_ha = true;
14924 *count = 2;
14925 new_mode = GET_MODE_INNER (mode);
14926 }
14927 else if (type && composite_p)
14928 {
14929 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
14930
14931 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
14932 {
14933 if (is_ha != NULL) *is_ha = true;
14934 *count = ag_count;
14935 }
14936 else
14937 return false;
14938 }
14939 else
14940 return false;
14941
14942 *base_mode = new_mode;
14943 return true;
14944 }
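
/* Some illustrative classifications (with HA_MAX_NUM_FLDS being 4 here):

     double                  -> true,  *count = 1, *base_mode = DFmode
     _Complex double         -> true,  *count = 2, *base_mode = DFmode, HA
     struct { float f[3]; }  -> true,  *count = 3, *base_mode = SFmode, HA
     struct { float f[5]; }  -> false (more than HA_MAX_NUM_FLDS members)

   Callers then decide whether enough V registers remain to honour the
   classification.  */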
14945
14946 /* Implement TARGET_STRUCT_VALUE_RTX. */
14947
14948 static rtx
14949 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
14950 int incoming ATTRIBUTE_UNUSED)
14951 {
14952 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
14953 }
14954
14955 /* Implements target hook vector_mode_supported_p. */
14956 static bool
14957 aarch64_vector_mode_supported_p (machine_mode mode)
14958 {
14959 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14960 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
14961 }
14962
14963 /* Return the full-width SVE vector mode for element mode MODE, if one
14964 exists. */
14965 opt_machine_mode
14966 aarch64_full_sve_mode (scalar_mode mode)
14967 {
14968 switch (mode)
14969 {
14970 case E_DFmode:
14971 return VNx2DFmode;
14972 case E_SFmode:
14973 return VNx4SFmode;
14974 case E_HFmode:
14975 return VNx8HFmode;
14976 case E_DImode:
14977 return VNx2DImode;
14978 case E_SImode:
14979 return VNx4SImode;
14980 case E_HImode:
14981 return VNx8HImode;
14982 case E_QImode:
14983 return VNx16QImode;
14984 default:
14985 return opt_machine_mode ();
14986 }
14987 }
14988
14989 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
14990 if it exists. */
14991 opt_machine_mode
14992 aarch64_vq_mode (scalar_mode mode)
14993 {
14994 switch (mode)
14995 {
14996 case E_DFmode:
14997 return V2DFmode;
14998 case E_SFmode:
14999 return V4SFmode;
15000 case E_HFmode:
15001 return V8HFmode;
15002 case E_SImode:
15003 return V4SImode;
15004 case E_HImode:
15005 return V8HImode;
15006 case E_QImode:
15007 return V16QImode;
15008 case E_DImode:
15009 return V2DImode;
15010 default:
15011 return opt_machine_mode ();
15012 }
15013 }
15014
15015 /* Return the appropriate SIMD container mode
15016 for MODE within a vector of WIDTH bits. */
15017 static machine_mode
15018 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
15019 {
15020 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
15021 return aarch64_full_sve_mode (mode).else_mode (word_mode);
15022
15023 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
15024 if (TARGET_SIMD)
15025 {
15026 if (known_eq (width, 128))
15027 return aarch64_vq_mode (mode).else_mode (word_mode);
15028 else
15029 switch (mode)
15030 {
15031 case E_SFmode:
15032 return V2SFmode;
15033 case E_HFmode:
15034 return V4HFmode;
15035 case E_SImode:
15036 return V2SImode;
15037 case E_HImode:
15038 return V4HImode;
15039 case E_QImode:
15040 return V8QImode;
15041 default:
15042 break;
15043 }
15044 }
15045 return word_mode;
15046 }
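
/* For example, with TARGET_SIMD enabled:

     aarch64_simd_container_mode (SFmode, 128) -> V4SFmode
     aarch64_simd_container_mode (SFmode, 64)  -> V2SFmode
     aarch64_simd_container_mode (DFmode, 64)  -> DImode (word_mode; there is
                                                  no 64-bit container for DF)

   and with TARGET_SVE, a width of BITS_PER_SVE_VECTOR maps SFmode to
   VNx4SFmode instead.  */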
15047
15048 /* Return 128-bit container as the preferred SIMD mode for MODE. */
15049 static machine_mode
15050 aarch64_preferred_simd_mode (scalar_mode mode)
15051 {
15052 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
15053 return aarch64_simd_container_mode (mode, bits);
15054 }
15055
15056 /* Return a list of possible vector sizes for the vectorizer
15057 to iterate over. */
15058 static void
15059 aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
15060 {
15061 if (TARGET_SVE)
15062 sizes->safe_push (BYTES_PER_SVE_VECTOR);
15063 sizes->safe_push (16);
15064 sizes->safe_push (8);
15065 }
15066
15067 /* Implement TARGET_MANGLE_TYPE. */
15068
15069 static const char *
15070 aarch64_mangle_type (const_tree type)
15071 {
15072 /* The AArch64 ABI documents say that "__va_list" has to be
15073 mangled as if it is in the "std" namespace. */
15074 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
15075 return "St9__va_list";
15076
15077 /* Half-precision float. */
15078 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
15079 return "Dh";
15080
15081 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
15082 builtin types. */
15083 if (TYPE_NAME (type) != NULL)
15084 return aarch64_mangle_builtin_type (type);
15085
15086 /* Use the default mangling. */
15087 return NULL;
15088 }
15089
15090 /* Find the first rtx_insn before insn that will generate an assembly
15091 instruction. */
15092
15093 static rtx_insn *
15094 aarch64_prev_real_insn (rtx_insn *insn)
15095 {
15096 if (!insn)
15097 return NULL;
15098
15099 do
15100 {
15101 insn = prev_real_insn (insn);
15102 }
15103 while (insn && recog_memoized (insn) < 0);
15104
15105 return insn;
15106 }
15107
15108 static bool
15109 is_madd_op (enum attr_type t1)
15110 {
15111 unsigned int i;
15112 /* A number of these may be AArch32 only. */
15113 enum attr_type mlatypes[] = {
15114 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
15115 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
15116 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
15117 };
15118
15119 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
15120 {
15121 if (t1 == mlatypes[i])
15122 return true;
15123 }
15124
15125 return false;
15126 }
15127
15128 /* Check if there is a register dependency between a load and the insn
15129 for which we hold recog_data. */
15130
15131 static bool
15132 dep_between_memop_and_curr (rtx memop)
15133 {
15134 rtx load_reg;
15135 int opno;
15136
15137 gcc_assert (GET_CODE (memop) == SET);
15138
15139 if (!REG_P (SET_DEST (memop)))
15140 return false;
15141
15142 load_reg = SET_DEST (memop);
15143 for (opno = 1; opno < recog_data.n_operands; opno++)
15144 {
15145 rtx operand = recog_data.operand[opno];
15146 if (REG_P (operand)
15147 && reg_overlap_mentioned_p (load_reg, operand))
15148 return true;
15149
15150 }
15151 return false;
15152 }
15153
15154
15155 /* When working around the Cortex-A53 erratum 835769,
15156 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
15157 instruction and has a preceding memory instruction such that a NOP
15158 should be inserted between them. */
15159
15160 bool
15161 aarch64_madd_needs_nop (rtx_insn* insn)
15162 {
15163 enum attr_type attr_type;
15164 rtx_insn *prev;
15165 rtx body;
15166
15167 if (!TARGET_FIX_ERR_A53_835769)
15168 return false;
15169
15170 if (!INSN_P (insn) || recog_memoized (insn) < 0)
15171 return false;
15172
15173 attr_type = get_attr_type (insn);
15174 if (!is_madd_op (attr_type))
15175 return false;
15176
15177 prev = aarch64_prev_real_insn (insn);
15178 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
15179 Restore recog state to INSN to avoid state corruption. */
15180 extract_constrain_insn_cached (insn);
15181
15182 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
15183 return false;
15184
15185 body = single_set (prev);
15186
15187 /* If the previous insn is a memory op and there is no dependency between
15188 it and the DImode madd, emit a NOP between them. If body is NULL then we
15189 have a complex memory operation, probably a load/store pair.
15190 Be conservative for now and emit a NOP. */
15191 if (GET_MODE (recog_data.operand[0]) == DImode
15192 && (!body || !dep_between_memop_and_curr (body)))
15193 return true;
15194
15195 return false;
15196
15197 }
15198
15199
15200 /* Implement FINAL_PRESCAN_INSN. */
15201
15202 void
15203 aarch64_final_prescan_insn (rtx_insn *insn)
15204 {
15205 if (aarch64_madd_needs_nop (insn))
15206 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
15207 }
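
/* For illustration, with -mfix-cortex-a53-835769 the prescan hook turns a
   sequence such as

     ldr   x1, [x2]
     madd  x0, x3, x4, x0

   into

     ldr   x1, [x2]
     nop                              // between mem op and mult-accumulate
     madd  x0, x3, x4, x0

   whenever the 64-bit multiply-accumulate does not depend on the value
   loaded by the preceding memory instruction (or when that instruction is
   too complex to analyse, such as a load/store pair).  */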
15208
15209
15210 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
15211 instruction. */
15212
15213 bool
15214 aarch64_sve_index_immediate_p (rtx base_or_step)
15215 {
15216 return (CONST_INT_P (base_or_step)
15217 && IN_RANGE (INTVAL (base_or_step), -16, 15));
15218 }
15219
15220 /* Return true if X is a valid immediate for the SVE ADD and SUB
15221 instructions. Negate X first if NEGATE_P is true. */
15222
15223 bool
15224 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
15225 {
15226 rtx elt;
15227
15228 if (!const_vec_duplicate_p (x, &elt)
15229 || !CONST_INT_P (elt))
15230 return false;
15231
15232 HOST_WIDE_INT val = INTVAL (elt);
15233 if (negate_p)
15234 val = -val;
15235 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
15236
15237 if (val & 0xff)
15238 return IN_RANGE (val, 0, 0xff);
15239 return IN_RANGE (val, 0, 0xff00);
15240 }
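
/* For example, replicating 0x2f is valid (an unshifted 8-bit immediate),
   and so is 0x3f00 (an 8-bit immediate shifted left by 8), but 0x101 is
   not, because its low byte is nonzero and the value exceeds 0xff.  */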
15241
15242 /* Return true if X is a valid immediate operand for an SVE logical
15243 instruction such as AND. */
15244
15245 bool
15246 aarch64_sve_bitmask_immediate_p (rtx x)
15247 {
15248 rtx elt;
15249
15250 return (const_vec_duplicate_p (x, &elt)
15251 && CONST_INT_P (elt)
15252 && aarch64_bitmask_imm (INTVAL (elt),
15253 GET_MODE_INNER (GET_MODE (x))));
15254 }
15255
15256 /* Return true if X is a valid immediate for the SVE DUP and CPY
15257 instructions. */
15258
15259 bool
15260 aarch64_sve_dup_immediate_p (rtx x)
15261 {
15262 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
15263 if (!CONST_INT_P (x))
15264 return false;
15265
15266 HOST_WIDE_INT val = INTVAL (x);
15267 if (val & 0xff)
15268 return IN_RANGE (val, -0x80, 0x7f);
15269 return IN_RANGE (val, -0x8000, 0x7f00);
15270 }
15271
15272 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
15273 SIGNED_P says whether the operand is signed rather than unsigned. */
15274
15275 bool
15276 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
15277 {
15278 rtx elt;
15279
15280 return (const_vec_duplicate_p (x, &elt)
15281 && CONST_INT_P (elt)
15282 && (signed_p
15283 ? IN_RANGE (INTVAL (elt), -16, 15)
15284 : IN_RANGE (INTVAL (elt), 0, 127)));
15285 }
15286
15287 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
15288 instruction. Negate X first if NEGATE_P is true. */
15289
15290 bool
15291 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
15292 {
15293 rtx elt;
15294 REAL_VALUE_TYPE r;
15295
15296 if (!const_vec_duplicate_p (x, &elt)
15297 || GET_CODE (elt) != CONST_DOUBLE)
15298 return false;
15299
15300 r = *CONST_DOUBLE_REAL_VALUE (elt);
15301
15302 if (negate_p)
15303 r = real_value_negate (&r);
15304
15305 if (real_equal (&r, &dconst1))
15306 return true;
15307 if (real_equal (&r, &dconsthalf))
15308 return true;
15309 return false;
15310 }
15311
15312 /* Return true if X is a valid immediate operand for an SVE FMUL
15313 instruction. */
15314
15315 bool
15316 aarch64_sve_float_mul_immediate_p (rtx x)
15317 {
15318 rtx elt;
15319
15320 return (const_vec_duplicate_p (x, &elt)
15321 && GET_CODE (elt) == CONST_DOUBLE
15322 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
15323 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
15324 }
15325
15326 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
15327 for the Advanced SIMD operation described by WHICH and INSN. If INFO
15328 is nonnull, use it to describe valid immediates. */
15329 static bool
15330 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
15331 simd_immediate_info *info,
15332 enum simd_immediate_check which,
15333 simd_immediate_info::insn_type insn)
15334 {
15335 /* Try a 4-byte immediate with LSL. */
15336 for (unsigned int shift = 0; shift < 32; shift += 8)
15337 if ((val32 & (0xff << shift)) == val32)
15338 {
15339 if (info)
15340 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15341 simd_immediate_info::LSL, shift);
15342 return true;
15343 }
15344
15345 /* Try a 2-byte immediate with LSL. */
15346 unsigned int imm16 = val32 & 0xffff;
15347 if (imm16 == (val32 >> 16))
15348 for (unsigned int shift = 0; shift < 16; shift += 8)
15349 if ((imm16 & (0xff << shift)) == imm16)
15350 {
15351 if (info)
15352 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
15353 simd_immediate_info::LSL, shift);
15354 return true;
15355 }
15356
15357 /* Try a 4-byte immediate with MSL, except for cases that MVN
15358 can handle. */
15359 if (which == AARCH64_CHECK_MOV)
15360 for (unsigned int shift = 8; shift < 24; shift += 8)
15361 {
15362 unsigned int low = (1 << shift) - 1;
15363 if (((val32 & (0xff << shift)) | low) == val32)
15364 {
15365 if (info)
15366 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15367 simd_immediate_info::MSL, shift);
15368 return true;
15369 }
15370 }
15371
15372 return false;
15373 }
15374
15375 /* Return true if replicating VAL64 is a valid immediate for the
15376 Advanced SIMD operation described by WHICH. If INFO is nonnull,
15377 use it to describe valid immediates. */
15378 static bool
15379 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
15380 simd_immediate_info *info,
15381 enum simd_immediate_check which)
15382 {
15383 unsigned int val32 = val64 & 0xffffffff;
15384 unsigned int val16 = val64 & 0xffff;
15385 unsigned int val8 = val64 & 0xff;
15386
15387 if (val32 == (val64 >> 32))
15388 {
15389 if ((which & AARCH64_CHECK_ORR) != 0
15390 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
15391 simd_immediate_info::MOV))
15392 return true;
15393
15394 if ((which & AARCH64_CHECK_BIC) != 0
15395 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
15396 simd_immediate_info::MVN))
15397 return true;
15398
15399 /* Try using a replicated byte. */
15400 if (which == AARCH64_CHECK_MOV
15401 && val16 == (val32 >> 16)
15402 && val8 == (val16 >> 8))
15403 {
15404 if (info)
15405 *info = simd_immediate_info (QImode, val8);
15406 return true;
15407 }
15408 }
15409
15410 /* Try using a bit-to-bytemask. */
15411 if (which == AARCH64_CHECK_MOV)
15412 {
15413 unsigned int i;
15414 for (i = 0; i < 64; i += 8)
15415 {
15416 unsigned char byte = (val64 >> i) & 0xff;
15417 if (byte != 0 && byte != 0xff)
15418 break;
15419 }
15420 if (i == 64)
15421 {
15422 if (info)
15423 *info = simd_immediate_info (DImode, val64);
15424 return true;
15425 }
15426 }
15427 return false;
15428 }
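
/* For example, replicating 0x0000ff00 matches the shifted form
   MOVI Vd.4S, #0xff, LSL #8; replicating 0x0012ffff matches the
   shifted-ones form MOVI Vd.4S, #0x12, MSL #16; and the 64-bit value
   0xff00ffffff0000ff is representable as a bit-to-bytemask because every
   one of its bytes is either 0x00 or 0xff.  */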
15429
15430 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
15431 instruction. If INFO is nonnull, use it to describe valid immediates. */
15432
15433 static bool
15434 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
15435 simd_immediate_info *info)
15436 {
15437 scalar_int_mode mode = DImode;
15438 unsigned int val32 = val64 & 0xffffffff;
15439 if (val32 == (val64 >> 32))
15440 {
15441 mode = SImode;
15442 unsigned int val16 = val32 & 0xffff;
15443 if (val16 == (val32 >> 16))
15444 {
15445 mode = HImode;
15446 unsigned int val8 = val16 & 0xff;
15447 if (val8 == (val16 >> 8))
15448 mode = QImode;
15449 }
15450 }
15451 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
15452 if (IN_RANGE (val, -0x80, 0x7f))
15453 {
15454 /* DUP with no shift. */
15455 if (info)
15456 *info = simd_immediate_info (mode, val);
15457 return true;
15458 }
15459 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
15460 {
15461 /* DUP with LSL #8. */
15462 if (info)
15463 *info = simd_immediate_info (mode, val);
15464 return true;
15465 }
15466 if (aarch64_bitmask_imm (val64, mode))
15467 {
15468 /* DUPM. */
15469 if (info)
15470 *info = simd_immediate_info (mode, val);
15471 return true;
15472 }
15473 return false;
15474 }
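
/* For example, replicating 0x29 in every byte is a DUP .B immediate,
   replicating 0xff00 in every halfword is a DUP .H immediate with LSL #8
   (the value narrows to -256 in HImode), and replicating the halfword
   0x00ff is handled as a DUPM bitmask immediate.  */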
15475
15476 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
15477 it to describe valid immediates. */
15478
15479 static bool
15480 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
15481 {
15482 if (x == CONST0_RTX (GET_MODE (x)))
15483 {
15484 if (info)
15485 *info = simd_immediate_info (DImode, 0);
15486 return true;
15487 }
15488
15489 /* Analyze the value as a VNx16BImode. This should be relatively
15490 efficient, since rtx_vector_builder has enough built-in capacity
15491 to store all VLA predicate constants without needing the heap. */
15492 rtx_vector_builder builder;
15493 if (!aarch64_get_sve_pred_bits (builder, x))
15494 return false;
15495
15496 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
15497 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
15498 {
15499 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
15500 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
15501 if (pattern != AARCH64_NUM_SVPATTERNS)
15502 {
15503 if (info)
15504 {
15505 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
15506 *info = simd_immediate_info (int_mode, pattern);
15507 }
15508 return true;
15509 }
15510 }
15511 return false;
15512 }
15513
15514 /* Return true if OP is a valid SIMD immediate for the operation
15515 described by WHICH. If INFO is nonnull, use it to describe valid
15516 immediates. */
15517 bool
15518 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
15519 enum simd_immediate_check which)
15520 {
15521 machine_mode mode = GET_MODE (op);
15522 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15523 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15524 return false;
15525
15526 if (vec_flags & VEC_SVE_PRED)
15527 return aarch64_sve_pred_valid_immediate (op, info);
15528
15529 scalar_mode elt_mode = GET_MODE_INNER (mode);
15530 rtx base, step;
15531 unsigned int n_elts;
15532 if (GET_CODE (op) == CONST_VECTOR
15533 && CONST_VECTOR_DUPLICATE_P (op))
15534 n_elts = CONST_VECTOR_NPATTERNS (op);
15535 else if ((vec_flags & VEC_SVE_DATA)
15536 && const_vec_series_p (op, &base, &step))
15537 {
15538 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
15539 if (!aarch64_sve_index_immediate_p (base)
15540 || !aarch64_sve_index_immediate_p (step))
15541 return false;
15542
15543 if (info)
15544 *info = simd_immediate_info (elt_mode, base, step);
15545 return true;
15546 }
15547 else if (GET_CODE (op) == CONST_VECTOR
15548 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
15549 /* N_ELTS set above. */;
15550 else
15551 return false;
15552
15553 scalar_float_mode elt_float_mode;
15554 if (n_elts == 1
15555 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
15556 {
15557 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
15558 if (aarch64_float_const_zero_rtx_p (elt)
15559 || aarch64_float_const_representable_p (elt))
15560 {
15561 if (info)
15562 *info = simd_immediate_info (elt_float_mode, elt);
15563 return true;
15564 }
15565 }
15566
15567 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
15568 if (elt_size > 8)
15569 return false;
15570
15571 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
15572
15573 /* Expand the vector constant out into a byte vector, with the least
15574 significant byte of the register first. */
15575 auto_vec<unsigned char, 16> bytes;
15576 bytes.reserve (n_elts * elt_size);
15577 for (unsigned int i = 0; i < n_elts; i++)
15578 {
15579 /* The vector is provided in gcc endian-neutral fashion.
15580 For aarch64_be Advanced SIMD, it must be laid out in the vector
15581 register in reverse order. */
15582 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
15583 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
15584
15585 if (elt_mode != elt_int_mode)
15586 elt = gen_lowpart (elt_int_mode, elt);
15587
15588 if (!CONST_INT_P (elt))
15589 return false;
15590
15591 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
15592 for (unsigned int byte = 0; byte < elt_size; byte++)
15593 {
15594 bytes.quick_push (elt_val & 0xff);
15595 elt_val >>= BITS_PER_UNIT;
15596 }
15597 }
15598
15599 /* The immediate must repeat every eight bytes. */
15600 unsigned int nbytes = bytes.length ();
15601 for (unsigned i = 8; i < nbytes; ++i)
15602 if (bytes[i] != bytes[i - 8])
15603 return false;
15604
15605 /* Get the repeating 8-byte value as an integer. No endian correction
15606 is needed here because bytes is already in lsb-first order. */
15607 unsigned HOST_WIDE_INT val64 = 0;
15608 for (unsigned int i = 0; i < 8; i++)
15609 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
15610 << (i * BITS_PER_UNIT));
15611
15612 if (vec_flags & VEC_SVE_DATA)
15613 return aarch64_sve_valid_immediate (val64, info);
15614 else
15615 return aarch64_advsimd_valid_immediate (val64, info, which);
15616 }
15617
15618 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
15619 has a step in the range of INDEX. Return the index expression if so,
15620 otherwise return null. */
15621 rtx
15622 aarch64_check_zero_based_sve_index_immediate (rtx x)
15623 {
15624 rtx base, step;
15625 if (const_vec_series_p (x, &base, &step)
15626 && base == const0_rtx
15627 && aarch64_sve_index_immediate_p (step))
15628 return step;
15629 return NULL_RTX;
15630 }
15631
15632 /* Check if immediate shift constants are within range. */
15633 bool
15634 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
15635 {
15636 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
15637 if (left)
15638 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
15639 else
15640 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
15641 }
15642
15643 /* Return the bitmask CONST_INT to select the bits required by a zero extract
15644 operation of width WIDTH at bit position POS. */
15645
15646 rtx
15647 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
15648 {
15649 gcc_assert (CONST_INT_P (width));
15650 gcc_assert (CONST_INT_P (pos));
15651
15652 unsigned HOST_WIDE_INT mask
15653 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
15654 return GEN_INT (mask << UINTVAL (pos));
15655 }
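
/* For example, WIDTH == 8 and POS == 16 yield the mask 0x00ff0000,
   selecting the third byte of the source register.  */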
15656
15657 bool
15658 aarch64_mov_operand_p (rtx x, machine_mode mode)
15659 {
15660 if (GET_CODE (x) == HIGH
15661 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
15662 return true;
15663
15664 if (CONST_INT_P (x))
15665 return true;
15666
15667 if (VECTOR_MODE_P (GET_MODE (x)))
15668 {
15669 /* Require predicate constants to be VNx16BI before RA, so that we
15670 force everything to have a canonical form. */
15671 if (!lra_in_progress
15672 && !reload_completed
15673 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
15674 && GET_MODE (x) != VNx16BImode)
15675 return false;
15676
15677 return aarch64_simd_valid_immediate (x, NULL);
15678 }
15679
15680 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
15681 return true;
15682
15683 if (aarch64_sve_cnt_immediate_p (x))
15684 return true;
15685
15686 return aarch64_classify_symbolic_expression (x)
15687 == SYMBOL_TINY_ABSOLUTE;
15688 }
15689
15690 /* Return a const_int vector of VAL. */
15691 rtx
15692 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
15693 {
15694 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
15695 return gen_const_vec_duplicate (mode, c);
15696 }
15697
15698 /* Check OP is a legal scalar immediate for the MOVI instruction. */
15699
15700 bool
15701 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
15702 {
15703 machine_mode vmode;
15704
15705 vmode = aarch64_simd_container_mode (mode, 64);
15706 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
15707 return aarch64_simd_valid_immediate (op_v, NULL);
15708 }
15709
15710 /* Construct and return a PARALLEL RTX vector with elements numbering the
15711 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
15712 the vector - from the perspective of the architecture. This does not
15713 line up with GCC's perspective on lane numbers, so we end up with
15714 different masks depending on our target endian-ness. The diagram
15715 below may help. We must draw the distinction when building masks
15716 which select one half of the vector. An instruction selecting
15717 architectural low-lanes for a big-endian target must be described using
15718 a mask selecting GCC high-lanes.
15719
15720 Big-Endian Little-Endian
15721
15722 GCC 0 1 2 3 3 2 1 0
15723 | x | x | x | x | | x | x | x | x |
15724 Architecture 3 2 1 0 3 2 1 0
15725
15726 Low Mask: { 2, 3 } { 0, 1 }
15727 High Mask: { 0, 1 } { 2, 3 }
15728
15729 MODE Is the mode of the vector and NUNITS is the number of units in it. */
15730
15731 rtx
15732 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
15733 {
15734 rtvec v = rtvec_alloc (nunits / 2);
15735 int high_base = nunits / 2;
15736 int low_base = 0;
15737 int base;
15738 rtx t1;
15739 int i;
15740
15741 if (BYTES_BIG_ENDIAN)
15742 base = high ? low_base : high_base;
15743 else
15744 base = high ? high_base : low_base;
15745
15746 for (i = 0; i < nunits / 2; i++)
15747 RTVEC_ELT (v, i) = GEN_INT (base + i);
15748
15749 t1 = gen_rtx_PARALLEL (mode, v);
15750 return t1;
15751 }
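
/* For example, for V4SImode (NUNITS == 4) and HIGH == true this returns
   (parallel [2 3]) on little-endian but (parallel [0 1]) on big-endian,
   matching the diagram above.  */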
15752
15753 /* Check OP for validity as a PARALLEL RTX vector with elements
15754 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
15755 from the perspective of the architecture. See the diagram above
15756 aarch64_simd_vect_par_cnst_half for more details. */
15757
15758 bool
15759 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
15760 bool high)
15761 {
15762 int nelts;
15763 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
15764 return false;
15765
15766 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
15767 HOST_WIDE_INT count_op = XVECLEN (op, 0);
15768 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
15769 int i = 0;
15770
15771 if (count_op != count_ideal)
15772 return false;
15773
15774 for (i = 0; i < count_ideal; i++)
15775 {
15776 rtx elt_op = XVECEXP (op, 0, i);
15777 rtx elt_ideal = XVECEXP (ideal, 0, i);
15778
15779 if (!CONST_INT_P (elt_op)
15780 || INTVAL (elt_ideal) != INTVAL (elt_op))
15781 return false;
15782 }
15783 return true;
15784 }
15785
15786 /* Return a PARALLEL containing NELTS elements, with element I equal
15787 to BASE + I * STEP. */
15788
15789 rtx
15790 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
15791 {
15792 rtvec vec = rtvec_alloc (nelts);
15793 for (unsigned int i = 0; i < nelts; ++i)
15794 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
15795 return gen_rtx_PARALLEL (VOIDmode, vec);
15796 }
15797
15798 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
15799 series with step STEP. */
15800
15801 bool
15802 aarch64_stepped_int_parallel_p (rtx op, int step)
15803 {
15804 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
15805 return false;
15806
15807 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
15808 for (int i = 1; i < XVECLEN (op, 0); ++i)
15809 if (!CONST_INT_P (XVECEXP (op, 0, i))
15810 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
15811 return false;
15812
15813 return true;
15814 }
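
/* For example, aarch64_gen_stepped_int_parallel (4, 1, 2) builds
   (parallel [1 3 5 7]), and aarch64_stepped_int_parallel_p accepts that
   rtx for STEP == 2 but rejects it for any other step.  */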
15815
15816 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
15817 HIGH (exclusive). */
15818 void
15819 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
15820 const_tree exp)
15821 {
15822 HOST_WIDE_INT lane;
15823 gcc_assert (CONST_INT_P (operand));
15824 lane = INTVAL (operand);
15825
15826 if (lane < low || lane >= high)
15827 {
15828 if (exp)
15829 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
15830 else
15831 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
15832 }
15833 }
15834
15835 /* Perform endian correction on lane number N, which indexes a vector
15836 of mode MODE, and return the result as an SImode rtx. */
15837
15838 rtx
15839 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
15840 {
15841 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
15842 }
15843
15844 /* Return TRUE if OP is a valid vector addressing mode. */
15845
15846 bool
15847 aarch64_simd_mem_operand_p (rtx op)
15848 {
15849 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
15850 || REG_P (XEXP (op, 0)));
15851 }
15852
15853 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
15854
15855 bool
15856 aarch64_sve_ld1r_operand_p (rtx op)
15857 {
15858 struct aarch64_address_info addr;
15859 scalar_mode mode;
15860
15861 return (MEM_P (op)
15862 && is_a <scalar_mode> (GET_MODE (op), &mode)
15863 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
15864 && addr.type == ADDRESS_REG_IMM
15865 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
15866 }
15867
15868 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
15869 bool
15870 aarch64_sve_ld1rq_operand_p (rtx op)
15871 {
15872 struct aarch64_address_info addr;
15873 scalar_mode elem_mode = GET_MODE_INNER (GET_MODE (op));
15874 if (!MEM_P (op)
15875 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
15876 return false;
15877
15878 if (addr.type == ADDRESS_REG_IMM)
15879 return offset_4bit_signed_scaled_p (TImode, addr.const_offset);
15880
15881 if (addr.type == ADDRESS_REG_REG)
15882 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
15883
15884 return false;
15885 }
15886
15887 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
15888 The conditions for STR are the same. */
15889 bool
15890 aarch64_sve_ldr_operand_p (rtx op)
15891 {
15892 struct aarch64_address_info addr;
15893
15894 return (MEM_P (op)
15895 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
15896 false, ADDR_QUERY_ANY)
15897 && addr.type == ADDRESS_REG_IMM);
15898 }
15899
15900 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
15901 We need to be able to access the individual pieces, so the range
15902 is different from LD[234] and ST[234]. */
15903 bool
15904 aarch64_sve_struct_memory_operand_p (rtx op)
15905 {
15906 if (!MEM_P (op))
15907 return false;
15908
15909 machine_mode mode = GET_MODE (op);
15910 struct aarch64_address_info addr;
15911 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
15912 ADDR_QUERY_ANY)
15913 || addr.type != ADDRESS_REG_IMM)
15914 return false;
15915
15916 poly_int64 first = addr.const_offset;
15917 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
15918 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
15919 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
15920 }
15921
15922 /* Emit a register copy from operand to operand, taking care not to
15923 early-clobber source registers in the process.
15924
15925 COUNT is the number of components into which the copy needs to be
15926 decomposed. */
15927 void
15928 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
15929 unsigned int count)
15930 {
15931 unsigned int i;
15932 int rdest = REGNO (operands[0]);
15933 int rsrc = REGNO (operands[1]);
15934
15935 if (!reg_overlap_mentioned_p (operands[0], operands[1])
15936 || rdest < rsrc)
15937 for (i = 0; i < count; i++)
15938 emit_move_insn (gen_rtx_REG (mode, rdest + i),
15939 gen_rtx_REG (mode, rsrc + i));
15940 else
15941 for (i = 0; i < count; i++)
15942 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
15943 gen_rtx_REG (mode, rsrc + count - i - 1));
15944 }
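/* For instance, copying a two-register value from {v0, v1} into
   {v1, v2} (COUNT == 2) overlaps and has rdest > rsrc, so the second
   loop above copies backwards: v2 <- v1 first, then v1 <- v0, which
   avoids clobbering v1 before it has been read.  A copy in the other
   direction is done forwards for the same reason.  (Register numbers
   here are only illustrative.)  */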
15945
15946 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
15947 one of the VSTRUCT modes: OI, CI, or XI. */
15948 int
15949 aarch64_simd_attr_length_rglist (machine_mode mode)
15950 {
15951 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
15952 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
15953 }
15954
15955 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
15956 alignment of a vector to 128 bits. SVE predicates have an alignment of
15957 16 bits. */
15958 static HOST_WIDE_INT
15959 aarch64_simd_vector_alignment (const_tree type)
15960 {
15961 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
15962 be set for non-predicate vectors of booleans. Modes are the most
15963 direct way we have of identifying real SVE predicate types. */
15964 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
15965 return 16;
15966 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15967 return 128;
15968 return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
15969 }
15970
15971 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
15972 static poly_uint64
15973 aarch64_vectorize_preferred_vector_alignment (const_tree type)
15974 {
15975 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
15976 {
15977 /* If the length of the vector is fixed, try to align to that length,
15978 otherwise don't try to align at all. */
15979 HOST_WIDE_INT result;
15980 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
15981 result = TYPE_ALIGN (TREE_TYPE (type));
15982 return result;
15983 }
15984 return TYPE_ALIGN (type);
15985 }
15986
15987 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
15988 static bool
15989 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
15990 {
15991 if (is_packed)
15992 return false;
15993
15994 /* For fixed-length vectors, check that the vectorizer will aim for
15995 full-vector alignment. This isn't true for generic GCC vectors
15996 that are wider than the ABI maximum of 128 bits. */
15997 poly_uint64 preferred_alignment =
15998 aarch64_vectorize_preferred_vector_alignment (type);
15999 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
16000 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
16001 preferred_alignment))
16002 return false;
16003
16004 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
16005 return true;
16006 }
16007
16008 /* Return true if the vector misalignment factor is supported by the
16009 target. */
16010 static bool
16011 aarch64_builtin_support_vector_misalignment (machine_mode mode,
16012 const_tree type, int misalignment,
16013 bool is_packed)
16014 {
16015 if (TARGET_SIMD && STRICT_ALIGNMENT)
16016 {
16017 /* Return false if the movmisalign pattern is not supported for this mode. */
16018 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
16019 return false;
16020
16021 /* Misalignment factor is unknown at compile time. */
16022 if (misalignment == -1)
16023 return false;
16024 }
16025 return default_builtin_support_vector_misalignment (mode, type, misalignment,
16026 is_packed);
16027 }
16028
16029 /* If VALS is a vector constant that can be loaded into a register
16030 using DUP, generate instructions to do so and return an RTX to
16031 assign to the register. Otherwise return NULL_RTX. */
16032 static rtx
16033 aarch64_simd_dup_constant (rtx vals)
16034 {
16035 machine_mode mode = GET_MODE (vals);
16036 machine_mode inner_mode = GET_MODE_INNER (mode);
16037 rtx x;
16038
16039 if (!const_vec_duplicate_p (vals, &x))
16040 return NULL_RTX;
16041
16042 /* We can load this constant by using DUP and a constant in a
16043 single general-purpose register. This will be cheaper than a vector
16044 load. */
16045 x = copy_to_mode_reg (inner_mode, x);
16046 return gen_vec_duplicate (mode, x);
16047 }
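/* As a rough example, a V4SImode constant { 7, 7, 7, 7 } is returned as
   (vec_duplicate:V4SI (reg:SI ...)) with 7 loaded into the scalar
   register, which typically assembles to something like:

       mov  w0, 7
       dup  v0.4s, w0

   rather than a literal-pool load.  (Exact registers and instruction
   selection depend on later passes.)  */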
16048
16049
16050 /* Generate code to load VALS, which is a PARALLEL containing only
16051 constants (for vec_init) or CONST_VECTOR, efficiently into a
16052 register. Returns an RTX to copy into the register, or NULL_RTX
16053 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
16054 static rtx
16055 aarch64_simd_make_constant (rtx vals)
16056 {
16057 machine_mode mode = GET_MODE (vals);
16058 rtx const_dup;
16059 rtx const_vec = NULL_RTX;
16060 int n_const = 0;
16061 int i;
16062
16063 if (GET_CODE (vals) == CONST_VECTOR)
16064 const_vec = vals;
16065 else if (GET_CODE (vals) == PARALLEL)
16066 {
16067 /* A CONST_VECTOR must contain only CONST_INTs and
16068 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
16069 Only store valid constants in a CONST_VECTOR. */
16070 int n_elts = XVECLEN (vals, 0);
16071 for (i = 0; i < n_elts; ++i)
16072 {
16073 rtx x = XVECEXP (vals, 0, i);
16074 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16075 n_const++;
16076 }
16077 if (n_const == n_elts)
16078 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
16079 }
16080 else
16081 gcc_unreachable ();
16082
16083 if (const_vec != NULL_RTX
16084 && aarch64_simd_valid_immediate (const_vec, NULL))
16085 /* Load using MOVI/MVNI. */
16086 return const_vec;
16087 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
16088 /* Loaded using DUP. */
16089 return const_dup;
16090 else if (const_vec != NULL_RTX)
16091 /* Load from constant pool. We cannot take advantage of single-cycle
16092 LD1 because we need a PC-relative addressing mode. */
16093 return const_vec;
16094 else
16095 /* A PARALLEL containing something not valid inside CONST_VECTOR.
16096 We cannot construct an initializer. */
16097 return NULL_RTX;
16098 }
16099
16100 /* Expand a vector initialisation sequence, such that TARGET is
16101 initialised to contain VALS. */
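/* A rough example of the all-variable path handled below: for a
   V4SImode value { x, y, x, x } the element x occurs most often, so we
   expect a sequence along the lines of

       dup  v0.4s, w_x
       ins  v0.s[1], w_y

   i.e. one duplicate of the most common element followed by lane
   inserts for the rest.  (Lane numbers and registers are only
   illustrative; big-endian lane handling differs.)  */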
16102
16103 void
16104 aarch64_expand_vector_init (rtx target, rtx vals)
16105 {
16106 machine_mode mode = GET_MODE (target);
16107 scalar_mode inner_mode = GET_MODE_INNER (mode);
16108 /* The number of vector elements. */
16109 int n_elts = XVECLEN (vals, 0);
16110 /* The number of vector elements which are not constant. */
16111 int n_var = 0;
16112 rtx any_const = NULL_RTX;
16113 /* The first element of vals. */
16114 rtx v0 = XVECEXP (vals, 0, 0);
16115 bool all_same = true;
16116
16117 /* This is a special vec_init<M><N> where N is not an element mode but a
16118 vector mode with half the elements of M. We expect to find two entries
16119 of mode N in VALS and we must put their concatenation into TARGET. */
16120 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
16121 {
16122 gcc_assert (known_eq (GET_MODE_SIZE (mode),
16123 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
16124 rtx lo = XVECEXP (vals, 0, 0);
16125 rtx hi = XVECEXP (vals, 0, 1);
16126 machine_mode narrow_mode = GET_MODE (lo);
16127 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
16128 gcc_assert (narrow_mode == GET_MODE (hi));
16129
16130 /* When we want to concatenate a half-width vector with zeroes we can
16131 use the aarch64_combinez[_be] patterns. Just make sure that the
16132 zeroes are in the right half. */
16133 if (BYTES_BIG_ENDIAN
16134 && aarch64_simd_imm_zero (lo, narrow_mode)
16135 && general_operand (hi, narrow_mode))
16136 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
16137 else if (!BYTES_BIG_ENDIAN
16138 && aarch64_simd_imm_zero (hi, narrow_mode)
16139 && general_operand (lo, narrow_mode))
16140 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
16141 else
16142 {
16143 /* Else create the two half-width registers and combine them. */
16144 if (!REG_P (lo))
16145 lo = force_reg (GET_MODE (lo), lo);
16146 if (!REG_P (hi))
16147 hi = force_reg (GET_MODE (hi), hi);
16148
16149 if (BYTES_BIG_ENDIAN)
16150 std::swap (lo, hi);
16151 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
16152 }
16153 return;
16154 }
16155
16156 /* Count the number of variable elements to initialise. */
16157 for (int i = 0; i < n_elts; ++i)
16158 {
16159 rtx x = XVECEXP (vals, 0, i);
16160 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
16161 ++n_var;
16162 else
16163 any_const = x;
16164
16165 all_same &= rtx_equal_p (x, v0);
16166 }
16167
16168 /* No variable elements, hand off to aarch64_simd_make_constant which knows
16169 how best to handle this. */
16170 if (n_var == 0)
16171 {
16172 rtx constant = aarch64_simd_make_constant (vals);
16173 if (constant != NULL_RTX)
16174 {
16175 emit_move_insn (target, constant);
16176 return;
16177 }
16178 }
16179
16180 /* Splat a single non-constant element if we can. */
16181 if (all_same)
16182 {
16183 rtx x = copy_to_mode_reg (inner_mode, v0);
16184 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16185 return;
16186 }
16187
16188 enum insn_code icode = optab_handler (vec_set_optab, mode);
16189 gcc_assert (icode != CODE_FOR_nothing);
16190
16191 /* If there are only variable elements, try to optimize
16192 the insertion using dup for the most common element
16193 followed by insertions. */
16194
16195 /* The algorithm will fill matches[*][0] with the earliest matching element,
16196 and matches[X][1] with the count of duplicate elements (if X is the
16197 earliest element which has duplicates). */
16198
16199 if (n_var == n_elts && n_elts <= 16)
16200 {
16201 int matches[16][2] = {0};
16202 for (int i = 0; i < n_elts; i++)
16203 {
16204 for (int j = 0; j <= i; j++)
16205 {
16206 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
16207 {
16208 matches[i][0] = j;
16209 matches[j][1]++;
16210 break;
16211 }
16212 }
16213 }
16214 int maxelement = 0;
16215 int maxv = 0;
16216 for (int i = 0; i < n_elts; i++)
16217 if (matches[i][1] > maxv)
16218 {
16219 maxelement = i;
16220 maxv = matches[i][1];
16221 }
16222
16223 /* Create a duplicate of the most common element, unless all elements
16224 are equally useless to us, in which case just immediately set the
16225 vector register using the first element. */
16226
16227 if (maxv == 1)
16228 {
16229 /* For vectors of two 64-bit elements, we can do even better. */
16230 if (n_elts == 2
16231 && (inner_mode == E_DImode
16232 || inner_mode == E_DFmode))
16233
16234 {
16235 rtx x0 = XVECEXP (vals, 0, 0);
16236 rtx x1 = XVECEXP (vals, 0, 1);
16237 /* Combine can pick up this case, but handling it directly
16238 here leaves clearer RTL.
16239
16240 This is load_pair_lanes<mode>, and also gives us a clean-up
16241 for store_pair_lanes<mode>. */
16242 if (memory_operand (x0, inner_mode)
16243 && memory_operand (x1, inner_mode)
16244 && !STRICT_ALIGNMENT
16245 && rtx_equal_p (XEXP (x1, 0),
16246 plus_constant (Pmode,
16247 XEXP (x0, 0),
16248 GET_MODE_SIZE (inner_mode))))
16249 {
16250 rtx t;
16251 if (inner_mode == DFmode)
16252 t = gen_load_pair_lanesdf (target, x0, x1);
16253 else
16254 t = gen_load_pair_lanesdi (target, x0, x1);
16255 emit_insn (t);
16256 return;
16257 }
16258 }
16259 /* The subreg-move sequence below will move into lane zero of the
16260 vector register. For big-endian we want that position to hold
16261 the last element of VALS. */
16262 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
16263 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16264 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
16265 }
16266 else
16267 {
16268 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16269 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16270 }
16271
16272 /* Insert the rest. */
16273 for (int i = 0; i < n_elts; i++)
16274 {
16275 rtx x = XVECEXP (vals, 0, i);
16276 if (matches[i][0] == maxelement)
16277 continue;
16278 x = copy_to_mode_reg (inner_mode, x);
16279 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16280 }
16281 return;
16282 }
16283
16284 /* Initialise a vector which is part-variable. We want to first try
16285 to build those lanes which are constant in the most efficient way we
16286 can. */
16287 if (n_var != n_elts)
16288 {
16289 rtx copy = copy_rtx (vals);
16290
16291 /* Load constant part of vector. We really don't care what goes into the
16292 parts we will overwrite, but we're more likely to be able to load the
16293 constant efficiently if it has fewer, larger, repeating parts
16294 (see aarch64_simd_valid_immediate). */
16295 for (int i = 0; i < n_elts; i++)
16296 {
16297 rtx x = XVECEXP (vals, 0, i);
16298 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16299 continue;
16300 rtx subst = any_const;
16301 for (int bit = n_elts / 2; bit > 0; bit /= 2)
16302 {
16303 /* Look in the copied vector, as more elements are const. */
16304 rtx test = XVECEXP (copy, 0, i ^ bit);
16305 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
16306 {
16307 subst = test;
16308 break;
16309 }
16310 }
16311 XVECEXP (copy, 0, i) = subst;
16312 }
16313 aarch64_expand_vector_init (target, copy);
16314 }
16315
16316 /* Insert the variable lanes directly. */
16317 for (int i = 0; i < n_elts; i++)
16318 {
16319 rtx x = XVECEXP (vals, 0, i);
16320 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16321 continue;
16322 x = copy_to_mode_reg (inner_mode, x);
16323 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16324 }
16325 }
16326
16327 /* Emit RTL corresponding to:
16328 insr TARGET, ELEM. */
16329
16330 static void
16331 emit_insr (rtx target, rtx elem)
16332 {
16333 machine_mode mode = GET_MODE (target);
16334 scalar_mode elem_mode = GET_MODE_INNER (mode);
16335 elem = force_reg (elem_mode, elem);
16336
16337 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
16338 gcc_assert (icode != CODE_FOR_nothing);
16339 emit_insn (GEN_FCN (icode) (target, target, elem));
16340 }
16341
16342 /* Subroutine of aarch64_sve_expand_vector_init for handling
16343 trailing constants.
16344 This function works as follows:
16345 (a) Create a new vector consisting of trailing constants.
16346 (b) Initialize TARGET with the constant vector using emit_move_insn.
16347 (c) Insert remaining elements in TARGET using insr.
16348 NELTS is the total number of elements in original vector while
16349 while NELTS_REQD is the number of elements that are actually
16350 significant.
16351
16352 ??? The heuristic used is to do the above only if the number of constants
16353 is at least half the total number of elements. May need fine-tuning. */
16354
16355 static bool
16356 aarch64_sve_expand_vector_init_handle_trailing_constants
16357 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
16358 {
16359 machine_mode mode = GET_MODE (target);
16360 scalar_mode elem_mode = GET_MODE_INNER (mode);
16361 int n_trailing_constants = 0;
16362
16363 for (int i = nelts_reqd - 1;
16364 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
16365 i--)
16366 n_trailing_constants++;
16367
16368 if (n_trailing_constants >= nelts_reqd / 2)
16369 {
16370 rtx_vector_builder v (mode, 1, nelts);
16371 for (int i = 0; i < nelts; i++)
16372 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
16373 rtx const_vec = v.build ();
16374 emit_move_insn (target, const_vec);
16375
16376 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
16377 emit_insr (target, builder.elt (i));
16378
16379 return true;
16380 }
16381
16382 return false;
16383 }
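/* Worked example (illustrative): for BUILDER = { a, b, 1, 2 } with
   NELTS_REQD == 4 there are two trailing constants, which meets the
   "at least half" threshold, so the expected shape is roughly:

       TARGET = { 1, 2, ... }      (constant move)
       insr   TARGET, b
       insr   TARGET, a

   leaving TARGET = { a, b, 1, 2, ... }.  */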
16384
16385 /* Subroutine of aarch64_sve_expand_vector_init.
16386 Works as follows:
16387 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
16388 (b) Skip trailing elements from BUILDER, which are the same as
16389 element NELTS_REQD - 1.
16390 (c) Insert earlier elements in reverse order in TARGET using insr. */
16391
16392 static void
16393 aarch64_sve_expand_vector_init_insert_elems (rtx target,
16394 const rtx_vector_builder &builder,
16395 int nelts_reqd)
16396 {
16397 machine_mode mode = GET_MODE (target);
16398 scalar_mode elem_mode = GET_MODE_INNER (mode);
16399
16400 struct expand_operand ops[2];
16401 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
16402 gcc_assert (icode != CODE_FOR_nothing);
16403
16404 create_output_operand (&ops[0], target, mode);
16405 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
16406 expand_insn (icode, 2, ops);
16407
16408 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16409 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
16410 emit_insr (target, builder.elt (i));
16411 }
16412
16413 /* Subroutine of aarch64_sve_expand_vector_init to handle the case
16414 when all trailing elements of BUILDER are the same.
16415 This works as follows:
16416 (a) Use expand_insn interface to broadcast last vector element in TARGET.
16417 (b) Insert remaining elements in TARGET using insr.
16418
16419 ??? The heuristic used is to do the above if the number of identical
16420 trailing elements is at least 3/4 of the total number of elements,
16421 loosely based on the heuristic from mostly_zeros_p. May need fine-tuning. */
16422
16423 static bool
16424 aarch64_sve_expand_vector_init_handle_trailing_same_elem
16425 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
16426 {
16427 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16428 if (ndups >= (3 * nelts_reqd) / 4)
16429 {
16430 aarch64_sve_expand_vector_init_insert_elems (target, builder,
16431 nelts_reqd - ndups + 1);
16432 return true;
16433 }
16434
16435 return false;
16436 }
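/* Worked example (illustrative): for BUILDER = { a, b, c, c, c, c, c, c }
   with NELTS_REQD == 8, six of the eight elements are the trailing value
   c, which meets the 3/4 threshold, so we expect roughly:

       dup   TARGET, c
       insr  TARGET, b
       insr  TARGET, a

   giving TARGET = { a, b, c, c, c, c, c, c }.  */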
16437
16438 /* Initialize register TARGET from BUILDER. NELTS is the constant number
16439 of elements in BUILDER.
16440
16441 The function tries to initialize TARGET from BUILDER if it fits one
16442 of the special cases outlined below.
16443
16444 Failing that, the function divides BUILDER into two sub-vectors:
16445 v_even = even elements of BUILDER;
16446 v_odd = odd elements of BUILDER;
16447
16448 and recursively calls itself with v_even and v_odd.
16449
16450 if (recursive call succeeded for v_even or v_odd)
16451 TARGET = zip (v_even, v_odd)
16452
16453 The function returns true if it managed to build TARGET from BUILDER
16454 with one of the special cases, false otherwise.
16455
16456 Example: {a, 1, b, 2, c, 3, d, 4}
16457
16458 The vector gets divided into:
16459 v_even = {a, b, c, d}
16460 v_odd = {1, 2, 3, 4}
16461
16462 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
16463 initializes tmp2 from the constant vector v_odd using emit_move_insn.
16464
16465 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
16466 4 elements, so we construct tmp1 from v_even using insr:
16467 tmp1 = dup(d)
16468 insr tmp1, c
16469 insr tmp1, b
16470 insr tmp1, a
16471
16472 And finally:
16473 TARGET = zip (tmp1, tmp2)
16474 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
16475
16476 static bool
16477 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
16478 int nelts, int nelts_reqd)
16479 {
16480 machine_mode mode = GET_MODE (target);
16481
16482 /* Case 1: Vector contains trailing constants. */
16483
16484 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16485 (target, builder, nelts, nelts_reqd))
16486 return true;
16487
16488 /* Case 2: Vector contains leading constants. */
16489
16490 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
16491 for (int i = 0; i < nelts_reqd; i++)
16492 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
16493 rev_builder.finalize ();
16494
16495 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16496 (target, rev_builder, nelts, nelts_reqd))
16497 {
16498 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16499 return true;
16500 }
16501
16502 /* Case 3: Vector contains trailing same element. */
16503
16504 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16505 (target, builder, nelts_reqd))
16506 return true;
16507
16508 /* Case 4: Vector contains leading same element. */
16509
16510 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16511 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
16512 {
16513 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16514 return true;
16515 }
16516
16517 /* Avoid recursing below 4 elements.
16518 ??? The threshold 4 may need fine-tuning. */
16519
16520 if (nelts_reqd <= 4)
16521 return false;
16522
16523 rtx_vector_builder v_even (mode, 1, nelts);
16524 rtx_vector_builder v_odd (mode, 1, nelts);
16525
16526 for (int i = 0; i < nelts * 2; i += 2)
16527 {
16528 v_even.quick_push (builder.elt (i));
16529 v_odd.quick_push (builder.elt (i + 1));
16530 }
16531
16532 v_even.finalize ();
16533 v_odd.finalize ();
16534
16535 rtx tmp1 = gen_reg_rtx (mode);
16536 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
16537 nelts, nelts_reqd / 2);
16538
16539 rtx tmp2 = gen_reg_rtx (mode);
16540 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
16541 nelts, nelts_reqd / 2);
16542
16543 if (!did_even_p && !did_odd_p)
16544 return false;
16545
16546 /* Initialize whichever of v_even and v_odd did not match any of the
16547 special cases using INSR, then zip v_even and v_odd into TARGET. */
16548
16549 if (!did_even_p)
16550 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
16551
16552 if (!did_odd_p)
16553 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
16554
16555 rtvec v = gen_rtvec (2, tmp1, tmp2);
16556 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
16557 return true;
16558 }
16559
16560 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
16561
16562 void
16563 aarch64_sve_expand_vector_init (rtx target, rtx vals)
16564 {
16565 machine_mode mode = GET_MODE (target);
16566 int nelts = XVECLEN (vals, 0);
16567
16568 rtx_vector_builder v (mode, 1, nelts);
16569 for (int i = 0; i < nelts; i++)
16570 v.quick_push (XVECEXP (vals, 0, i));
16571 v.finalize ();
16572
16573 /* If neither sub-vector of v could be initialized specially,
16574 then use INSR to insert all elements from v into TARGET.
16575 ??? This might not be optimal for vectors with large
16576 initializers of 16 elements or more.
16577 For nelts < 4, it probably isn't useful to handle specially. */
16578
16579 if (nelts < 4
16580 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
16581 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
16582 }
16583
16584 /* Check whether VALUE is a vector constant in which every element
16585 is either a power of 2 or a negated power of 2. If so, return
16586 a constant vector of log2s, and flip CODE between PLUS and MINUS
16587 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
16588
16589 static rtx
16590 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
16591 {
16592 if (GET_CODE (value) != CONST_VECTOR)
16593 return NULL_RTX;
16594
16595 rtx_vector_builder builder;
16596 if (!builder.new_unary_operation (GET_MODE (value), value, false))
16597 return NULL_RTX;
16598
16599 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
16600 /* 1 if the result of the multiplication must be negated,
16601 0 if it mustn't, or -1 if we don't yet care. */
16602 int negate = -1;
16603 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
16604 for (unsigned int i = 0; i < encoded_nelts; ++i)
16605 {
16606 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
16607 if (!CONST_SCALAR_INT_P (elt))
16608 return NULL_RTX;
16609 rtx_mode_t val (elt, int_mode);
16610 wide_int pow2 = wi::neg (val);
16611 if (val != pow2)
16612 {
16613 /* It matters whether we negate or not. Make that choice,
16614 and make sure that it's consistent with previous elements. */
16615 if (negate == !wi::neg_p (val))
16616 return NULL_RTX;
16617 negate = wi::neg_p (val);
16618 if (!negate)
16619 pow2 = val;
16620 }
16621 /* POW2 is now the value that we want to be a power of 2. */
16622 int shift = wi::exact_log2 (pow2);
16623 if (shift < 0)
16624 return NULL_RTX;
16625 builder.quick_push (gen_int_mode (shift, int_mode));
16626 }
16627 if (negate == -1)
16628 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
16629 code = PLUS;
16630 else if (negate == 1)
16631 code = code == PLUS ? MINUS : PLUS;
16632 return builder.build ();
16633 }
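/* For example, a CONST_VECTOR of { -4, -4, -4, -4 } with CODE == PLUS
   is converted to the shift vector { 2, 2, 2, 2 } and CODE is flipped
   to MINUS, so that a + b * -4 can be emitted by the callers below as
   a - (b << 2).  */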
16634
16635 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
16636 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
16637 operands array, in the same order as for fma_optab. Return true if
16638 the function emitted all the necessary instructions, false if the caller
16639 should generate the pattern normally with the new OPERANDS array. */
16640
16641 bool
16642 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
16643 {
16644 machine_mode mode = GET_MODE (operands[0]);
16645 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
16646 {
16647 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
16648 NULL_RTX, true, OPTAB_DIRECT);
16649 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
16650 operands[3], product, operands[0], true,
16651 OPTAB_DIRECT);
16652 return true;
16653 }
16654 operands[2] = force_reg (mode, operands[2]);
16655 return false;
16656 }
16657
16658 /* Likewise, but for a conditional pattern. */
16659
16660 bool
16661 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
16662 {
16663 machine_mode mode = GET_MODE (operands[0]);
16664 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
16665 {
16666 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
16667 NULL_RTX, true, OPTAB_DIRECT);
16668 emit_insn (gen_cond (code, mode, operands[0], operands[1],
16669 operands[4], product, operands[5]));
16670 return true;
16671 }
16672 operands[3] = force_reg (mode, operands[3]);
16673 return false;
16674 }
16675
16676 static unsigned HOST_WIDE_INT
16677 aarch64_shift_truncation_mask (machine_mode mode)
16678 {
16679 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
16680 return 0;
16681 return GET_MODE_UNIT_BITSIZE (mode) - 1;
16682 }
16683
16684 /* Select a format to encode pointers in exception handling data. */
16685 int
16686 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
16687 {
16688 int type;
16689 switch (aarch64_cmodel)
16690 {
16691 case AARCH64_CMODEL_TINY:
16692 case AARCH64_CMODEL_TINY_PIC:
16693 case AARCH64_CMODEL_SMALL:
16694 case AARCH64_CMODEL_SMALL_PIC:
16695 case AARCH64_CMODEL_SMALL_SPIC:
16696 /* text+got+data < 4GB. 4-byte signed relocs are sufficient
16697 for everything. */
16698 type = DW_EH_PE_sdata4;
16699 break;
16700 default:
16701 /* No assumptions here. 8-byte relocs required. */
16702 type = DW_EH_PE_sdata8;
16703 break;
16704 }
16705 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
16706 }
16707
16708 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
16709
16710 static void
16711 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
16712 {
16713 if (aarch64_simd_decl_p (decl))
16714 {
16715 fprintf (stream, "\t.variant_pcs\t");
16716 assemble_name (stream, name);
16717 fprintf (stream, "\n");
16718 }
16719 }
16720
16721 /* The last .arch and .tune assembly strings that we printed. */
16722 static std::string aarch64_last_printed_arch_string;
16723 static std::string aarch64_last_printed_tune_string;
16724
16725 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
16726 by the function fndecl. */
16727
16728 void
16729 aarch64_declare_function_name (FILE *stream, const char* name,
16730 tree fndecl)
16731 {
16732 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
16733
16734 struct cl_target_option *targ_options;
16735 if (target_parts)
16736 targ_options = TREE_TARGET_OPTION (target_parts);
16737 else
16738 targ_options = TREE_TARGET_OPTION (target_option_current_node);
16739 gcc_assert (targ_options);
16740
16741 const struct processor *this_arch
16742 = aarch64_get_arch (targ_options->x_explicit_arch);
16743
16744 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
16745 std::string extension
16746 = aarch64_get_extension_string_for_isa_flags (isa_flags,
16747 this_arch->flags);
16748 /* Only update the assembler .arch string if it is distinct from the last
16749 such string we printed. */
16750 std::string to_print = this_arch->name + extension;
16751 if (to_print != aarch64_last_printed_arch_string)
16752 {
16753 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
16754 aarch64_last_printed_arch_string = to_print;
16755 }
16756
16757 /* Print the cpu name we're tuning for in the comments; it might be
16758 useful to readers of the generated asm. Do it only when it changes
16759 from function to function and verbose assembly is requested. */
16760 const struct processor *this_tune
16761 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
16762
16763 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
16764 {
16765 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
16766 this_tune->name);
16767 aarch64_last_printed_tune_string = this_tune->name;
16768 }
16769
16770 aarch64_asm_output_variant_pcs (stream, fndecl, name);
16771
16772 /* Don't forget the type directive for ELF. */
16773 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
16774 ASM_OUTPUT_LABEL (stream, name);
16775 }
16776
16777 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
16778
16779 void
16780 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
16781 {
16782 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
16783 const char *value = IDENTIFIER_POINTER (target);
16784 aarch64_asm_output_variant_pcs (stream, decl, name);
16785 ASM_OUTPUT_DEF (stream, name, value);
16786 }
16787
16788 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
16789 function symbol references. */
16790
16791 void
16792 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
16793 {
16794 default_elf_asm_output_external (stream, decl, name);
16795 aarch64_asm_output_variant_pcs (stream, decl, name);
16796 }
16797
16798 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
16799 Used to output the .cfi_b_key_frame directive when signing the current
16800 function with the B key. */
16801
16802 void
16803 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
16804 {
16805 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
16806 && aarch64_ra_sign_key == AARCH64_KEY_B)
16807 asm_fprintf (f, "\t.cfi_b_key_frame\n");
16808 }
16809
16810 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
16811
16812 static void
16813 aarch64_start_file (void)
16814 {
16815 struct cl_target_option *default_options
16816 = TREE_TARGET_OPTION (target_option_default_node);
16817
16818 const struct processor *default_arch
16819 = aarch64_get_arch (default_options->x_explicit_arch);
16820 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
16821 std::string extension
16822 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
16823 default_arch->flags);
16824
16825 aarch64_last_printed_arch_string = default_arch->name + extension;
16826 aarch64_last_printed_tune_string = "";
16827 asm_fprintf (asm_out_file, "\t.arch %s\n",
16828 aarch64_last_printed_arch_string.c_str ());
16829
16830 default_file_start ();
16831 }
16832
16833 /* Emit load exclusive. */
16834
16835 static void
16836 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
16837 rtx mem, rtx model_rtx)
16838 {
16839 if (mode == TImode)
16840 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
16841 gen_highpart (DImode, rval),
16842 mem, model_rtx));
16843 else
16844 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
16845 }
16846
16847 /* Emit store exclusive. */
16848
16849 static void
16850 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
16851 rtx mem, rtx rval, rtx model_rtx)
16852 {
16853 if (mode == TImode)
16854 emit_insn (gen_aarch64_store_exclusive_pair
16855 (bval, mem, operand_subword (rval, 0, 0, TImode),
16856 operand_subword (rval, 1, 0, TImode), model_rtx));
16857 else
16858 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
16859 }
16860
16861 /* Mark the previous jump instruction as unlikely. */
16862
16863 static void
16864 aarch64_emit_unlikely_jump (rtx insn)
16865 {
16866 rtx_insn *jump = emit_jump_insn (insn);
16867 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
16868 }
16869
16870 /* We store the names of the various atomic helpers in a 5x4 array.
16871 Return the libcall function given MODE, MODEL and NAMES. */
16872
16873 rtx
16874 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
16875 const atomic_ool_names *names)
16876 {
16877 memmodel model = memmodel_base (INTVAL (model_rtx));
16878 int mode_idx, model_idx;
16879
16880 switch (mode)
16881 {
16882 case E_QImode:
16883 mode_idx = 0;
16884 break;
16885 case E_HImode:
16886 mode_idx = 1;
16887 break;
16888 case E_SImode:
16889 mode_idx = 2;
16890 break;
16891 case E_DImode:
16892 mode_idx = 3;
16893 break;
16894 case E_TImode:
16895 mode_idx = 4;
16896 break;
16897 default:
16898 gcc_unreachable ();
16899 }
16900
16901 switch (model)
16902 {
16903 case MEMMODEL_RELAXED:
16904 model_idx = 0;
16905 break;
16906 case MEMMODEL_CONSUME:
16907 case MEMMODEL_ACQUIRE:
16908 model_idx = 1;
16909 break;
16910 case MEMMODEL_RELEASE:
16911 model_idx = 2;
16912 break;
16913 case MEMMODEL_ACQ_REL:
16914 case MEMMODEL_SEQ_CST:
16915 model_idx = 3;
16916 break;
16917 default:
16918 gcc_unreachable ();
16919 }
16920
16921 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
16922 VISIBILITY_HIDDEN);
16923 }
16924
16925 #define DEF0(B, N) \
16926 { "__aarch64_" #B #N "_relax", \
16927 "__aarch64_" #B #N "_acq", \
16928 "__aarch64_" #B #N "_rel", \
16929 "__aarch64_" #B #N "_acq_rel" }
16930
16931 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
16932 { NULL, NULL, NULL, NULL }
16933 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
16934
16935 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
16936 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
16937 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
16938 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
16939 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
16940 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
16941
16942 #undef DEF0
16943 #undef DEF4
16944 #undef DEF5
16945
16946 /* Expand a compare and swap pattern. */
16947
16948 void
16949 aarch64_expand_compare_and_swap (rtx operands[])
16950 {
16951 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
16952 machine_mode mode, r_mode;
16953
16954 bval = operands[0];
16955 rval = operands[1];
16956 mem = operands[2];
16957 oldval = operands[3];
16958 newval = operands[4];
16959 is_weak = operands[5];
16960 mod_s = operands[6];
16961 mod_f = operands[7];
16962 mode = GET_MODE (mem);
16963
16964 /* Normally the succ memory model must be stronger than fail, but in the
16965 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
16966 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
16967 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
16968 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
16969 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
16970
16971 r_mode = mode;
16972 if (mode == QImode || mode == HImode)
16973 {
16974 r_mode = SImode;
16975 rval = gen_reg_rtx (r_mode);
16976 }
16977
16978 if (TARGET_LSE)
16979 {
16980 /* The CAS insn requires oldval and rval overlap, but we need to
16981 have a copy of oldval saved across the operation to tell if
16982 the operation is successful. */
16983 if (reg_overlap_mentioned_p (rval, oldval))
16984 rval = copy_to_mode_reg (r_mode, oldval);
16985 else
16986 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
16987
16988 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
16989 newval, mod_s));
16990 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16991 }
16992 else if (TARGET_OUTLINE_ATOMICS)
16993 {
16994 /* Oldval must satisfy compare afterward. */
16995 if (!aarch64_plus_operand (oldval, mode))
16996 oldval = force_reg (mode, oldval);
16997 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
16998 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
16999 oldval, mode, newval, mode,
17000 XEXP (mem, 0), Pmode);
17001 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
17002 }
17003 else
17004 {
17005 /* The oldval predicate varies by mode. Test it and force to reg. */
17006 insn_code code = code_for_aarch64_compare_and_swap (mode);
17007 if (!insn_data[code].operand[2].predicate (oldval, mode))
17008 oldval = force_reg (mode, oldval);
17009
17010 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
17011 is_weak, mod_s, mod_f));
17012 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
17013 }
17014
17015 if (r_mode != mode)
17016 rval = gen_lowpart (mode, rval);
17017 emit_move_insn (operands[1], rval);
17018
17019 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
17020 emit_insn (gen_rtx_SET (bval, x));
17021 }
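/* With -moutline-atomics, the TARGET_OUTLINE_ATOMICS path above turns
   an SImode __atomic_compare_exchange into something roughly like
   (registers purely illustrative):

       mov  w0, w_oldval
       mov  w1, w_newval
       mov  x2, x_mem
       bl   __aarch64_cas4_acq_rel
       cmp  w0, w_oldval
       cset w_bval, eq

   i.e. the library call returns the observed value, which is then
   compared against the expected value to produce the boolean result.  */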
17022
17023 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
17024 sequence implementing an atomic operation. */
17025
17026 static void
17027 aarch64_emit_post_barrier (enum memmodel model)
17028 {
17029 const enum memmodel base_model = memmodel_base (model);
17030
17031 if (is_mm_sync (model)
17032 && (base_model == MEMMODEL_ACQUIRE
17033 || base_model == MEMMODEL_ACQ_REL
17034 || base_model == MEMMODEL_SEQ_CST))
17035 {
17036 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
17037 }
17038 }
17039
17040 /* Split a compare and swap pattern. */
17041
17042 void
17043 aarch64_split_compare_and_swap (rtx operands[])
17044 {
17045 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
17046 machine_mode mode;
17047 bool is_weak;
17048 rtx_code_label *label1, *label2;
17049 enum memmodel model;
17050
17051 rval = operands[0];
17052 mem = operands[1];
17053 oldval = operands[2];
17054 newval = operands[3];
17055 is_weak = (operands[4] != const0_rtx);
17056 model_rtx = operands[5];
17057 scratch = operands[7];
17058 mode = GET_MODE (mem);
17059 model = memmodel_from_int (INTVAL (model_rtx));
17060
17061 /* When OLDVAL is zero and we want the strong version we can emit a tighter
17062 loop:
17063 .label1:
17064 LD[A]XR rval, [mem]
17065 CBNZ rval, .label2
17066 ST[L]XR scratch, newval, [mem]
17067 CBNZ scratch, .label1
17068 .label2:
17069 CMP rval, 0. */
17070 bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
17071 oldval == const0_rtx && mode != TImode);
17072
17073 label1 = NULL;
17074 if (!is_weak)
17075 {
17076 label1 = gen_label_rtx ();
17077 emit_label (label1);
17078 }
17079 label2 = gen_label_rtx ();
17080
17081 /* The initial load can be relaxed for a __sync operation since a final
17082 barrier will be emitted to stop code hoisting. */
17083 if (is_mm_sync (model))
17084 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
17085 else
17086 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
17087
17088 if (strong_zero_p)
17089 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
17090 else
17091 {
17092 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
17093 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
17094 }
17095 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17096 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
17097 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17098
17099 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
17100
17101 if (!is_weak)
17102 {
17103 if (aarch64_track_speculation)
17104 {
17105 /* Emit an explicit compare instruction, so that we can correctly
17106 track the condition codes. */
17107 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
17108 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
17109 }
17110 else
17111 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
17112
17113 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17114 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
17115 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17116 }
17117 else
17118 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
17119
17120 emit_label (label2);
17121
17122 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
17123 to set the condition flags. If this is not used it will be removed by
17124 later passes. */
17125 if (strong_zero_p)
17126 aarch64_gen_compare_reg (NE, rval, const0_rtx);
17127
17128 /* Emit any final barrier needed for a __sync operation. */
17129 if (is_mm_sync (model))
17130 aarch64_emit_post_barrier (model);
17131 }
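/* For reference, the general strong sequence (OLDVAL not known to be
   zero) produced by the code above looks roughly like:

     .retry:
       ld[a]xr  w_rval, [x_mem]
       cmp      w_rval, w_oldval
       b.ne     .done
       st[l]xr  w_scratch, w_newval, [x_mem]
       cbnz     w_scratch, .retry
     .done:

   where the acquire/release forms of the exclusives depend on the
   memory model.  (Illustrative only; the exact compare and registers
   depend on the mode and on -mtrack-speculation.)  */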
17132
17133 /* Split an atomic operation. */
17134
17135 void
17136 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
17137 rtx value, rtx model_rtx, rtx cond)
17138 {
17139 machine_mode mode = GET_MODE (mem);
17140 machine_mode wmode = (mode == DImode ? DImode : SImode);
17141 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
17142 const bool is_sync = is_mm_sync (model);
17143 rtx_code_label *label;
17144 rtx x;
17145
17146 /* Split the atomic operation into a sequence. */
17147 label = gen_label_rtx ();
17148 emit_label (label);
17149
17150 if (new_out)
17151 new_out = gen_lowpart (wmode, new_out);
17152 if (old_out)
17153 old_out = gen_lowpart (wmode, old_out);
17154 else
17155 old_out = new_out;
17156 value = simplify_gen_subreg (wmode, value, mode, 0);
17157
17158 /* The initial load can be relaxed for a __sync operation since a final
17159 barrier will be emitted to stop code hoisting. */
17160 if (is_sync)
17161 aarch64_emit_load_exclusive (mode, old_out, mem,
17162 GEN_INT (MEMMODEL_RELAXED));
17163 else
17164 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
17165
17166 switch (code)
17167 {
17168 case SET:
17169 new_out = value;
17170 break;
17171
17172 case NOT:
17173 x = gen_rtx_AND (wmode, old_out, value);
17174 emit_insn (gen_rtx_SET (new_out, x));
17175 x = gen_rtx_NOT (wmode, new_out);
17176 emit_insn (gen_rtx_SET (new_out, x));
17177 break;
17178
17179 case MINUS:
17180 if (CONST_INT_P (value))
17181 {
17182 value = GEN_INT (-INTVAL (value));
17183 code = PLUS;
17184 }
17185 /* Fall through. */
17186
17187 default:
17188 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
17189 emit_insn (gen_rtx_SET (new_out, x));
17190 break;
17191 }
17192
17193 aarch64_emit_store_exclusive (mode, cond, mem,
17194 gen_lowpart (mode, new_out), model_rtx);
17195
17196 if (aarch64_track_speculation)
17197 {
17198 /* Emit an explicit compare instruction, so that we can correctly
17199 track the condition codes. */
17200 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
17201 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
17202 }
17203 else
17204 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
17205
17206 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17207 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
17208 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17209
17210 /* Emit any final barrier needed for a __sync operation. */
17211 if (is_sync)
17212 aarch64_emit_post_barrier (model);
17213 }
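/* For example, splitting an SImode atomic fetch-and-add gives a loop of
   roughly this shape (registers illustrative):

     .retry:
       ldxr  w_old, [x_mem]
       add   w_new, w_old, w_value
       stxr  w_tmp, w_new, [x_mem]
       cbnz  w_tmp, .retry

   with acquire/release forms of the exclusives, and possibly a trailing
   barrier, chosen according to MODEL_RTX.  */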
17214
17215 static void
17216 aarch64_init_libfuncs (void)
17217 {
17218 /* Half-precision float operations. The compiler handles all operations
17219 with NULL libfuncs by converting to SFmode. */
17220
17221 /* Conversions. */
17222 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
17223 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
17224
17225 /* Arithmetic. */
17226 set_optab_libfunc (add_optab, HFmode, NULL);
17227 set_optab_libfunc (sdiv_optab, HFmode, NULL);
17228 set_optab_libfunc (smul_optab, HFmode, NULL);
17229 set_optab_libfunc (neg_optab, HFmode, NULL);
17230 set_optab_libfunc (sub_optab, HFmode, NULL);
17231
17232 /* Comparisons. */
17233 set_optab_libfunc (eq_optab, HFmode, NULL);
17234 set_optab_libfunc (ne_optab, HFmode, NULL);
17235 set_optab_libfunc (lt_optab, HFmode, NULL);
17236 set_optab_libfunc (le_optab, HFmode, NULL);
17237 set_optab_libfunc (ge_optab, HFmode, NULL);
17238 set_optab_libfunc (gt_optab, HFmode, NULL);
17239 set_optab_libfunc (unord_optab, HFmode, NULL);
17240 }
17241
17242 /* Target hook for c_mode_for_suffix. */
17243 static machine_mode
17244 aarch64_c_mode_for_suffix (char suffix)
17245 {
17246 if (suffix == 'q')
17247 return TFmode;
17248
17249 return VOIDmode;
17250 }
17251
17252 /* We can only represent floating point constants which will fit in
17253 "quarter-precision" values. These values are characterised by
17254 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
17255 by:
17256
17257 (-1)^s * (n/16) * 2^r
17258
17259 Where:
17260 's' is the sign bit.
17261 'n' is an integer in the range 16 <= n <= 31.
17262 'r' is an integer in the range -3 <= r <= 4. */
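/* For example, 1.25 = (+1) * (20/16) * 2^0 and 3.0 = (+1) * (24/16) * 2^1
   are representable, whereas 0.0 and 0.1 are not (0.1 has no finite
   binary mantissa at all).  */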
17263
17264 /* Return true iff X can be represented as a quarter-precision
17265 floating point immediate operand. Note, we cannot represent 0.0. */
17266 bool
17267 aarch64_float_const_representable_p (rtx x)
17268 {
17269 /* This represents our current view of how many bits
17270 make up the mantissa. */
17271 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
17272 int exponent;
17273 unsigned HOST_WIDE_INT mantissa, mask;
17274 REAL_VALUE_TYPE r, m;
17275 bool fail;
17276
17277 x = unwrap_const_vec_duplicate (x);
17278 if (!CONST_DOUBLE_P (x))
17279 return false;
17280
17281 if (GET_MODE (x) == VOIDmode
17282 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
17283 return false;
17284
17285 r = *CONST_DOUBLE_REAL_VALUE (x);
17286
17287 /* We cannot represent infinities, NaNs or +/-zero. We won't
17288 know if we have +zero until we analyse the mantissa, but we
17289 can reject the other invalid values. */
17290 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
17291 || REAL_VALUE_MINUS_ZERO (r))
17292 return false;
17293
17294 /* Extract exponent. */
17295 r = real_value_abs (&r);
17296 exponent = REAL_EXP (&r);
17297
17298 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
17299 highest (sign) bit, with a fixed binary point at bit point_pos.
17300 The low element of w holds the low part of the mantissa, the high element the high part.
17301 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
17302 bits for the mantissa, this can fail (low bits will be lost). */
17303 real_ldexp (&m, &r, point_pos - exponent);
17304 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
17305
17306 /* If the low part of the mantissa has bits set we cannot represent
17307 the value. */
17308 if (w.ulow () != 0)
17309 return false;
17310 /* We have rejected the lower HOST_WIDE_INT, so update our
17311 understanding of how many bits lie in the mantissa and
17312 look only at the high HOST_WIDE_INT. */
17313 mantissa = w.elt (1);
17314 point_pos -= HOST_BITS_PER_WIDE_INT;
17315
17316 /* We can only represent values with a mantissa of the form 1.xxxx. */
17317 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
17318 if ((mantissa & mask) != 0)
17319 return false;
17320
17321 /* Having filtered unrepresentable values, we may now remove all
17322 but the highest 5 bits. */
17323 mantissa >>= point_pos - 5;
17324
17325 /* We cannot represent the value 0.0, so reject it. This is handled
17326 elsewhere. */
17327 if (mantissa == 0)
17328 return false;
17329
17330 /* Then, as bit 4 is always set, we can mask it off, leaving
17331 the mantissa in the range [0, 15]. */
17332 mantissa &= ~(1 << 4);
17333 gcc_assert (mantissa <= 15);
17334
17335 /* GCC internally does not use IEEE754-like encoding (where normalized
17336 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
17337 Our mantissa values are shifted 4 places to the left relative to
17338 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
17339 by 5 places to correct for GCC's representation. */
17340 exponent = 5 - exponent;
17341
17342 return (exponent >= 0 && exponent <= 7);
17343 }
17344
17345 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
17346 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
17347 output MOVI/MVNI, ORR or BIC immediate. */
17348 char*
17349 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
17350 enum simd_immediate_check which)
17351 {
17352 bool is_valid;
17353 static char templ[40];
17354 const char *mnemonic;
17355 const char *shift_op;
17356 unsigned int lane_count = 0;
17357 char element_char;
17358
17359 struct simd_immediate_info info;
17360
17361 /* This will return true to show const_vector is legal for use as either
17362 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
17363 It will also update INFO to show how the immediate should be generated.
17364 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
17365 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
17366 gcc_assert (is_valid);
17367
17368 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17369 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
17370
17371 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17372 {
17373 gcc_assert (info.insn == simd_immediate_info::MOV
17374 && info.u.mov.shift == 0);
17375 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
17376 move immediate path. */
17377 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17378 info.u.mov.value = GEN_INT (0);
17379 else
17380 {
17381 const unsigned int buf_size = 20;
17382 char float_buf[buf_size] = {'\0'};
17383 real_to_decimal_for_mode (float_buf,
17384 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17385 buf_size, buf_size, 1, info.elt_mode);
17386
17387 if (lane_count == 1)
17388 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
17389 else
17390 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
17391 lane_count, element_char, float_buf);
17392 return templ;
17393 }
17394 }
17395
17396 gcc_assert (CONST_INT_P (info.u.mov.value));
17397
17398 if (which == AARCH64_CHECK_MOV)
17399 {
17400 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
17401 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
17402 ? "msl" : "lsl");
17403 if (lane_count == 1)
17404 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
17405 mnemonic, UINTVAL (info.u.mov.value));
17406 else if (info.u.mov.shift)
17407 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17408 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
17409 element_char, UINTVAL (info.u.mov.value), shift_op,
17410 info.u.mov.shift);
17411 else
17412 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17413 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
17414 element_char, UINTVAL (info.u.mov.value));
17415 }
17416 else
17417 {
17418 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
17419 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
17420 if (info.u.mov.shift)
17421 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17422 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
17423 element_char, UINTVAL (info.u.mov.value), "lsl",
17424 info.u.mov.shift);
17425 else
17426 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17427 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
17428 element_char, UINTVAL (info.u.mov.value));
17429 }
17430 return templ;
17431 }
17432
17433 char*
17434 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
17435 {
17436
17437 /* If a floating point number was passed and we want to use it in an
17438 integer mode, convert it to an integer. */
17439 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
17440 {
17441 unsigned HOST_WIDE_INT ival;
17442 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
17443 gcc_unreachable ();
17444 immediate = gen_int_mode (ival, mode);
17445 }
17446
17447 machine_mode vmode;
17448 /* Use a 64-bit container mode for everything except DImode/DFmode,
17449 where we use a 128-bit vector mode. */
17450 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
17451
17452 vmode = aarch64_simd_container_mode (mode, width);
17453 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
17454 return aarch64_output_simd_mov_immediate (v_op, width);
17455 }
17456
17457 /* Return the output string to use for moving immediate CONST_VECTOR
17458 into an SVE register. */
17459
17460 char *
17461 aarch64_output_sve_mov_immediate (rtx const_vector)
17462 {
17463 static char templ[40];
17464 struct simd_immediate_info info;
17465 char element_char;
17466
17467 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
17468 gcc_assert (is_valid);
17469
17470 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17471
17472 machine_mode vec_mode = GET_MODE (const_vector);
17473 if (aarch64_sve_pred_mode_p (vec_mode))
17474 {
17475 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
17476 if (info.insn == simd_immediate_info::MOV)
17477 {
17478 gcc_assert (info.u.mov.value == const0_rtx);
17479 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
17480 }
17481 else
17482 {
17483 gcc_assert (info.insn == simd_immediate_info::PTRUE);
17484 unsigned int total_bytes;
17485 if (info.u.pattern == AARCH64_SV_ALL
17486 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
17487 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
17488 total_bytes / GET_MODE_SIZE (info.elt_mode));
17489 else
17490 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
17491 svpattern_token (info.u.pattern));
17492 }
17493 return buf;
17494 }
17495
17496 if (info.insn == simd_immediate_info::INDEX)
17497 {
17498 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
17499 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
17500 element_char, INTVAL (info.u.index.base),
17501 INTVAL (info.u.index.step));
17502 return templ;
17503 }
17504
17505 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17506 {
17507 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17508 info.u.mov.value = GEN_INT (0);
17509 else
17510 {
17511 const int buf_size = 20;
17512 char float_buf[buf_size] = {};
17513 real_to_decimal_for_mode (float_buf,
17514 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17515 buf_size, buf_size, 1, info.elt_mode);
17516
17517 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
17518 element_char, float_buf);
17519 return templ;
17520 }
17521 }
17522
17523 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
17524 element_char, INTVAL (info.u.mov.value));
17525 return templ;
17526 }
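/* Typical outputs (illustrative): an all-zero predicate yields
   "pfalse\t%0.b"; an all-true predicate of 32-bit elements on a
   128-bit vector yields "ptrue\t%0.s, vl4"; the series { 0, 1, 2, ... }
   yields "index\t%0.s, #0, #1"; and a splat of 5 yields
   "mov\t%0.s, #5".  */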
17527
17528 /* Split operands into moves from op[1] + op[2] into op[0]. */
17529
17530 void
17531 aarch64_split_combinev16qi (rtx operands[3])
17532 {
17533 unsigned int dest = REGNO (operands[0]);
17534 unsigned int src1 = REGNO (operands[1]);
17535 unsigned int src2 = REGNO (operands[2]);
17536 machine_mode halfmode = GET_MODE (operands[1]);
17537 unsigned int halfregs = REG_NREGS (operands[1]);
17538 rtx destlo, desthi;
17539
17540 gcc_assert (halfmode == V16QImode);
17541
17542 if (src1 == dest && src2 == dest + halfregs)
17543 {
17544 /* No-op move. Can't split to nothing; emit something. */
17545 emit_note (NOTE_INSN_DELETED);
17546 return;
17547 }
17548
17549 /* Preserve register attributes for variable tracking. */
17550 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
17551 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
17552 GET_MODE_SIZE (halfmode));
17553
17554 /* Special case of reversed high/low parts. */
17555 if (reg_overlap_mentioned_p (operands[2], destlo)
17556 && reg_overlap_mentioned_p (operands[1], desthi))
17557 {
17558 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17559 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
17560 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17561 }
17562 else if (!reg_overlap_mentioned_p (operands[2], destlo))
17563 {
17564 /* Try to avoid unnecessary moves if part of the result
17565 is in the right place already. */
17566 if (src1 != dest)
17567 emit_move_insn (destlo, operands[1]);
17568 if (src2 != dest + halfregs)
17569 emit_move_insn (desthi, operands[2]);
17570 }
17571 else
17572 {
17573 if (src2 != dest + halfregs)
17574 emit_move_insn (desthi, operands[2]);
17575 if (src1 != dest)
17576 emit_move_insn (destlo, operands[1]);
17577 }
17578 }
17579
17580 /* vec_perm support. */
17581
17582 struct expand_vec_perm_d
17583 {
17584 rtx target, op0, op1;
17585 vec_perm_indices perm;
17586 machine_mode vmode;
17587 unsigned int vec_flags;
17588 bool one_vector_p;
17589 bool testing_p;
17590 };
17591
17592 /* Generate a variable permutation. */
17593
17594 static void
17595 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
17596 {
17597 machine_mode vmode = GET_MODE (target);
17598 bool one_vector_p = rtx_equal_p (op0, op1);
17599
17600 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
17601 gcc_checking_assert (GET_MODE (op0) == vmode);
17602 gcc_checking_assert (GET_MODE (op1) == vmode);
17603 gcc_checking_assert (GET_MODE (sel) == vmode);
17604 gcc_checking_assert (TARGET_SIMD);
17605
17606 if (one_vector_p)
17607 {
17608 if (vmode == V8QImode)
17609 {
17610 /* Expand the argument to a V16QI mode by duplicating it. */
17611 rtx pair = gen_reg_rtx (V16QImode);
17612 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
17613 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17614 }
17615 else
17616 {
17617 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
17618 }
17619 }
17620 else
17621 {
17622 rtx pair;
17623
17624 if (vmode == V8QImode)
17625 {
17626 pair = gen_reg_rtx (V16QImode);
17627 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
17628 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17629 }
17630 else
17631 {
17632 pair = gen_reg_rtx (OImode);
17633 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
17634 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
17635 }
17636 }
17637 }
17638
17639 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
17640 NELT is the number of elements in the vector. */
17641
17642 void
17643 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
17644 unsigned int nelt)
17645 {
17646 machine_mode vmode = GET_MODE (target);
17647 bool one_vector_p = rtx_equal_p (op0, op1);
17648 rtx mask;
17649
17650 /* The TBL instruction does not use a modulo index, so we must take care
17651 of that ourselves. */
17652 mask = aarch64_simd_gen_const_vector_dup (vmode,
17653 one_vector_p ? nelt - 1 : 2 * nelt - 1);
17654 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
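/* As a worked example: with two V16QI inputs the mask is 31, so a
selector value of 37 becomes 37 & 31 == 5, i.e. element 5 of the
first input, matching the modulo semantics vec_perm expects
(37 mod 32 == 5).  */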
17655
17656 /* For big-endian, we also need to reverse the index within the vector
17657 (but not which vector). */
17658 if (BYTES_BIG_ENDIAN)
17659 {
17660 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
17661 if (!one_vector_p)
17662 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
17663 sel = expand_simple_binop (vmode, XOR, sel, mask,
17664 NULL, 0, OPTAB_LIB_WIDEN);
17665 }
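/* For instance, with two V16QI inputs (nelt == 16) on big-endian, an
index of 19 (element 3 of the second input) becomes 19 ^ 15 == 28
(element 12 of the second input): the position within the vector is
mirrored while the choice of vector is preserved.  */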
17666 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
17667 }
17668
17669 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
17670
17671 static void
17672 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
17673 {
17674 emit_insn (gen_rtx_SET (target,
17675 gen_rtx_UNSPEC (GET_MODE (target),
17676 gen_rtvec (2, op0, op1), code)));
17677 }
17678
17679 /* Expand an SVE vec_perm with the given operands. */
17680
17681 void
17682 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
17683 {
17684 machine_mode data_mode = GET_MODE (target);
17685 machine_mode sel_mode = GET_MODE (sel);
17686 /* Enforced by the pattern condition. */
17687 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
17688
17689 /* Note: vec_perm indices are supposed to wrap when they go beyond the
17690 size of the two value vectors, i.e. the upper bits of the indices
17691 are effectively ignored. SVE TBL instead produces 0 for any
17692 out-of-range indices, so we need to modulo all the vec_perm indices
17693 to ensure they are all in range. */
17694 rtx sel_reg = force_reg (sel_mode, sel);
17695
17696 /* Check if the sel only references the first values vector. */
17697 if (GET_CODE (sel) == CONST_VECTOR
17698 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
17699 {
17700 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
17701 return;
17702 }
17703
17704 /* Check if the two values vectors are the same. */
17705 if (rtx_equal_p (op0, op1))
17706 {
17707 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
17708 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17709 NULL, 0, OPTAB_DIRECT);
17710 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
17711 return;
17712 }
17713
17714 /* Run TBL on each value vector and combine the results. */
17715
17716 rtx res0 = gen_reg_rtx (data_mode);
17717 rtx res1 = gen_reg_rtx (data_mode);
17718 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
17719 if (GET_CODE (sel) != CONST_VECTOR
17720 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
17721 {
17722 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
17723 2 * nunits - 1);
17724 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17725 NULL, 0, OPTAB_DIRECT);
17726 }
17727 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
17728 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
17729 NULL, 0, OPTAB_DIRECT);
17730 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
17731 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
17732 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
17733 else
17734 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
17735 }
17736
17737 /* Recognize patterns suitable for the TRN instructions. */
17738 static bool
17739 aarch64_evpc_trn (struct expand_vec_perm_d *d)
17740 {
17741 HOST_WIDE_INT odd;
17742 poly_uint64 nelt = d->perm.length ();
17743 rtx out, in0, in1, x;
17744 machine_mode vmode = d->vmode;
17745
17746 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17747 return false;
17748
17749 /* Note that these are little-endian tests.
17750 We correct for big-endian later. */
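/* As a concrete example, for V4SI (nelt == 4) the selectors accepted
here are {0, 4, 2, 6} (odd == 0, TRN1) and {1, 5, 3, 7}
(odd == 1, TRN2).  */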
17751 if (!d->perm[0].is_constant (&odd)
17752 || (odd != 0 && odd != 1)
17753 || !d->perm.series_p (0, 2, odd, 2)
17754 || !d->perm.series_p (1, 2, nelt + odd, 2))
17755 return false;
17756
17757 /* Success! */
17758 if (d->testing_p)
17759 return true;
17760
17761 in0 = d->op0;
17762 in1 = d->op1;
17763 /* We don't need a big-endian lane correction for SVE; see the comment
17764 at the head of aarch64-sve.md for details. */
17765 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17766 {
17767 x = in0, in0 = in1, in1 = x;
17768 odd = !odd;
17769 }
17770 out = d->target;
17771
17772 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17773 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
17774 return true;
17775 }
17776
17777 /* Recognize patterns suitable for the UZP instructions. */
17778 static bool
17779 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
17780 {
17781 HOST_WIDE_INT odd;
17782 rtx out, in0, in1, x;
17783 machine_mode vmode = d->vmode;
17784
17785 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17786 return false;
17787
17788 /* Note that these are little-endian tests.
17789 We correct for big-endian later. */
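/* As a concrete example, for V4SI (nelt == 4) the selectors accepted
here are {0, 2, 4, 6} (odd == 0, UZP1) and {1, 3, 5, 7}
(odd == 1, UZP2).  */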
17790 if (!d->perm[0].is_constant (&odd)
17791 || (odd != 0 && odd != 1)
17792 || !d->perm.series_p (0, 1, odd, 2))
17793 return false;
17794
17795 /* Success! */
17796 if (d->testing_p)
17797 return true;
17798
17799 in0 = d->op0;
17800 in1 = d->op1;
17801 /* We don't need a big-endian lane correction for SVE; see the comment
17802 at the head of aarch64-sve.md for details. */
17803 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17804 {
17805 x = in0, in0 = in1, in1 = x;
17806 odd = !odd;
17807 }
17808 out = d->target;
17809
17810 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17811 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
17812 return true;
17813 }
17814
17815 /* Recognize patterns suitable for the ZIP instructions. */
17816 static bool
17817 aarch64_evpc_zip (struct expand_vec_perm_d *d)
17818 {
17819 unsigned int high;
17820 poly_uint64 nelt = d->perm.length ();
17821 rtx out, in0, in1, x;
17822 machine_mode vmode = d->vmode;
17823
17824 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17825 return false;
17826
17827 /* Note that these are little-endian tests.
17828 We correct for big-endian later. */
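/* As a concrete example, for V4SI (nelt == 4) the selectors accepted
here are {0, 4, 1, 5} (first == 0, ZIP1) and {2, 6, 3, 7}
(first == nelt / 2, ZIP2).  */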
17829 poly_uint64 first = d->perm[0];
17830 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
17831 || !d->perm.series_p (0, 2, first, 1)
17832 || !d->perm.series_p (1, 2, first + nelt, 1))
17833 return false;
17834 high = maybe_ne (first, 0U);
17835
17836 /* Success! */
17837 if (d->testing_p)
17838 return true;
17839
17840 in0 = d->op0;
17841 in1 = d->op1;
17842 /* We don't need a big-endian lane correction for SVE; see the comment
17843 at the head of aarch64-sve.md for details. */
17844 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17845 {
17846 x = in0, in0 = in1, in1 = x;
17847 high = !high;
17848 }
17849 out = d->target;
17850
17851 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17852 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
17853 return true;
17854 }
17855
17856 /* Recognize patterns for the EXT insn. */
17857
17858 static bool
17859 aarch64_evpc_ext (struct expand_vec_perm_d *d)
17860 {
17861 HOST_WIDE_INT location;
17862 rtx offset;
17863
17864 /* The first element always refers to the first vector.
17865 Check if the extracted indices are increasing by one. */
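/* For example, with V4SI a selector of {1, 2, 3, 4} takes the last
three elements of the first input followed by the first element of
the second input, i.e. an EXT with an element offset of 1 (before
the big-endian adjustment below).  */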
17866 if (d->vec_flags == VEC_SVE_PRED
17867 || !d->perm[0].is_constant (&location)
17868 || !d->perm.series_p (0, 1, location, 1))
17869 return false;
17870
17871 /* Success! */
17872 if (d->testing_p)
17873 return true;
17874
17875 /* The case where (location == 0) is a no-op for both big- and little-endian,
17876 and is removed by the mid-end at optimization levels -O1 and higher.
17877
17878 We don't need a big-endian lane correction for SVE; see the comment
17879 at the head of aarch64-sve.md for details. */
17880 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
17881 {
17882 /* After setup, we want the high elements of the first vector (stored
17883 at the LSB end of the register), and the low elements of the second
17884 vector (stored at the MSB end of the register). So swap. */
17885 std::swap (d->op0, d->op1);
17886 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
17887 to_constant () is safe since this is restricted to Advanced SIMD
17888 vectors. */
17889 location = d->perm.length ().to_constant () - location;
17890 }
17891
17892 offset = GEN_INT (location);
17893 emit_set_insn (d->target,
17894 gen_rtx_UNSPEC (d->vmode,
17895 gen_rtvec (3, d->op0, d->op1, offset),
17896 UNSPEC_EXT));
17897 return true;
17898 }
17899
17900 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
17901 within each 64-bit, 32-bit or 16-bit granule. */
17902
17903 static bool
17904 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
17905 {
17906 HOST_WIDE_INT diff;
17907 unsigned int i, size, unspec;
17908 machine_mode pred_mode;
17909
17910 if (d->vec_flags == VEC_SVE_PRED
17911 || !d->one_vector_p
17912 || !d->perm[0].is_constant (&diff))
17913 return false;
17914
17915 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
17916 if (size == 8)
17917 {
17918 unspec = UNSPEC_REV64;
17919 pred_mode = VNx2BImode;
17920 }
17921 else if (size == 4)
17922 {
17923 unspec = UNSPEC_REV32;
17924 pred_mode = VNx4BImode;
17925 }
17926 else if (size == 2)
17927 {
17928 unspec = UNSPEC_REV16;
17929 pred_mode = VNx8BImode;
17930 }
17931 else
17932 return false;
17933
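/* As a worked example, the V8HI selector {3, 2, 1, 0, 7, 6, 5, 4}
gives diff == 3 and hence size == 8 above (REV64); the series check
below then confirms that every 4-element block is reversed.  */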
17934 unsigned int step = diff + 1;
17935 for (i = 0; i < step; ++i)
17936 if (!d->perm.series_p (i, step, diff - i, step))
17937 return false;
17938
17939 /* Success! */
17940 if (d->testing_p)
17941 return true;
17942
17943 if (d->vec_flags == VEC_SVE_DATA)
17944 {
17945 machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
17946 rtx target = gen_reg_rtx (int_mode);
17947 if (BYTES_BIG_ENDIAN)
17948 /* The act of taking a subreg between INT_MODE and d->vmode
17949 is itself a reversing operation on big-endian targets;
17950 see the comment at the head of aarch64-sve.md for details.
17951 First reinterpret OP0 as INT_MODE without using a subreg
17952 and without changing the contents. */
17953 emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
17954 else
17955 {
17956 /* For SVE we use REV[BHW] unspecs derived from the element size
17957 of d->vmode and vector modes whose elements have SIZE bytes.
17958 This ensures that the vector modes match the predicate modes. */
17959 int unspec = aarch64_sve_rev_unspec (d->vmode);
17960 rtx pred = aarch64_ptrue_reg (pred_mode);
17961 emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
17962 gen_lowpart (int_mode, d->op0)));
17963 }
17964 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
17965 return true;
17966 }
17967 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
17968 emit_set_insn (d->target, src);
17969 return true;
17970 }
17971
17972 /* Recognize patterns for the REV insn, which reverses elements within
17973 a full vector. */
17974
17975 static bool
17976 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
17977 {
17978 poly_uint64 nelt = d->perm.length ();
17979
17980 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
17981 return false;
17982
17983 if (!d->perm.series_p (0, 1, nelt - 1, -1))
17984 return false;
17985
17986 /* Success! */
17987 if (d->testing_p)
17988 return true;
17989
17990 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
17991 emit_set_insn (d->target, src);
17992 return true;
17993 }
17994
17995 static bool
17996 aarch64_evpc_dup (struct expand_vec_perm_d *d)
17997 {
17998 rtx out = d->target;
17999 rtx in0;
18000 HOST_WIDE_INT elt;
18001 machine_mode vmode = d->vmode;
18002 rtx lane;
18003
18004 if (d->vec_flags == VEC_SVE_PRED
18005 || d->perm.encoding ().encoded_nelts () != 1
18006 || !d->perm[0].is_constant (&elt))
18007 return false;
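/* At this point the selector is known to be a broadcast of a single
constant element, e.g. {2, 2, 2, 2} for V4SI, which on Advanced SIMD
corresponds to DUP Vd.4S, Vn.S[2].  */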
18008
18009 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
18010 return false;
18011
18012 /* Success! */
18013 if (d->testing_p)
18014 return true;
18015
18016 /* The generic preparation in aarch64_expand_vec_perm_const_1
18017 swaps the operand order and the permute indices if it finds
18018 d->perm[0] to be in the second operand. Thus, we can always
18019 use d->op0 and need not do any extra arithmetic to get the
18020 correct lane number. */
18021 in0 = d->op0;
18022 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
18023
18024 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
18025 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
18026 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
18027 return true;
18028 }
18029
18030 static bool
18031 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
18032 {
18033 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
18034 machine_mode vmode = d->vmode;
18035
18036 /* Make sure that the indices are constant. */
18037 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
18038 for (unsigned int i = 0; i < encoded_nelts; ++i)
18039 if (!d->perm[i].is_constant ())
18040 return false;
18041
18042 if (d->testing_p)
18043 return true;
18044
18045 /* Generic code will try constant permutation twice: once with the
18046 original mode and again with the elements lowered to QImode.
18047 So wait and don't do the selector expansion ourselves. */
18048 if (vmode != V8QImode && vmode != V16QImode)
18049 return false;
18050
18051 /* to_constant is safe since this routine is specific to Advanced SIMD
18052 vectors. */
18053 unsigned int nelt = d->perm.length ().to_constant ();
18054 for (unsigned int i = 0; i < nelt; ++i)
18055 /* If big-endian and two vectors we end up with a weird mixed-endian
18056 mode on NEON. Reverse the index within each word but not the word
18057 itself. to_constant is safe because we checked is_constant above. */
18058 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
18059 ? d->perm[i].to_constant () ^ (nelt - 1)
18060 : d->perm[i].to_constant ());
18061
18062 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
18063 sel = force_reg (vmode, sel);
18064
18065 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
18066 return true;
18067 }
18068
18069 /* Try to implement D using an SVE TBL instruction. */
18070
18071 static bool
18072 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
18073 {
18074 unsigned HOST_WIDE_INT nelt;
18075
18076 /* Permuting two variable-length vectors could overflow the
18077 index range. */
18078 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
18079 return false;
18080
18081 if (d->testing_p)
18082 return true;
18083
18084 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
18085 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
18086 if (d->one_vector_p)
18087 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
18088 else
18089 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
18090 return true;
18091 }
18092
18093 /* Try to implement D using SVE SEL instruction. */
18094
18095 static bool
18096 aarch64_evpc_sel (struct expand_vec_perm_d *d)
18097 {
18098 machine_mode vmode = d->vmode;
18099 int unit_size = GET_MODE_UNIT_SIZE (vmode);
18100
18101 if (d->vec_flags != VEC_SVE_DATA
18102 || unit_size > 8)
18103 return false;
18104
18105 int n_patterns = d->perm.encoding ().npatterns ();
18106 poly_int64 vec_len = d->perm.length ();
18107
18108 for (int i = 0; i < n_patterns; ++i)
18109 if (!known_eq (d->perm[i], i)
18110 && !known_eq (d->perm[i], vec_len + i))
18111 return false;
18112
18113 for (int i = n_patterns; i < n_patterns * 2; i++)
18114 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
18115 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
18116 return false;
18117
18118 if (d->testing_p)
18119 return true;
18120
18121 machine_mode pred_mode = aarch64_sve_pred_mode (unit_size).require ();
18122
18123 rtx_vector_builder builder (pred_mode, n_patterns, 2);
18124 for (int i = 0; i < n_patterns * 2; i++)
18125 {
18126 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
18127 : CONST0_RTX (BImode);
18128 builder.quick_push (elem);
18129 }
18130
18131 rtx const_vec = builder.build ();
18132 rtx pred = force_reg (pred_mode, const_vec);
18133 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op1, d->op0, pred));
18134 return true;
18135 }
18136
18137 static bool
18138 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
18139 {
18140 /* The pattern matching functions above are written to look for a small
18141 number to begin the sequence (0, 1, N/2). If we begin with an index
18142 from the second operand, we can swap the operands. */
18143 poly_int64 nelt = d->perm.length ();
18144 if (known_ge (d->perm[0], nelt))
18145 {
18146 d->perm.rotate_inputs (1);
18147 std::swap (d->op0, d->op1);
18148 }
18149
18150 if ((d->vec_flags == VEC_ADVSIMD
18151 || d->vec_flags == VEC_SVE_DATA
18152 || d->vec_flags == VEC_SVE_PRED)
18153 && known_gt (nelt, 1))
18154 {
18155 if (aarch64_evpc_rev_local (d))
18156 return true;
18157 else if (aarch64_evpc_rev_global (d))
18158 return true;
18159 else if (aarch64_evpc_ext (d))
18160 return true;
18161 else if (aarch64_evpc_dup (d))
18162 return true;
18163 else if (aarch64_evpc_zip (d))
18164 return true;
18165 else if (aarch64_evpc_uzp (d))
18166 return true;
18167 else if (aarch64_evpc_trn (d))
18168 return true;
18169 else if (aarch64_evpc_sel (d))
18170 return true;
18171 if (d->vec_flags == VEC_SVE_DATA)
18172 return aarch64_evpc_sve_tbl (d);
18173 else if (d->vec_flags == VEC_ADVSIMD)
18174 return aarch64_evpc_tbl (d);
18175 }
18176 return false;
18177 }
18178
18179 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
18180
18181 static bool
18182 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
18183 rtx op1, const vec_perm_indices &sel)
18184 {
18185 struct expand_vec_perm_d d;
18186
18187 /* Check whether the mask can be applied to a single vector. */
18188 if (sel.ninputs () == 1
18189 || (op0 && rtx_equal_p (op0, op1)))
18190 d.one_vector_p = true;
18191 else if (sel.all_from_input_p (0))
18192 {
18193 d.one_vector_p = true;
18194 op1 = op0;
18195 }
18196 else if (sel.all_from_input_p (1))
18197 {
18198 d.one_vector_p = true;
18199 op0 = op1;
18200 }
18201 else
18202 d.one_vector_p = false;
18203
18204 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
18205 sel.nelts_per_input ());
18206 d.vmode = vmode;
18207 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
18208 d.target = target;
18209 d.op0 = op0;
18210 d.op1 = op1;
18211 d.testing_p = !target;
18212
18213 if (!d.testing_p)
18214 return aarch64_expand_vec_perm_const_1 (&d);
18215
18216 rtx_insn *last = get_last_insn ();
18217 bool ret = aarch64_expand_vec_perm_const_1 (&d);
18218 gcc_assert (last == get_last_insn ());
18219
18220 return ret;
18221 }
18222
18223 /* Generate a byte permute mask for a register of mode MODE,
18224 which has NUNITS units. */
18225
18226 rtx
18227 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
18228 {
18229 /* We have to reverse each vector because we don't have
18230 a permuted load that can reverse-load according to ABI rules. */
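/* For example, for V8HImode (NUNITS == 8, unit size 2) the mask built
below is {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14},
i.e. the bytes within each element are reversed while the element
positions are unchanged.  */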
18231 rtx mask;
18232 rtvec v = rtvec_alloc (16);
18233 unsigned int i, j;
18234 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
18235
18236 gcc_assert (BYTES_BIG_ENDIAN);
18237 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
18238
18239 for (i = 0; i < nunits; i++)
18240 for (j = 0; j < usize; j++)
18241 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
18242 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
18243 return force_reg (V16QImode, mask);
18244 }
18245
18246 /* Expand an SVE integer comparison using the SVE equivalent of:
18247
18248 (set TARGET (CODE OP0 OP1)). */
18249
18250 void
18251 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
18252 {
18253 machine_mode pred_mode = GET_MODE (target);
18254 machine_mode data_mode = GET_MODE (op0);
18255 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
18256 op0, op1);
18257 if (!rtx_equal_p (target, res))
18258 emit_move_insn (target, res);
18259 }
18260
18261 /* Return the UNSPEC_COND_* code for comparison CODE. */
18262
18263 static unsigned int
18264 aarch64_unspec_cond_code (rtx_code code)
18265 {
18266 switch (code)
18267 {
18268 case NE:
18269 return UNSPEC_COND_FCMNE;
18270 case EQ:
18271 return UNSPEC_COND_FCMEQ;
18272 case LT:
18273 return UNSPEC_COND_FCMLT;
18274 case GT:
18275 return UNSPEC_COND_FCMGT;
18276 case LE:
18277 return UNSPEC_COND_FCMLE;
18278 case GE:
18279 return UNSPEC_COND_FCMGE;
18280 case UNORDERED:
18281 return UNSPEC_COND_FCMUO;
18282 default:
18283 gcc_unreachable ();
18284 }
18285 }
18286
18287 /* Emit:
18288
18289 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
18290
18291 where <X> is the operation associated with comparison CODE.
18292 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18293
18294 static void
18295 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
18296 bool known_ptrue_p, rtx op0, rtx op1)
18297 {
18298 rtx flag = gen_int_mode (known_ptrue_p, SImode);
18299 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
18300 gen_rtvec (4, pred, flag, op0, op1),
18301 aarch64_unspec_cond_code (code));
18302 emit_set_insn (target, unspec);
18303 }
18304
18305 /* Emit the SVE equivalent of:
18306
18307 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
18308 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
18309 (set TARGET (ior:PRED_MODE TMP1 TMP2))
18310
18311 where <Xi> is the operation associated with comparison CODEi.
18312 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18313
18314 static void
18315 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
18316 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
18317 {
18318 machine_mode pred_mode = GET_MODE (pred);
18319 rtx tmp1 = gen_reg_rtx (pred_mode);
18320 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
18321 rtx tmp2 = gen_reg_rtx (pred_mode);
18322 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
18323 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
18324 }
18325
18326 /* Emit the SVE equivalent of:
18327
18328 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
18329 (set TARGET (not TMP))
18330
18331 where <X> is the operation associated with comparison CODE.
18332 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18333
18334 static void
18335 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
18336 bool known_ptrue_p, rtx op0, rtx op1)
18337 {
18338 machine_mode pred_mode = GET_MODE (pred);
18339 rtx tmp = gen_reg_rtx (pred_mode);
18340 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
18341 aarch64_emit_unop (target, one_cmpl_optab, tmp);
18342 }
18343
18344 /* Expand an SVE floating-point comparison using the SVE equivalent of:
18345
18346 (set TARGET (CODE OP0 OP1))
18347
18348 If CAN_INVERT_P is true, the caller can also handle inverted results;
18349 return true if the result is in fact inverted. */
18350
18351 bool
18352 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
18353 rtx op0, rtx op1, bool can_invert_p)
18354 {
18355 machine_mode pred_mode = GET_MODE (target);
18356 machine_mode data_mode = GET_MODE (op0);
18357
18358 rtx ptrue = aarch64_ptrue_reg (pred_mode);
18359 switch (code)
18360 {
18361 case UNORDERED:
18362 /* UNORDERED has no immediate form. */
18363 op1 = force_reg (data_mode, op1);
18364 /* fall through */
18365 case LT:
18366 case LE:
18367 case GT:
18368 case GE:
18369 case EQ:
18370 case NE:
18371 {
18372 /* There is native support for the comparison. */
18373 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18374 return false;
18375 }
18376
18377 case LTGT:
18378 /* This is a trapping operation (LT or GT). */
18379 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
18380 return false;
18381
18382 case UNEQ:
18383 if (!flag_trapping_math)
18384 {
18385 /* This would trap for signaling NaNs. */
18386 op1 = force_reg (data_mode, op1);
18387 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
18388 ptrue, true, op0, op1);
18389 return false;
18390 }
18391 /* fall through */
18392 case UNLT:
18393 case UNLE:
18394 case UNGT:
18395 case UNGE:
18396 if (flag_trapping_math)
18397 {
18398 /* Work out which elements are ordered. */
18399 rtx ordered = gen_reg_rtx (pred_mode);
18400 op1 = force_reg (data_mode, op1);
18401 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
18402 ptrue, true, op0, op1);
18403
18404 /* Test the opposite condition for the ordered elements,
18405 then invert the result. */
18406 if (code == UNEQ)
18407 code = NE;
18408 else
18409 code = reverse_condition_maybe_unordered (code);
18410 if (can_invert_p)
18411 {
18412 aarch64_emit_sve_fp_cond (target, code,
18413 ordered, false, op0, op1);
18414 return true;
18415 }
18416 aarch64_emit_sve_invert_fp_cond (target, code,
18417 ordered, false, op0, op1);
18418 return false;
18419 }
18420 break;
18421
18422 case ORDERED:
18423 /* ORDERED has no immediate form. */
18424 op1 = force_reg (data_mode, op1);
18425 break;
18426
18427 default:
18428 gcc_unreachable ();
18429 }
18430
18431 /* There is native support for the inverse comparison. */
18432 code = reverse_condition_maybe_unordered (code);
18433 if (can_invert_p)
18434 {
18435 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18436 return true;
18437 }
18438 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
18439 return false;
18440 }
18441
18442 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
18443 of the data being selected and CMP_MODE is the mode of the values being
18444 compared. */
18445
18446 void
18447 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
18448 rtx *ops)
18449 {
18450 machine_mode pred_mode
18451 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
18452 GET_MODE_SIZE (cmp_mode)).require ();
18453 rtx pred = gen_reg_rtx (pred_mode);
18454 if (FLOAT_MODE_P (cmp_mode))
18455 {
18456 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
18457 ops[4], ops[5], true))
18458 std::swap (ops[1], ops[2]);
18459 }
18460 else
18461 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
18462
18463 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
18464 ops[1] = force_reg (data_mode, ops[1]);
18465 /* The "false" value can only be zero if the "true" value is a constant. */
18466 if (register_operand (ops[1], data_mode)
18467 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
18468 ops[2] = force_reg (data_mode, ops[2]);
18469
18470 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
18471 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
18472 }
18473
18474 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
18475 true. However, due to issues with register allocation it is preferable
18476 to avoid tying integer scalar and FP scalar modes. Executing integer
18477 operations in general registers is better than treating them as scalar
18478 vector operations. This reduces latency and avoids redundant int<->FP
18479 moves. So tie modes if they are either the same class, or vector modes
18480 with other vector modes, vector structs or any scalar mode. */
18481
18482 static bool
18483 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
18484 {
18485 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
18486 return true;
18487
18488 /* We specifically want to allow elements of "structure" modes to
18489 be tieable to the structure. This more general condition allows
18490 other rarer situations too. The reason we don't extend this to
18491 predicate modes is that there are no predicate structure modes
18492 nor any specific instructions for extracting part of a predicate
18493 register. */
18494 if (aarch64_vector_data_mode_p (mode1)
18495 && aarch64_vector_data_mode_p (mode2))
18496 return true;
18497
18498 /* Also allow any scalar modes with vectors. */
18499 if (aarch64_vector_mode_supported_p (mode1)
18500 || aarch64_vector_mode_supported_p (mode2))
18501 return true;
18502
18503 return false;
18504 }
18505
18506 /* Return a new RTX holding the result of moving POINTER forward by
18507 AMOUNT bytes. */
18508
18509 static rtx
18510 aarch64_move_pointer (rtx pointer, poly_int64 amount)
18511 {
18512 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
18513
18514 return adjust_automodify_address (pointer, GET_MODE (pointer),
18515 next, amount);
18516 }
18517
18518 /* Return a new RTX holding the result of moving POINTER forward by the
18519 size of the mode it points to. */
18520
18521 static rtx
18522 aarch64_progress_pointer (rtx pointer)
18523 {
18524 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
18525 }
18526
18527 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
18528 MODE bytes. */
18529
18530 static void
18531 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
18532 machine_mode mode)
18533 {
18534 rtx reg = gen_reg_rtx (mode);
18535
18536 /* "Cast" the pointers to the correct mode. */
18537 *src = adjust_address (*src, mode, 0);
18538 *dst = adjust_address (*dst, mode, 0);
18539 /* Emit the memcpy. */
18540 emit_move_insn (reg, *src);
18541 emit_move_insn (*dst, reg);
18542 /* Move the pointers forward. */
18543 *src = aarch64_progress_pointer (*src);
18544 *dst = aarch64_progress_pointer (*dst);
18545 }
18546
18547 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
18548 we succeed, otherwise return false. */
18549
18550 bool
18551 aarch64_expand_cpymem (rtx *operands)
18552 {
18553 int n, mode_bits;
18554 rtx dst = operands[0];
18555 rtx src = operands[1];
18556 rtx base;
18557 machine_mode cur_mode = BLKmode, next_mode;
18558 bool speed_p = !optimize_function_for_size_p (cfun);
18559
18560 /* When optimizing for size, give a better estimate of the length of a
18561 memcpy call, but use the default otherwise. Moves larger than 8 bytes
18562 will always require an even number of instructions to do now. And each
18563 operation requires both a load and a store, so divide the max number by 2. */
18564 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
18565
18566 /* We can't do anything smart if the amount to copy is not constant. */
18567 if (!CONST_INT_P (operands[2]))
18568 return false;
18569
18570 n = INTVAL (operands[2]);
18571
18572 /* Try to keep the number of instructions low. For all cases we will do at
18573 most two moves for the residual amount, since we'll always overlap the
18574 remainder. */
18575 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
18576 return false;
18577
18578 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
18579 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
18580
18581 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
18582 src = adjust_automodify_address (src, VOIDmode, base, 0);
18583
18584 /* Convert n to bits to make the rest of the code simpler. */
18585 n = n * BITS_PER_UNIT;
18586
18587 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
18588 larger than TImode, but we should not use them for loads/stores here. */
18589 const int copy_limit = GET_MODE_BITSIZE (TImode);
18590
18591 while (n > 0)
18592 {
18593 /* Find the largest mode in which to do the copy without over-reading
18594 or over-writing. */
18595 opt_scalar_int_mode mode_iter;
18596 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
18597 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
18598 cur_mode = mode_iter.require ();
18599
18600 gcc_assert (cur_mode != BLKmode);
18601
18602 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
18603 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
18604
18605 n -= mode_bits;
18606
18607 /* Do certain trailing copies as overlapping if it's going to be
18608 cheaper, i.e. fewer instructions to do so. For instance, for a 15
18609 byte copy it's more efficient to do two overlapping 8 byte copies than
18610 8 + 6 + 1. */
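/* As a worked example, for n == 15 bytes the loop above first emits
one 8-byte (DImode) copy, leaving 7 bytes; the code below then moves
both pointers back by one byte and emits a second 8-byte copy, so
the 15 bytes are covered by two overlapping DImode moves.  */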
18611 if (n > 0 && n <= 8 * BITS_PER_UNIT)
18612 {
18613 next_mode = smallest_mode_for_size (n, MODE_INT);
18614 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
18615 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
18616 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
18617 n = n_bits;
18618 }
18619 }
18620
18621 return true;
18622 }
18623
18624 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
18625 SImode stores. Handle the case when the constant has identical
18626 bottom and top halves. This is beneficial when the two stores can be
18627 merged into an STP and we avoid synthesising potentially expensive
18628 immediates twice. Return true if such a split is possible. */
18629
18630 bool
18631 aarch64_split_dimode_const_store (rtx dst, rtx src)
18632 {
18633 rtx lo = gen_lowpart (SImode, src);
18634 rtx hi = gen_highpart_mode (SImode, DImode, src);
18635
18636 bool size_p = optimize_function_for_size_p (cfun);
18637
18638 if (!rtx_equal_p (lo, hi))
18639 return false;
18640
18641 unsigned int orig_cost
18642 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
18643 unsigned int lo_cost
18644 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
18645
18646 /* We want to transform:
18647 MOV x1, 49370
18648 MOVK x1, 0x140, lsl 16
18649 MOVK x1, 0xc0da, lsl 32
18650 MOVK x1, 0x140, lsl 48
18651 STR x1, [x0]
18652 into:
18653 MOV w1, 49370
18654 MOVK w1, 0x140, lsl 16
18655 STP w1, w1, [x0]
18656 So we want to perform this only when we save two instructions
18657 or more. When optimizing for size, however, accept any code size
18658 savings we can. */
18659 if (size_p && orig_cost <= lo_cost)
18660 return false;
18661
18662 if (!size_p
18663 && (orig_cost <= lo_cost + 1))
18664 return false;
18665
18666 rtx mem_lo = adjust_address (dst, SImode, 0);
18667 if (!aarch64_mem_pair_operand (mem_lo, SImode))
18668 return false;
18669
18670 rtx tmp_reg = gen_reg_rtx (SImode);
18671 aarch64_expand_mov_immediate (tmp_reg, lo);
18672 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
18673 /* Don't emit an explicit store pair as this may not always be profitable.
18674 Let the sched-fusion logic decide whether to merge them. */
18675 emit_move_insn (mem_lo, tmp_reg);
18676 emit_move_insn (mem_hi, tmp_reg);
18677
18678 return true;
18679 }
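/* For illustration, a (hypothetical) store such as

void f (unsigned long long *p) { *p = 0x0140c0da0140c0daULL; }

has identical 32-bit halves, so the function above emits the two
SImode stores of a single W register and leaves it to sched-fusion
to turn them into an STP.  */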
18680
18681 /* Generate RTL for a conditional branch with rtx comparison CODE in
18682 mode CC_MODE. The destination of the unlikely conditional branch
18683 is LABEL_REF. */
18684
18685 void
18686 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
18687 rtx label_ref)
18688 {
18689 rtx x;
18690 x = gen_rtx_fmt_ee (code, VOIDmode,
18691 gen_rtx_REG (cc_mode, CC_REGNUM),
18692 const0_rtx);
18693
18694 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18695 gen_rtx_LABEL_REF (VOIDmode, label_ref),
18696 pc_rtx);
18697 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18698 }
18699
18700 /* Generate DImode scratch registers for 128-bit (TImode) addition.
18701
18702 OP1 represents the TImode destination operand 1
18703 OP2 represents the TImode destination operand 2
18704 LOW_DEST represents the low half (DImode) of TImode operand 0
18705 LOW_IN1 represents the low half (DImode) of TImode operand 1
18706 LOW_IN2 represents the low half (DImode) of TImode operand 2
18707 HIGH_DEST represents the high half (DImode) of TImode operand 0
18708 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18709 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18710
18711 void
18712 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18713 rtx *low_in1, rtx *low_in2,
18714 rtx *high_dest, rtx *high_in1,
18715 rtx *high_in2)
18716 {
18717 *low_dest = gen_reg_rtx (DImode);
18718 *low_in1 = gen_lowpart (DImode, op1);
18719 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18720 subreg_lowpart_offset (DImode, TImode));
18721 *high_dest = gen_reg_rtx (DImode);
18722 *high_in1 = gen_highpart (DImode, op1);
18723 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18724 subreg_highpart_offset (DImode, TImode));
18725 }
18726
18727 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
18728
18729 This function differs from 'aarch64_addti_scratch_regs' in that
18730 OP1 can be an immediate constant (zero). We must call
18731 subreg_highpart_offset with DImode and TImode arguments, otherwise
18732 VOIDmode will be used for the const_int which generates an internal
18733 error from subreg_size_highpart_offset which does not expect a size of zero.
18734
18735 OP1 represents the TImode destination operand 1
18736 OP2 represents the TImode destination operand 2
18737 LOW_DEST represents the low half (DImode) of TImode operand 0
18738 LOW_IN1 represents the low half (DImode) of TImode operand 1
18739 LOW_IN2 represents the low half (DImode) of TImode operand 2
18740 HIGH_DEST represents the high half (DImode) of TImode operand 0
18741 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18742 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18743
18744
18745 void
18746 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18747 rtx *low_in1, rtx *low_in2,
18748 rtx *high_dest, rtx *high_in1,
18749 rtx *high_in2)
18750 {
18751 *low_dest = gen_reg_rtx (DImode);
18752 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
18753 subreg_lowpart_offset (DImode, TImode));
18754
18755 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18756 subreg_lowpart_offset (DImode, TImode));
18757 *high_dest = gen_reg_rtx (DImode);
18758
18759 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
18760 subreg_highpart_offset (DImode, TImode));
18761 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18762 subreg_highpart_offset (DImode, TImode));
18763 }
18764
18765 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
18766
18767 OP0 represents the TImode destination operand 0
18768 LOW_DEST represents the low half (DImode) of TImode operand 0
18769 LOW_IN1 represents the low half (DImode) of TImode operand 1
18770 LOW_IN2 represents the low half (DImode) of TImode operand 2
18771 HIGH_DEST represents the high half (DImode) of TImode operand 0
18772 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18773 HIGH_IN2 represents the high half (DImode) of TImode operand 2
18774 UNSIGNED_P is true if the operation is being performed on unsigned
18775 values. */
18776 void
18777 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
18778 rtx low_in2, rtx high_dest, rtx high_in1,
18779 rtx high_in2, bool unsigned_p)
18780 {
18781 if (low_in2 == const0_rtx)
18782 {
18783 low_dest = low_in1;
18784 high_in2 = force_reg (DImode, high_in2);
18785 if (unsigned_p)
18786 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
18787 else
18788 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
18789 }
18790 else
18791 {
18792 if (CONST_INT_P (low_in2))
18793 {
18794 high_in2 = force_reg (DImode, high_in2);
18795 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
18796 GEN_INT (-INTVAL (low_in2))));
18797 }
18798 else
18799 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
18800
18801 if (unsigned_p)
18802 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
18803 else
18804 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
18805 }
18806
18807 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
18808 emit_move_insn (gen_highpart (DImode, op0), high_dest);
18809
18810 }
18811
18812 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
18813
18814 static unsigned HOST_WIDE_INT
18815 aarch64_asan_shadow_offset (void)
18816 {
18817 if (TARGET_ILP32)
18818 return (HOST_WIDE_INT_1 << 29);
18819 else
18820 return (HOST_WIDE_INT_1 << 36);
18821 }
18822
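/* The two hooks below (TARGET_GEN_CCMP_FIRST / TARGET_GEN_CCMP_NEXT)
let the middle-end chain comparisons with conditional compares.
As a rough illustration (exact flags and immediates are chosen by
the expanders), for

if (x == 3 && y == 5) ...

the chain has the shape

cmp w0, #3
ccmp w1, #5, #0, eq
b.eq ...

rather than two separate compare-and-branch sequences.  */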
18823 static rtx
18824 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
18825 int code, tree treeop0, tree treeop1)
18826 {
18827 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18828 rtx op0, op1;
18829 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18830 insn_code icode;
18831 struct expand_operand ops[4];
18832
18833 start_sequence ();
18834 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18835
18836 op_mode = GET_MODE (op0);
18837 if (op_mode == VOIDmode)
18838 op_mode = GET_MODE (op1);
18839
18840 switch (op_mode)
18841 {
18842 case E_QImode:
18843 case E_HImode:
18844 case E_SImode:
18845 cmp_mode = SImode;
18846 icode = CODE_FOR_cmpsi;
18847 break;
18848
18849 case E_DImode:
18850 cmp_mode = DImode;
18851 icode = CODE_FOR_cmpdi;
18852 break;
18853
18854 case E_SFmode:
18855 cmp_mode = SFmode;
18856 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18857 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
18858 break;
18859
18860 case E_DFmode:
18861 cmp_mode = DFmode;
18862 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18863 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
18864 break;
18865
18866 default:
18867 end_sequence ();
18868 return NULL_RTX;
18869 }
18870
18871 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
18872 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
18873 if (!op0 || !op1)
18874 {
18875 end_sequence ();
18876 return NULL_RTX;
18877 }
18878 *prep_seq = get_insns ();
18879 end_sequence ();
18880
18881 create_fixed_operand (&ops[0], op0);
18882 create_fixed_operand (&ops[1], op1);
18883
18884 start_sequence ();
18885 if (!maybe_expand_insn (icode, 2, ops))
18886 {
18887 end_sequence ();
18888 return NULL_RTX;
18889 }
18890 *gen_seq = get_insns ();
18891 end_sequence ();
18892
18893 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
18894 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
18895 }
18896
18897 static rtx
18898 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
18899 int cmp_code, tree treeop0, tree treeop1, int bit_code)
18900 {
18901 rtx op0, op1, target;
18902 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18903 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18904 insn_code icode;
18905 struct expand_operand ops[6];
18906 int aarch64_cond;
18907
18908 push_to_sequence (*prep_seq);
18909 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18910
18911 op_mode = GET_MODE (op0);
18912 if (op_mode == VOIDmode)
18913 op_mode = GET_MODE (op1);
18914
18915 switch (op_mode)
18916 {
18917 case E_QImode:
18918 case E_HImode:
18919 case E_SImode:
18920 cmp_mode = SImode;
18921 icode = CODE_FOR_ccmpsi;
18922 break;
18923
18924 case E_DImode:
18925 cmp_mode = DImode;
18926 icode = CODE_FOR_ccmpdi;
18927 break;
18928
18929 case E_SFmode:
18930 cmp_mode = SFmode;
18931 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18932 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
18933 break;
18934
18935 case E_DFmode:
18936 cmp_mode = DFmode;
18937 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18938 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
18939 break;
18940
18941 default:
18942 end_sequence ();
18943 return NULL_RTX;
18944 }
18945
18946 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
18947 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
18948 if (!op0 || !op1)
18949 {
18950 end_sequence ();
18951 return NULL_RTX;
18952 }
18953 *prep_seq = get_insns ();
18954 end_sequence ();
18955
18956 target = gen_rtx_REG (cc_mode, CC_REGNUM);
18957 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
18958
18959 if (bit_code != AND)
18960 {
18961 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
18962 GET_MODE (XEXP (prev, 0))),
18963 VOIDmode, XEXP (prev, 0), const0_rtx);
18964 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
18965 }
18966
18967 create_fixed_operand (&ops[0], XEXP (prev, 0));
18968 create_fixed_operand (&ops[1], target);
18969 create_fixed_operand (&ops[2], op0);
18970 create_fixed_operand (&ops[3], op1);
18971 create_fixed_operand (&ops[4], prev);
18972 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
18973
18974 push_to_sequence (*gen_seq);
18975 if (!maybe_expand_insn (icode, 6, ops))
18976 {
18977 end_sequence ();
18978 return NULL_RTX;
18979 }
18980
18981 *gen_seq = get_insns ();
18982 end_sequence ();
18983
18984 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
18985 }
18986
18987 #undef TARGET_GEN_CCMP_FIRST
18988 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
18989
18990 #undef TARGET_GEN_CCMP_NEXT
18991 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
18992
18993 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
18994 instruction fusion of some sort. */
18995
18996 static bool
18997 aarch64_macro_fusion_p (void)
18998 {
18999 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
19000 }
19001
19002
19003 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
19004 should be kept together during scheduling. */
19005
19006 static bool
19007 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
19008 {
19009 rtx set_dest;
19010 rtx prev_set = single_set (prev);
19011 rtx curr_set = single_set (curr);
19012 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
19013 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
19014
19015 if (!aarch64_macro_fusion_p ())
19016 return false;
19017
19018 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
19019 {
19020 /* We are trying to match:
19021 prev (mov) == (set (reg r0) (const_int imm16))
19022 curr (movk) == (set (zero_extract (reg r0)
19023 (const_int 16)
19024 (const_int 16))
19025 (const_int imm16_1)) */
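/* In assembly terms this is a pair such as

mov x0, #0x1234
movk x0, #0x5678, lsl 16

which cores implementing this fusion can treat as a single
operation when the two instructions stay adjacent.  */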
19026
19027 set_dest = SET_DEST (curr_set);
19028
19029 if (GET_CODE (set_dest) == ZERO_EXTRACT
19030 && CONST_INT_P (SET_SRC (curr_set))
19031 && CONST_INT_P (SET_SRC (prev_set))
19032 && CONST_INT_P (XEXP (set_dest, 2))
19033 && INTVAL (XEXP (set_dest, 2)) == 16
19034 && REG_P (XEXP (set_dest, 0))
19035 && REG_P (SET_DEST (prev_set))
19036 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
19037 {
19038 return true;
19039 }
19040 }
19041
19042 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
19043 {
19044
19045 /* We're trying to match:
19046 prev (adrp) == (set (reg r1)
19047 (high (symbol_ref ("SYM"))))
19048 curr (add) == (set (reg r0)
19049 (lo_sum (reg r1)
19050 (symbol_ref ("SYM"))))
19051 Note that r0 need not necessarily be the same as r1, especially
19052 during pre-regalloc scheduling. */
19053
19054 if (satisfies_constraint_Ush (SET_SRC (prev_set))
19055 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
19056 {
19057 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
19058 && REG_P (XEXP (SET_SRC (curr_set), 0))
19059 && REGNO (XEXP (SET_SRC (curr_set), 0))
19060 == REGNO (SET_DEST (prev_set))
19061 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
19062 XEXP (SET_SRC (curr_set), 1)))
19063 return true;
19064 }
19065 }
19066
19067 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
19068 {
19069
19070 /* We're trying to match:
19071 prev (movk) == (set (zero_extract (reg r0)
19072 (const_int 16)
19073 (const_int 32))
19074 (const_int imm16_1))
19075 curr (movk) == (set (zero_extract (reg r0)
19076 (const_int 16)
19077 (const_int 48))
19078 (const_int imm16_2)) */
19079
19080 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
19081 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
19082 && REG_P (XEXP (SET_DEST (prev_set), 0))
19083 && REG_P (XEXP (SET_DEST (curr_set), 0))
19084 && REGNO (XEXP (SET_DEST (prev_set), 0))
19085 == REGNO (XEXP (SET_DEST (curr_set), 0))
19086 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
19087 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
19088 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
19089 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
19090 && CONST_INT_P (SET_SRC (prev_set))
19091 && CONST_INT_P (SET_SRC (curr_set)))
19092 return true;
19093
19094 }
19095 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
19096 {
19097 /* We're trying to match:
19098 prev (adrp) == (set (reg r0)
19099 (high (symbol_ref ("SYM"))))
19100 curr (ldr) == (set (reg r1)
19101 (mem (lo_sum (reg r0)
19102 (symbol_ref ("SYM")))))
19103 or
19104 curr (ldr) == (set (reg r1)
19105 (zero_extend (mem
19106 (lo_sum (reg r0)
19107 (symbol_ref ("SYM")))))) */
19108 if (satisfies_constraint_Ush (SET_SRC (prev_set))
19109 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
19110 {
19111 rtx curr_src = SET_SRC (curr_set);
19112
19113 if (GET_CODE (curr_src) == ZERO_EXTEND)
19114 curr_src = XEXP (curr_src, 0);
19115
19116 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
19117 && REG_P (XEXP (XEXP (curr_src, 0), 0))
19118 && REGNO (XEXP (XEXP (curr_src, 0), 0))
19119 == REGNO (SET_DEST (prev_set))
19120 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
19121 XEXP (SET_SRC (prev_set), 0)))
19122 return true;
19123 }
19124 }
19125
19126 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
19127 && any_condjump_p (curr))
19128 {
19129 unsigned int condreg1, condreg2;
19130 rtx cc_reg_1;
19131 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
19132 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
19133
19134 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
19135 && prev
19136 && modified_in_p (cc_reg_1, prev))
19137 {
19138 enum attr_type prev_type = get_attr_type (prev);
19139
19140 /* FIXME: this misses some instructions which ThunderX considers simple
19141 arithmetic instructions. Simple shifts are missed here. */
19142 if (prev_type == TYPE_ALUS_SREG
19143 || prev_type == TYPE_ALUS_IMM
19144 || prev_type == TYPE_LOGICS_REG
19145 || prev_type == TYPE_LOGICS_IMM)
19146 return true;
19147 }
19148 }
19149
19150 if (prev_set
19151 && curr_set
19152 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
19153 && any_condjump_p (curr))
19154 {
19155 /* We're trying to match:
19156 prev (alu_insn) == (set (r0) (plus (r0) (r1/imm)))
19157 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
19158 (const_int 0))
19159 (label_ref ("SYM"))
19160 (pc)) */
19161 if (SET_DEST (curr_set) == (pc_rtx)
19162 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
19163 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
19164 && REG_P (SET_DEST (prev_set))
19165 && REGNO (SET_DEST (prev_set))
19166 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
19167 {
19168 /* Fuse ALU operations followed by conditional branch instruction. */
19169 switch (get_attr_type (prev))
19170 {
19171 case TYPE_ALU_IMM:
19172 case TYPE_ALU_SREG:
19173 case TYPE_ADC_REG:
19174 case TYPE_ADC_IMM:
19175 case TYPE_ADCS_REG:
19176 case TYPE_ADCS_IMM:
19177 case TYPE_LOGIC_REG:
19178 case TYPE_LOGIC_IMM:
19179 case TYPE_CSEL:
19180 case TYPE_ADR:
19181 case TYPE_MOV_IMM:
19182 case TYPE_SHIFT_REG:
19183 case TYPE_SHIFT_IMM:
19184 case TYPE_BFM:
19185 case TYPE_RBIT:
19186 case TYPE_REV:
19187 case TYPE_EXTEND:
19188 return true;
19189
19190 default:;
19191 }
19192 }
19193 }
19194
19195 return false;
19196 }
19197
19198 /* Return true iff the instruction fusion described by OP is enabled. */
19199
19200 bool
19201 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
19202 {
19203 return (aarch64_tune_params.fusible_ops & op) != 0;
19204 }
19205
19206 /* If MEM is in the form of [base+offset], extract the two parts
19207 of the address into BASE and OFFSET; otherwise return false
19208 after clearing BASE and OFFSET. */
19209
19210 bool
19211 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
19212 {
19213 rtx addr;
19214
19215 gcc_assert (MEM_P (mem));
19216
19217 addr = XEXP (mem, 0);
19218
19219 if (REG_P (addr))
19220 {
19221 *base = addr;
19222 *offset = const0_rtx;
19223 return true;
19224 }
19225
19226 if (GET_CODE (addr) == PLUS
19227 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
19228 {
19229 *base = XEXP (addr, 0);
19230 *offset = XEXP (addr, 1);
19231 return true;
19232 }
19233
19234 *base = NULL_RTX;
19235 *offset = NULL_RTX;
19236
19237 return false;
19238 }
19239
19240 /* Types for scheduling fusion. */
19241 enum sched_fusion_type
19242 {
19243 SCHED_FUSION_NONE = 0,
19244 SCHED_FUSION_LD_SIGN_EXTEND,
19245 SCHED_FUSION_LD_ZERO_EXTEND,
19246 SCHED_FUSION_LD,
19247 SCHED_FUSION_ST,
19248 SCHED_FUSION_NUM
19249 };
19250
19251 /* If INSN is a load or store with an address in the form of [base+offset],
19252 extract the two parts into BASE and OFFSET. Return the scheduling
19253 fusion type of this INSN. */
19254
19255 static enum sched_fusion_type
19256 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
19257 {
19258 rtx x, dest, src;
19259 enum sched_fusion_type fusion = SCHED_FUSION_LD;
19260
19261 gcc_assert (INSN_P (insn));
19262 x = PATTERN (insn);
19263 if (GET_CODE (x) != SET)
19264 return SCHED_FUSION_NONE;
19265
19266 src = SET_SRC (x);
19267 dest = SET_DEST (x);
19268
19269 machine_mode dest_mode = GET_MODE (dest);
19270
19271 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
19272 return SCHED_FUSION_NONE;
19273
19274 if (GET_CODE (src) == SIGN_EXTEND)
19275 {
19276 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
19277 src = XEXP (src, 0);
19278 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
19279 return SCHED_FUSION_NONE;
19280 }
19281 else if (GET_CODE (src) == ZERO_EXTEND)
19282 {
19283 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
19284 src = XEXP (src, 0);
19285 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
19286 return SCHED_FUSION_NONE;
19287 }
19288
19289 if (GET_CODE (src) == MEM && REG_P (dest))
19290 extract_base_offset_in_addr (src, base, offset);
19291 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
19292 {
19293 fusion = SCHED_FUSION_ST;
19294 extract_base_offset_in_addr (dest, base, offset);
19295 }
19296 else
19297 return SCHED_FUSION_NONE;
19298
19299 if (*base == NULL_RTX || *offset == NULL_RTX)
19300 fusion = SCHED_FUSION_NONE;
19301
19302 return fusion;
19303 }
19304
19305 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
19306
19307 Currently we only support fusing ldr and str instructions, so FUSION_PRI
19308 and PRI are only calculated for these instructions. For other instructions,
19309 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
19310 types of instruction fusion can be added by returning different priorities.
19311
19312 It's important that irrelevant instructions get the largest FUSION_PRI. */
19313
19314 static void
19315 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
19316 int *fusion_pri, int *pri)
19317 {
19318 int tmp, off_val;
19319 rtx base, offset;
19320 enum sched_fusion_type fusion;
19321
19322 gcc_assert (INSN_P (insn));
19323
19324 tmp = max_pri - 1;
19325 fusion = fusion_load_store (insn, &base, &offset);
19326 if (fusion == SCHED_FUSION_NONE)
19327 {
19328 *pri = tmp;
19329 *fusion_pri = tmp;
19330 return;
19331 }
19332
19333 /* Set FUSION_PRI according to fusion type and base register. */
19334 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
19335
19336 /* Calculate PRI. */
19337 tmp /= 2;
19338
19339 /* The INSN with the smaller offset goes first. */
19340 off_val = (int)(INTVAL (offset));
19341 if (off_val >= 0)
19342 tmp -= (off_val & 0xfffff);
19343 else
19344 tmp += ((- off_val) & 0xfffff);
19345
19346 *pri = tmp;
19347 return;
19348 }
19349
19350 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
19351 Adjust priority of sha1h instructions so they are scheduled before
19352 other SHA1 instructions. */
19353
19354 static int
19355 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
19356 {
19357 rtx x = PATTERN (insn);
19358
19359 if (GET_CODE (x) == SET)
19360 {
19361 x = SET_SRC (x);
19362
19363 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
19364 return priority + 10;
19365 }
19366
19367 return priority;
19368 }
19369
19370 /* Given OPERANDS of consecutive load/store, check if we can merge
19371 them into ldp/stp. LOAD is true if they are load instructions.
19372 MODE is the mode of the memory operands. */
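/* For instance (a sketch, assuming X-register loads from the same base):
   ldr x0, [x2, 8] followed by ldr x1, [x2, 16] passes these checks and can
   later be emitted as ldp x0, x1, [x2, 8]; the checks below reject cases
   such as identical destination registers, volatile accesses, mismatched
   bases or non-consecutive offsets.  */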
19373
19374 bool
19375 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
19376 machine_mode mode)
19377 {
19378 HOST_WIDE_INT offval_1, offval_2, msize;
19379 enum reg_class rclass_1, rclass_2;
19380 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
19381
19382 if (load)
19383 {
19384 mem_1 = operands[1];
19385 mem_2 = operands[3];
19386 reg_1 = operands[0];
19387 reg_2 = operands[2];
19388 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
19389 if (REGNO (reg_1) == REGNO (reg_2))
19390 return false;
19391 }
19392 else
19393 {
19394 mem_1 = operands[0];
19395 mem_2 = operands[2];
19396 reg_1 = operands[1];
19397 reg_2 = operands[3];
19398 }
19399
19400 /* The mems cannot be volatile. */
19401 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
19402 return false;
19403
19404 /* If we have SImode and slow unaligned ldp,
19405 check that the alignment is at least 8 bytes. */
19406 if (mode == SImode
19407 && (aarch64_tune_params.extra_tuning_flags
19408 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19409 && !optimize_size
19410 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
19411 return false;
19412
19413 /* Check if the addresses are in the form of [base+offset]. */
19414 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19415 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
19416 return false;
19417 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19418 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
19419 return false;
19420
19421 /* Check if the bases are the same. */
19422 if (!rtx_equal_p (base_1, base_2))
19423 return false;
19424
19425 /* The operands must be of the same size. */
19426 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
19427 GET_MODE_SIZE (GET_MODE (mem_2))));
19428
19429 offval_1 = INTVAL (offset_1);
19430 offval_2 = INTVAL (offset_2);
19431 /* We should only be trying this for fixed-sized modes. There is no
19432 SVE LDP/STP instruction. */
19433 msize = GET_MODE_SIZE (mode).to_constant ();
19434 /* Check if the offsets are consecutive. */
19435 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
19436 return false;
19437
19438 /* Check if the addresses are clobbered by load. */
19439 if (load)
19440 {
19441 if (reg_mentioned_p (reg_1, mem_1))
19442 return false;
19443
19444 /* In increasing order, the last load can clobber the address. */
19445 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
19446 return false;
19447 }
19448
19449 /* One of the memory accesses must be a mempair operand.
19450 If it is not the first one, they need to be swapped by the
19451 peephole. */
19452 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
19453 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
19454 return false;
19455
19456 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
19457 rclass_1 = FP_REGS;
19458 else
19459 rclass_1 = GENERAL_REGS;
19460
19461 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
19462 rclass_2 = FP_REGS;
19463 else
19464 rclass_2 = GENERAL_REGS;
19465
19466 /* Check if the registers are of the same class. */
19467 if (rclass_1 != rclass_2)
19468 return false;
19469
19470 return true;
19471 }
19472
19473 /* Given OPERANDS of consecutive load/store that can be merged,
19474 swap them if they are not in ascending order. */
19475 void
19476 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
19477 {
19478 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
19479 HOST_WIDE_INT offval_1, offval_2;
19480
19481 if (load)
19482 {
19483 mem_1 = operands[1];
19484 mem_2 = operands[3];
19485 }
19486 else
19487 {
19488 mem_1 = operands[0];
19489 mem_2 = operands[2];
19490 }
19491
19492 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19493 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19494
19495 offval_1 = INTVAL (offset_1);
19496 offval_2 = INTVAL (offset_2);
19497
19498 if (offval_1 > offval_2)
19499 {
19500 /* Irrespective of whether this is a load or a store,
19501 we do the same swap. */
19502 std::swap (operands[0], operands[2]);
19503 std::swap (operands[1], operands[3]);
19504 }
19505 }
19506
19507 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
19508 comparison between the two. */
19509 int
19510 aarch64_host_wide_int_compare (const void *x, const void *y)
19511 {
19512 return wi::cmps (* ((const HOST_WIDE_INT *) x),
19513 * ((const HOST_WIDE_INT *) y));
19514 }
19515
19516 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
19517 other pointing to a REG rtx containing an offset, compare the offsets
19518 of the two pairs.
19519
19520 Return:
19521
19522 1 iff offset (X) > offset (Y)
19523 0 iff offset (X) == offset (Y)
19524 -1 iff offset (X) < offset (Y) */
19525 int
19526 aarch64_ldrstr_offset_compare (const void *x, const void *y)
19527 {
19528 const rtx * operands_1 = (const rtx *) x;
19529 const rtx * operands_2 = (const rtx *) y;
19530 rtx mem_1, mem_2, base, offset_1, offset_2;
19531
19532 if (MEM_P (operands_1[0]))
19533 mem_1 = operands_1[0];
19534 else
19535 mem_1 = operands_1[1];
19536
19537 if (MEM_P (operands_2[0]))
19538 mem_2 = operands_2[0];
19539 else
19540 mem_2 = operands_2[1];
19541
19542 /* Extract the offsets. */
19543 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19544 extract_base_offset_in_addr (mem_2, &base, &offset_2);
19545
19546 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
19547
19548 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
19549 }
19550
19551 /* Given OPERANDS of consecutive load/store, check if we can merge
19552 them into ldp/stp by adjusting the offset. LOAD is true if they
19553 are load instructions. MODE is the mode of the memory operands.
19554
19555 Given the consecutive stores below:
19556
19557 str w1, [xb, 0x100]
19558 str w1, [xb, 0x104]
19559 str w1, [xb, 0x108]
19560 str w1, [xb, 0x10c]
19561
19562 Though the offsets are out of the range supported by stp, we can
19563 still pair them after adjusting the offset, like:
19564
19565 add scratch, xb, 0x100
19566 stp w1, w1, [scratch]
19567 stp w1, w1, [scratch, 0x8]
19568
19569 The peephole patterns detecting this opportunity should guarantee
19570 that the scratch register is available. */
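/* The range checks below reflect the LDP/STP immediate field: a signed
   7-bit value scaled by the access size, i.e. roughly offsets in
   [-64 * msize, 63 * msize] around the (possibly adjusted) base.  */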
19571
19572 bool
19573 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
19574 scalar_mode mode)
19575 {
19576 const int num_insns = 4;
19577 enum reg_class rclass;
19578 HOST_WIDE_INT offvals[num_insns], msize;
19579 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
19580
19581 if (load)
19582 {
19583 for (int i = 0; i < num_insns; i++)
19584 {
19585 reg[i] = operands[2 * i];
19586 mem[i] = operands[2 * i + 1];
19587
19588 gcc_assert (REG_P (reg[i]));
19589 }
19590
19591 /* Do not attempt to merge the loads if the loads clobber each other. */
19592 for (int i = 0; i < 8; i += 2)
19593 for (int j = i + 2; j < 8; j += 2)
19594 if (reg_overlap_mentioned_p (operands[i], operands[j]))
19595 return false;
19596 }
19597 else
19598 for (int i = 0; i < num_insns; i++)
19599 {
19600 mem[i] = operands[2 * i];
19601 reg[i] = operands[2 * i + 1];
19602 }
19603
19604 /* Skip if memory operand is by itself valid for ldp/stp. */
19605 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
19606 return false;
19607
19608 for (int i = 0; i < num_insns; i++)
19609 {
19610 /* The mems cannot be volatile. */
19611 if (MEM_VOLATILE_P (mem[i]))
19612 return false;
19613
19614 /* Check if the addresses are in the form of [base+offset]. */
19615 extract_base_offset_in_addr (mem[i], base + i, offset + i);
19616 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
19617 return false;
19618 }
19619
19620 /* Check if the registers are of the same class. */
19621 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
19622 ? FP_REGS : GENERAL_REGS;
19623
19624 for (int i = 1; i < num_insns; i++)
19625 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
19626 {
19627 if (rclass != FP_REGS)
19628 return false;
19629 }
19630 else
19631 {
19632 if (rclass != GENERAL_REGS)
19633 return false;
19634 }
19635
19636 /* Only the last register in the order in which they occur
19637 may be clobbered by the load. */
19638 if (rclass == GENERAL_REGS && load)
19639 for (int i = 0; i < num_insns - 1; i++)
19640 if (reg_mentioned_p (reg[i], mem[i]))
19641 return false;
19642
19643 /* Check if the bases are the same. */
19644 for (int i = 0; i < num_insns - 1; i++)
19645 if (!rtx_equal_p (base[i], base[i + 1]))
19646 return false;
19647
19648 for (int i = 0; i < num_insns; i++)
19649 offvals[i] = INTVAL (offset[i]);
19650
19651 msize = GET_MODE_SIZE (mode);
19652
19653 /* Check if the offsets can be put in the right order to do a ldp/stp. */
19654 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
19655 aarch64_host_wide_int_compare);
19656
19657 if (!(offvals[1] == offvals[0] + msize
19658 && offvals[3] == offvals[2] + msize))
19659 return false;
19660
19661 /* Check that offsets are within range of each other. The ldp/stp
19662 instructions have 7 bit immediate offsets, so use 0x80. */
19663 if (offvals[2] - offvals[0] >= msize * 0x80)
19664 return false;
19665
19666 /* The offsets must be aligned with respect to each other. */
19667 if (offvals[0] % msize != offvals[2] % msize)
19668 return false;
19669
19670 /* If we have SImode and slow unaligned ldp,
19671 check that the alignment is at least 8 bytes. */
19672 if (mode == SImode
19673 && (aarch64_tune_params.extra_tuning_flags
19674 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19675 && !optimize_size
19676 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
19677 return false;
19678
19679 return true;
19680 }
19681
19682 /* Given OPERANDS of consecutive load/store, this function pairs them
19683 into LDP/STP after adjusting the offset. It depends on the fact
19684 that the operands can be sorted so the offsets are correct for STP.
19685 MODE is the mode of the memory operands. CODE is the rtl operator
19686 that should be applied to all memory operands; it is SIGN_EXTEND,
19687 ZERO_EXTEND or UNKNOWN. */
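/* A rough trace under the str w1, [xb, 0x100] ... example above
   (hypothetical offsets): for SImode stores at xb+0x100 .. xb+0x10c,
   msize is 4, so base_off starts at (0x100 + 0x108) / 2 = 0x104 and is
   then bumped by msize to 0x108; the two STPs end up at offsets -8 and 0
   from the new base, well inside the [-256, 252] range for 4-byte
   accesses.  */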
19688
19689 bool
19690 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
19691 scalar_mode mode, RTX_CODE code)
19692 {
19693 rtx base, offset_1, offset_3, t1, t2;
19694 rtx mem_1, mem_2, mem_3, mem_4;
19695 rtx temp_operands[8];
19696 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
19697 stp_off_upper_limit, stp_off_lower_limit, msize;
19698
19699 /* We make changes on a copy as we may still bail out. */
19700 for (int i = 0; i < 8; i ++)
19701 temp_operands[i] = operands[i];
19702
19703 /* Sort the operands. */
19704 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
19705
19706 /* Copy the memory operands so that if we have to bail for some
19707 reason the original addresses are unchanged. */
19708 if (load)
19709 {
19710 mem_1 = copy_rtx (temp_operands[1]);
19711 mem_2 = copy_rtx (temp_operands[3]);
19712 mem_3 = copy_rtx (temp_operands[5]);
19713 mem_4 = copy_rtx (temp_operands[7]);
19714 }
19715 else
19716 {
19717 mem_1 = copy_rtx (temp_operands[0]);
19718 mem_2 = copy_rtx (temp_operands[2]);
19719 mem_3 = copy_rtx (temp_operands[4]);
19720 mem_4 = copy_rtx (temp_operands[6]);
19721 gcc_assert (code == UNKNOWN);
19722 }
19723
19724 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19725 extract_base_offset_in_addr (mem_3, &base, &offset_3);
19726 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
19727 && offset_3 != NULL_RTX);
19728
19729 /* Adjust offset so it can fit in LDP/STP instruction. */
19730 msize = GET_MODE_SIZE (mode);
19731 stp_off_upper_limit = msize * (0x40 - 1);
19732 stp_off_lower_limit = - msize * 0x40;
19733
19734 off_val_1 = INTVAL (offset_1);
19735 off_val_3 = INTVAL (offset_3);
19736
19737 /* The base offset is optimally halfway between the two STP/LDP offsets. */
19738 if (msize <= 4)
19739 base_off = (off_val_1 + off_val_3) / 2;
19740 else
19741 /* However, due to issues with negative LDP/STP offset generation for
19742 larger modes (DF, DI and vector modes), we must not use negative
19743 addresses smaller than what 9 signed unadjusted bits can store. This
19744 provides the most range in this case. */
19745 base_off = off_val_1;
19746
19747 /* Adjust the base so that it is aligned with the addresses but still
19748 optimal. */
19749 if (base_off % msize != off_val_1 % msize)
19750 /* Fix the offset, bearing in mind we want to make it bigger not
19751 smaller. */
19752 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19753 else if (msize <= 4)
19754 /* The negative range of LDP/STP is one larger than the positive range. */
19755 base_off += msize;
19756
19757 /* Check if base offset is too big or too small. We can attempt to resolve
19758 this issue by setting it to the maximum value and seeing if the offsets
19759 still fit. */
19760 if (base_off >= 0x1000)
19761 {
19762 base_off = 0x1000 - 1;
19763 /* We must still make sure that the base offset is aligned with respect
19764 to the address. But it may not be made any bigger. */
19765 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19766 }
19767
19768 /* Likewise for the case where the base is too small. */
19769 if (base_off <= -0x1000)
19770 {
19771 base_off = -0x1000 + 1;
19772 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19773 }
19774
19775 /* Offset of the first STP/LDP. */
19776 new_off_1 = off_val_1 - base_off;
19777
19778 /* Offset of the second STP/LDP. */
19779 new_off_3 = off_val_3 - base_off;
19780
19781 /* The offsets must be within the range of the LDP/STP instructions. */
19782 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
19783 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
19784 return false;
19785
19786 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
19787 new_off_1), true);
19788 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
19789 new_off_1 + msize), true);
19790 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
19791 new_off_3), true);
19792 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
19793 new_off_3 + msize), true);
19794
19795 if (!aarch64_mem_pair_operand (mem_1, mode)
19796 || !aarch64_mem_pair_operand (mem_3, mode))
19797 return false;
19798
19799 if (code == ZERO_EXTEND)
19800 {
19801 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
19802 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
19803 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
19804 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
19805 }
19806 else if (code == SIGN_EXTEND)
19807 {
19808 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
19809 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
19810 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
19811 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
19812 }
19813
19814 if (load)
19815 {
19816 operands[0] = temp_operands[0];
19817 operands[1] = mem_1;
19818 operands[2] = temp_operands[2];
19819 operands[3] = mem_2;
19820 operands[4] = temp_operands[4];
19821 operands[5] = mem_3;
19822 operands[6] = temp_operands[6];
19823 operands[7] = mem_4;
19824 }
19825 else
19826 {
19827 operands[0] = mem_1;
19828 operands[1] = temp_operands[1];
19829 operands[2] = mem_2;
19830 operands[3] = temp_operands[3];
19831 operands[4] = mem_3;
19832 operands[5] = temp_operands[5];
19833 operands[6] = mem_4;
19834 operands[7] = temp_operands[7];
19835 }
19836
19837 /* Emit adjusting instruction. */
19838 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
19839 /* Emit ldp/stp instructions. */
19840 t1 = gen_rtx_SET (operands[0], operands[1]);
19841 t2 = gen_rtx_SET (operands[2], operands[3]);
19842 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19843 t1 = gen_rtx_SET (operands[4], operands[5]);
19844 t2 = gen_rtx_SET (operands[6], operands[7]);
19845 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19846 return true;
19847 }
19848
19849 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
19850 it isn't worth branching around empty masked ops (including masked
19851 stores). */
19852
19853 static bool
19854 aarch64_empty_mask_is_expensive (unsigned)
19855 {
19856 return false;
19857 }
19858
19859 /* Return true if a pseudo register should be created and used to hold
19860 the GOT address for PIC code. */
19861
19862 bool
19863 aarch64_use_pseudo_pic_reg (void)
19864 {
19865 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
19866 }
19867
19868 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
19869
19870 static int
19871 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
19872 {
19873 switch (XINT (x, 1))
19874 {
19875 case UNSPEC_GOTSMALLPIC:
19876 case UNSPEC_GOTSMALLPIC28K:
19877 case UNSPEC_GOTTINYPIC:
19878 return 0;
19879 default:
19880 break;
19881 }
19882
19883 return default_unspec_may_trap_p (x, flags);
19884 }
19885
19886
19887 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
19888 return the log2 of that value. Otherwise return -1. */
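/* For example: 8.0 yields 3 and 1.0 yields 0, while 0.5, 3.0, or any
   negative, NaN or infinite value yields -1.  */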
19889
19890 int
19891 aarch64_fpconst_pow_of_2 (rtx x)
19892 {
19893 const REAL_VALUE_TYPE *r;
19894
19895 if (!CONST_DOUBLE_P (x))
19896 return -1;
19897
19898 r = CONST_DOUBLE_REAL_VALUE (x);
19899
19900 if (REAL_VALUE_NEGATIVE (*r)
19901 || REAL_VALUE_ISNAN (*r)
19902 || REAL_VALUE_ISINF (*r)
19903 || !real_isinteger (r, DFmode))
19904 return -1;
19905
19906 return exact_log2 (real_to_integer (r));
19907 }
19908
19909 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
19910 power of 2 (i.e. 1/2^n), return the exponent n. e.g. for x == (1/2^n)
19911 return n. Otherwise return -1. */
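/* For example: 0.125 == 1/2^3 yields 3; values whose exponent falls outside
   the [1, 32] range checked below, or which have no exact inverse,
   yield -1.  */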
19912
19913 int
19914 aarch64_fpconst_pow2_recip (rtx x)
19915 {
19916 REAL_VALUE_TYPE r0;
19917
19918 if (!CONST_DOUBLE_P (x))
19919 return -1;
19920
19921 r0 = *CONST_DOUBLE_REAL_VALUE (x);
19922 if (exact_real_inverse (DFmode, &r0)
19923 && !REAL_VALUE_NEGATIVE (r0))
19924 {
19925 int ret = exact_log2 (real_to_integer (&r0));
19926 if (ret >= 1 && ret <= 32)
19927 return ret;
19928 }
19929 return -1;
19930 }
19931
19932 /* If X is a vector of equal CONST_DOUBLE values and that value is
19933 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
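/* For example, a V4SF vector of four 4.0 values yields 2; note that a
   vector of 1.0s yields -1, since the first element's log2 of 0 fails the
   > 0 check below.  */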
19934
19935 int
19936 aarch64_vec_fpconst_pow_of_2 (rtx x)
19937 {
19938 int nelts;
19939 if (GET_CODE (x) != CONST_VECTOR
19940 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
19941 return -1;
19942
19943 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
19944 return -1;
19945
19946 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
19947 if (firstval <= 0)
19948 return -1;
19949
19950 for (int i = 1; i < nelts; i++)
19951 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
19952 return -1;
19953
19954 return firstval;
19955 }
19956
19957 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
19958 to float.
19959
19960 __fp16 always promotes through this hook.
19961 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
19962 through the generic excess precision logic rather than here. */
19963
19964 static tree
19965 aarch64_promoted_type (const_tree t)
19966 {
19967 if (SCALAR_FLOAT_TYPE_P (t)
19968 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
19969 return float_type_node;
19970
19971 return NULL_TREE;
19972 }
19973
19974 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
19975
19976 static bool
19977 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
19978 optimization_type opt_type)
19979 {
19980 switch (op)
19981 {
19982 case rsqrt_optab:
19983 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
19984
19985 default:
19986 return true;
19987 }
19988 }
19989
19990 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
19991
19992 static unsigned int
19993 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
19994 int *offset)
19995 {
19996 /* Polynomial invariant 1 == (VG / 2) - 1. */
19997 gcc_assert (i == 1);
19998 *factor = 2;
19999 *offset = 1;
20000 return AARCH64_DWARF_VG;
20001 }
20002
20003 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
20004 if MODE is HFmode, and punt to the generic implementation otherwise. */
20005
20006 static bool
20007 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
20008 {
20009 return (mode == HFmode
20010 ? true
20011 : default_libgcc_floating_mode_supported_p (mode));
20012 }
20013
20014 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
20015 if MODE is HFmode, and punt to the generic implementation otherwise. */
20016
20017 static bool
20018 aarch64_scalar_mode_supported_p (scalar_mode mode)
20019 {
20020 return (mode == HFmode
20021 ? true
20022 : default_scalar_mode_supported_p (mode));
20023 }
20024
20025 /* Set the value of FLT_EVAL_METHOD.
20026 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
20027
20028 0: evaluate all operations and constants, whose semantic type has at
20029 most the range and precision of type float, to the range and
20030 precision of float; evaluate all other operations and constants to
20031 the range and precision of the semantic type;
20032
20033 N, where _FloatN is a supported interchange floating type:
20034 evaluate all operations and constants, whose semantic type has at
20035 most the range and precision of _FloatN type, to the range and
20036 precision of the _FloatN type; evaluate all other operations and
20037 constants to the range and precision of the semantic type;
20038
20039 If we have the ARMv8.2-A extensions then we support _Float16 in native
20040 precision, so we should set this to 16. Otherwise, we support the type,
20041 but want to evaluate expressions in float precision, so set this to
20042 0. */
20043
20044 static enum flt_eval_method
20045 aarch64_excess_precision (enum excess_precision_type type)
20046 {
20047 switch (type)
20048 {
20049 case EXCESS_PRECISION_TYPE_FAST:
20050 case EXCESS_PRECISION_TYPE_STANDARD:
20051 /* We can calculate either in 16-bit range and precision or
20052 32-bit range and precision. Make that decision based on whether
20053 we have native support for the ARMv8.2-A 16-bit floating-point
20054 instructions or not. */
20055 return (TARGET_FP_F16INST
20056 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
20057 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
20058 case EXCESS_PRECISION_TYPE_IMPLICIT:
20059 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
20060 default:
20061 gcc_unreachable ();
20062 }
20063 return FLT_EVAL_METHOD_UNPREDICTABLE;
20064 }
20065
20066 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
20067 scheduled for speculative execution. Reject the long-running division
20068 and square-root instructions. */
20069
20070 static bool
20071 aarch64_sched_can_speculate_insn (rtx_insn *insn)
20072 {
20073 switch (get_attr_type (insn))
20074 {
20075 case TYPE_SDIV:
20076 case TYPE_UDIV:
20077 case TYPE_FDIVS:
20078 case TYPE_FDIVD:
20079 case TYPE_FSQRTS:
20080 case TYPE_FSQRTD:
20081 case TYPE_NEON_FP_SQRT_S:
20082 case TYPE_NEON_FP_SQRT_D:
20083 case TYPE_NEON_FP_SQRT_S_Q:
20084 case TYPE_NEON_FP_SQRT_D_Q:
20085 case TYPE_NEON_FP_DIV_S:
20086 case TYPE_NEON_FP_DIV_D:
20087 case TYPE_NEON_FP_DIV_S_Q:
20088 case TYPE_NEON_FP_DIV_D_Q:
20089 return false;
20090 default:
20091 return true;
20092 }
20093 }
20094
20095 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
20096
20097 static int
20098 aarch64_compute_pressure_classes (reg_class *classes)
20099 {
20100 int i = 0;
20101 classes[i++] = GENERAL_REGS;
20102 classes[i++] = FP_REGS;
20103 /* PR_REGS isn't a useful pressure class because many predicate pseudo
20104 registers need to go in PR_LO_REGS at some point during their
20105 lifetime. Splitting it into two halves has the effect of making
20106 all predicates count against PR_LO_REGS, so that we try whenever
20107 possible to restrict the number of live predicates to 8. This
20108 greatly reduces the amount of spilling in certain loops. */
20109 classes[i++] = PR_LO_REGS;
20110 classes[i++] = PR_HI_REGS;
20111 return i;
20112 }
20113
20114 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
20115
20116 static bool
20117 aarch64_can_change_mode_class (machine_mode from,
20118 machine_mode to, reg_class_t)
20119 {
20120 if (BYTES_BIG_ENDIAN)
20121 {
20122 bool from_sve_p = aarch64_sve_data_mode_p (from);
20123 bool to_sve_p = aarch64_sve_data_mode_p (to);
20124
20125 /* Don't allow changes between SVE data modes and non-SVE modes.
20126 See the comment at the head of aarch64-sve.md for details. */
20127 if (from_sve_p != to_sve_p)
20128 return false;
20129
20130 /* Don't allow changes in element size: lane 0 of the new vector
20131 would not then be lane 0 of the old vector. See the comment
20132 above aarch64_maybe_expand_sve_subreg_move for a more detailed
20133 description.
20134
20135 In the worst case, this forces a register to be spilled in
20136 one mode and reloaded in the other, which handles the
20137 endianness correctly. */
20138 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
20139 return false;
20140 }
20141 return true;
20142 }
20143
20144 /* Implement TARGET_EARLY_REMAT_MODES. */
20145
20146 static void
20147 aarch64_select_early_remat_modes (sbitmap modes)
20148 {
20149 /* SVE values are not normally live across a call, so it should be
20150 worth doing early rematerialization even in VL-specific mode. */
20151 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
20152 if (aarch64_sve_mode_p ((machine_mode) i))
20153 bitmap_set_bit (modes, i);
20154 }
20155
20156 /* Override the default target speculation_safe_value. */
20157 static rtx
20158 aarch64_speculation_safe_value (machine_mode mode,
20159 rtx result, rtx val, rtx failval)
20160 {
20161 /* Maybe we should warn if falling back to hard barriers. They are
20162 likely to be noticeably more expensive than the alternative below. */
20163 if (!aarch64_track_speculation)
20164 return default_speculation_safe_value (mode, result, val, failval);
20165
20166 if (!REG_P (val))
20167 val = copy_to_mode_reg (mode, val);
20168
20169 if (!aarch64_reg_or_zero (failval, mode))
20170 failval = copy_to_mode_reg (mode, failval);
20171
20172 emit_insn (gen_despeculate_copy (mode, result, val, failval));
20173 return result;
20174 }
20175
20176 /* Implement TARGET_ESTIMATED_POLY_VALUE.
20177 Look into the tuning structure for an estimate.
20178 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
20179 Advanced SIMD 128 bits. */
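/* A small worked example (hypothetical tuning): if the tuning structure
   says sve_width == 256, then over_128 == 128 and a poly_int64 of 2 + 2x
   (coeffs {2, 2}) is estimated as 2 + 2 * 128 / 128 == 4.  */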
20180
20181 static HOST_WIDE_INT
20182 aarch64_estimated_poly_value (poly_int64 val)
20183 {
20184 enum aarch64_sve_vector_bits_enum width_source
20185 = aarch64_tune_params.sve_width;
20186
20187 /* If we still don't have an estimate, use the default. */
20188 if (width_source == SVE_SCALABLE)
20189 return default_estimated_poly_value (val);
20190
20191 HOST_WIDE_INT over_128 = width_source - 128;
20192 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
20193 }
20194
20195
20196 /* Return true for types that could be supported as SIMD return or
20197 argument types. */
20198
20199 static bool
20200 supported_simd_type (tree t)
20201 {
20202 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
20203 {
20204 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
20205 return s == 1 || s == 2 || s == 4 || s == 8;
20206 }
20207 return false;
20208 }
20209
20210 /* Return true for types that currently are supported as SIMD return
20211 or argument types. */
20212
20213 static bool
20214 currently_supported_simd_type (tree t, tree b)
20215 {
20216 if (COMPLEX_FLOAT_TYPE_P (t))
20217 return false;
20218
20219 if (TYPE_SIZE (t) != TYPE_SIZE (b))
20220 return false;
20221
20222 return supported_simd_type (t);
20223 }
20224
20225 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
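/* For instance (a sketch): with a 32-bit base type and no explicit simdlen,
   the code below returns a count of 2, producing one clone with simdlen 2
   (64-bit vectors) and one with simdlen 4 (128-bit vectors).  */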
20226
20227 static int
20228 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
20229 struct cgraph_simd_clone *clonei,
20230 tree base_type, int num)
20231 {
20232 tree t, ret_type, arg_type;
20233 unsigned int elt_bits, vec_bits, count;
20234
20235 if (!TARGET_SIMD)
20236 return 0;
20237
20238 if (clonei->simdlen
20239 && (clonei->simdlen < 2
20240 || clonei->simdlen > 1024
20241 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
20242 {
20243 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20244 "unsupported simdlen %d", clonei->simdlen);
20245 return 0;
20246 }
20247
20248 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
20249 if (TREE_CODE (ret_type) != VOID_TYPE
20250 && !currently_supported_simd_type (ret_type, base_type))
20251 {
20252 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
20253 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20254 "GCC does not currently support mixed size types "
20255 "for %<simd%> functions");
20256 else if (supported_simd_type (ret_type))
20257 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20258 "GCC does not currently support return type %qT "
20259 "for %<simd%> functions", ret_type);
20260 else
20261 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20262 "unsupported return type %qT for %<simd%> functions",
20263 ret_type);
20264 return 0;
20265 }
20266
20267 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
20268 {
20269 arg_type = TREE_TYPE (t);
20270
20271 if (!currently_supported_simd_type (arg_type, base_type))
20272 {
20273 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
20274 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20275 "GCC does not currently support mixed size types "
20276 "for %<simd%> functions");
20277 else
20278 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20279 "GCC does not currently support argument type %qT "
20280 "for %<simd%> functions", arg_type);
20281 return 0;
20282 }
20283 }
20284
20285 clonei->vecsize_mangle = 'n';
20286 clonei->mask_mode = VOIDmode;
20287 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
20288 if (clonei->simdlen == 0)
20289 {
20290 count = 2;
20291 vec_bits = (num == 0 ? 64 : 128);
20292 clonei->simdlen = vec_bits / elt_bits;
20293 }
20294 else
20295 {
20296 count = 1;
20297 vec_bits = clonei->simdlen * elt_bits;
20298 if (vec_bits != 64 && vec_bits != 128)
20299 {
20300 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20301 "GCC does not currently support simdlen %d for type %qT",
20302 clonei->simdlen, base_type);
20303 return 0;
20304 }
20305 }
20306 clonei->vecsize_int = vec_bits;
20307 clonei->vecsize_float = vec_bits;
20308 return count;
20309 }
20310
20311 /* Implement TARGET_SIMD_CLONE_ADJUST. */
20312
20313 static void
20314 aarch64_simd_clone_adjust (struct cgraph_node *node)
20315 {
20316 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
20317 use the correct ABI. */
20318
20319 tree t = TREE_TYPE (node->decl);
20320 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
20321 TYPE_ATTRIBUTES (t));
20322 }
20323
20324 /* Implement TARGET_SIMD_CLONE_USABLE. */
20325
20326 static int
20327 aarch64_simd_clone_usable (struct cgraph_node *node)
20328 {
20329 switch (node->simdclone->vecsize_mangle)
20330 {
20331 case 'n':
20332 if (!TARGET_SIMD)
20333 return -1;
20334 return 0;
20335 default:
20336 gcc_unreachable ();
20337 }
20338 }
20339
20340 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
20341
20342 static int
20343 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
20344 {
20345 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
20346 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
20347 return 0;
20348 return 1;
20349 }
20350
20351 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
20352
20353 static const char *
20354 aarch64_get_multilib_abi_name (void)
20355 {
20356 if (TARGET_BIG_END)
20357 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
20358 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
20359 }
20360
20361 /* Implement TARGET_STACK_PROTECT_GUARD. In the case of a
20362 global-variable-based guard, use the default; otherwise
20363 return a null tree. */
20364 static tree
20365 aarch64_stack_protect_guard (void)
20366 {
20367 if (aarch64_stack_protector_guard == SSP_GLOBAL)
20368 return default_stack_protect_guard ();
20369
20370 return NULL_TREE;
20371 }
20372
20373 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
20374 section at the end if needed. */
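/* Roughly, for an LP64 target with only BTI enabled, the emitted note looks
   like the following (a sketch; exact directives depend on the assembler):
     .section .note.gnu.property
     .align 3
     .word 4          // namesz
     .word 16         // descsz
     .word 5          // NT_GNU_PROPERTY_TYPE_0
     .string "GNU"
     .word 0xc0000000 // GNU_PROPERTY_AARCH64_FEATURE_1_AND
     .word 4          // datasz
     .word 1          // GNU_PROPERTY_AARCH64_FEATURE_1_BTI
     .align 3  */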
20375 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
20376 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
20377 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
20378 void
20379 aarch64_file_end_indicate_exec_stack ()
20380 {
20381 file_end_indicate_exec_stack ();
20382
20383 unsigned feature_1_and = 0;
20384 if (aarch64_bti_enabled ())
20385 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
20386
20387 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
20388 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
20389
20390 if (feature_1_and)
20391 {
20392 /* Generate .note.gnu.property section. */
20393 switch_to_section (get_section (".note.gnu.property",
20394 SECTION_NOTYPE, NULL));
20395
20396 /* PT_NOTE header: namesz, descsz, type.
20397 namesz = 4 ("GNU\0")
20398 descsz = 16 (Size of the program property array)
20399 [(12 + padding) * Number of array elements]
20400 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
20401 assemble_align (POINTER_SIZE);
20402 assemble_integer (GEN_INT (4), 4, 32, 1);
20403 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
20404 assemble_integer (GEN_INT (5), 4, 32, 1);
20405
20406 /* PT_NOTE name. */
20407 assemble_string ("GNU", 4);
20408
20409 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
20410 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
20411 datasz = 4
20412 data = feature_1_and. */
20413 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
20414 assemble_integer (GEN_INT (4), 4, 32, 1);
20415 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
20416
20417 /* Pad the size of the note to the required alignment. */
20418 assemble_align (POINTER_SIZE);
20419 }
20420 }
20421 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
20422 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
20423 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
20424
20425 /* Target-specific selftests. */
20426
20427 #if CHECKING_P
20428
20429 namespace selftest {
20430
20431 /* Selftest for the RTL loader.
20432 Verify that the RTL loader copes with a dump from
20433 print_rtx_function. This is essentially just a test that class
20434 function_reader can handle a real dump, but it also verifies
20435 that lookup_reg_by_dump_name correctly handles hard regs.
20436 The presence of hard reg names in the dump means that the test is
20437 target-specific, hence it is in this file. */
20438
20439 static void
20440 aarch64_test_loading_full_dump ()
20441 {
20442 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
20443
20444 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
20445
20446 rtx_insn *insn_1 = get_insn_by_uid (1);
20447 ASSERT_EQ (NOTE, GET_CODE (insn_1));
20448
20449 rtx_insn *insn_15 = get_insn_by_uid (15);
20450 ASSERT_EQ (INSN, GET_CODE (insn_15));
20451 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
20452
20453 /* Verify crtl->return_rtx. */
20454 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
20455 ASSERT_EQ (0, REGNO (crtl->return_rtx));
20456 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
20457 }
20458
20459 /* Run all target-specific selftests. */
20460
20461 static void
20462 aarch64_run_selftests (void)
20463 {
20464 aarch64_test_loading_full_dump ();
20465 }
20466
20467 } // namespace selftest
20468
20469 #endif /* #if CHECKING_P */
20470
20471 #undef TARGET_STACK_PROTECT_GUARD
20472 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
20473
20474 #undef TARGET_ADDRESS_COST
20475 #define TARGET_ADDRESS_COST aarch64_address_cost
20476
20477 /* This hook determines whether unnamed bitfields affect the alignment
20478 of the containing structure. The hook returns true if the structure
20479 should inherit the alignment requirements of an unnamed bitfield's
20480 type. */
20481 #undef TARGET_ALIGN_ANON_BITFIELD
20482 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
20483
20484 #undef TARGET_ASM_ALIGNED_DI_OP
20485 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
20486
20487 #undef TARGET_ASM_ALIGNED_HI_OP
20488 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
20489
20490 #undef TARGET_ASM_ALIGNED_SI_OP
20491 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
20492
20493 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
20494 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
20495 hook_bool_const_tree_hwi_hwi_const_tree_true
20496
20497 #undef TARGET_ASM_FILE_START
20498 #define TARGET_ASM_FILE_START aarch64_start_file
20499
20500 #undef TARGET_ASM_OUTPUT_MI_THUNK
20501 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
20502
20503 #undef TARGET_ASM_SELECT_RTX_SECTION
20504 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
20505
20506 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
20507 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
20508
20509 #undef TARGET_BUILD_BUILTIN_VA_LIST
20510 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
20511
20512 #undef TARGET_CALLEE_COPIES
20513 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
20514
20515 #undef TARGET_CAN_ELIMINATE
20516 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
20517
20518 #undef TARGET_CAN_INLINE_P
20519 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
20520
20521 #undef TARGET_CANNOT_FORCE_CONST_MEM
20522 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
20523
20524 #undef TARGET_CASE_VALUES_THRESHOLD
20525 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
20526
20527 #undef TARGET_CONDITIONAL_REGISTER_USAGE
20528 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
20529
20530 /* Only the least significant bit is used for initialization guard
20531 variables. */
20532 #undef TARGET_CXX_GUARD_MASK_BIT
20533 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
20534
20535 #undef TARGET_C_MODE_FOR_SUFFIX
20536 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
20537
20538 #ifdef TARGET_BIG_ENDIAN_DEFAULT
20539 #undef TARGET_DEFAULT_TARGET_FLAGS
20540 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
20541 #endif
20542
20543 #undef TARGET_CLASS_MAX_NREGS
20544 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
20545
20546 #undef TARGET_BUILTIN_DECL
20547 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
20548
20549 #undef TARGET_BUILTIN_RECIPROCAL
20550 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
20551
20552 #undef TARGET_C_EXCESS_PRECISION
20553 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
20554
20555 #undef TARGET_EXPAND_BUILTIN
20556 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
20557
20558 #undef TARGET_EXPAND_BUILTIN_VA_START
20559 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
20560
20561 #undef TARGET_FOLD_BUILTIN
20562 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
20563
20564 #undef TARGET_FUNCTION_ARG
20565 #define TARGET_FUNCTION_ARG aarch64_function_arg
20566
20567 #undef TARGET_FUNCTION_ARG_ADVANCE
20568 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
20569
20570 #undef TARGET_FUNCTION_ARG_BOUNDARY
20571 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
20572
20573 #undef TARGET_FUNCTION_ARG_PADDING
20574 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
20575
20576 #undef TARGET_GET_RAW_RESULT_MODE
20577 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
20578 #undef TARGET_GET_RAW_ARG_MODE
20579 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
20580
20581 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
20582 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
20583
20584 #undef TARGET_FUNCTION_VALUE
20585 #define TARGET_FUNCTION_VALUE aarch64_function_value
20586
20587 #undef TARGET_FUNCTION_VALUE_REGNO_P
20588 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
20589
20590 #undef TARGET_GIMPLE_FOLD_BUILTIN
20591 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
20592
20593 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
20594 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
20595
20596 #undef TARGET_INIT_BUILTINS
20597 #define TARGET_INIT_BUILTINS aarch64_init_builtins
20598
20599 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
20600 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
20601 aarch64_ira_change_pseudo_allocno_class
20602
20603 #undef TARGET_LEGITIMATE_ADDRESS_P
20604 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
20605
20606 #undef TARGET_LEGITIMATE_CONSTANT_P
20607 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
20608
20609 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
20610 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
20611 aarch64_legitimize_address_displacement
20612
20613 #undef TARGET_LIBGCC_CMP_RETURN_MODE
20614 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
20615
20616 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
20617 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
20618 aarch64_libgcc_floating_mode_supported_p
20619
20620 #undef TARGET_MANGLE_TYPE
20621 #define TARGET_MANGLE_TYPE aarch64_mangle_type
20622
20623 #undef TARGET_MEMORY_MOVE_COST
20624 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
20625
20626 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
20627 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
20628
20629 #undef TARGET_MUST_PASS_IN_STACK
20630 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
20631
20632 /* This target hook should return true if accesses to volatile bitfields
20633 should use the narrowest mode possible. It should return false if these
20634 accesses should use the bitfield container type. */
20635 #undef TARGET_NARROW_VOLATILE_BITFIELD
20636 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
20637
20638 #undef TARGET_OPTION_OVERRIDE
20639 #define TARGET_OPTION_OVERRIDE aarch64_override_options
20640
20641 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
20642 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
20643 aarch64_override_options_after_change
20644
20645 #undef TARGET_OPTION_SAVE
20646 #define TARGET_OPTION_SAVE aarch64_option_save
20647
20648 #undef TARGET_OPTION_RESTORE
20649 #define TARGET_OPTION_RESTORE aarch64_option_restore
20650
20651 #undef TARGET_OPTION_PRINT
20652 #define TARGET_OPTION_PRINT aarch64_option_print
20653
20654 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
20655 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
20656
20657 #undef TARGET_SET_CURRENT_FUNCTION
20658 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
20659
20660 #undef TARGET_PASS_BY_REFERENCE
20661 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
20662
20663 #undef TARGET_PREFERRED_RELOAD_CLASS
20664 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
20665
20666 #undef TARGET_SCHED_REASSOCIATION_WIDTH
20667 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
20668
20669 #undef TARGET_PROMOTED_TYPE
20670 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
20671
20672 #undef TARGET_SECONDARY_RELOAD
20673 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
20674
20675 #undef TARGET_SHIFT_TRUNCATION_MASK
20676 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
20677
20678 #undef TARGET_SETUP_INCOMING_VARARGS
20679 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
20680
20681 #undef TARGET_STRUCT_VALUE_RTX
20682 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
20683
20684 #undef TARGET_REGISTER_MOVE_COST
20685 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
20686
20687 #undef TARGET_RETURN_IN_MEMORY
20688 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
20689
20690 #undef TARGET_RETURN_IN_MSB
20691 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
20692
20693 #undef TARGET_RTX_COSTS
20694 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
20695
20696 #undef TARGET_SCALAR_MODE_SUPPORTED_P
20697 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
20698
20699 #undef TARGET_SCHED_ISSUE_RATE
20700 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
20701
20702 #undef TARGET_SCHED_VARIABLE_ISSUE
20703 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
20704
20705 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
20706 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
20707 aarch64_sched_first_cycle_multipass_dfa_lookahead
20708
20709 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
20710 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
20711 aarch64_first_cycle_multipass_dfa_lookahead_guard
20712
20713 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
20714 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
20715 aarch64_get_separate_components
20716
20717 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
20718 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
20719 aarch64_components_for_bb
20720
20721 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
20722 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
20723 aarch64_disqualify_components
20724
20725 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
20726 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
20727 aarch64_emit_prologue_components
20728
20729 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
20730 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
20731 aarch64_emit_epilogue_components
20732
20733 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
20734 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
20735 aarch64_set_handled_components
20736
20737 #undef TARGET_TRAMPOLINE_INIT
20738 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
20739
20740 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
20741 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
20742
20743 #undef TARGET_VECTOR_MODE_SUPPORTED_P
20744 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
20745
20746 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
20747 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
20748 aarch64_builtin_support_vector_misalignment
20749
20750 #undef TARGET_ARRAY_MODE
20751 #define TARGET_ARRAY_MODE aarch64_array_mode
20752
20753 #undef TARGET_ARRAY_MODE_SUPPORTED_P
20754 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
20755
20756 #undef TARGET_VECTORIZE_ADD_STMT_COST
20757 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
20758
20759 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
20760 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
20761 aarch64_builtin_vectorization_cost
20762
20763 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
20764 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
20765
20766 #undef TARGET_VECTORIZE_BUILTINS
20767 #define TARGET_VECTORIZE_BUILTINS
20768
20769 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
20770 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
20771 aarch64_builtin_vectorized_function
20772
20773 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
20774 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
20775 aarch64_autovectorize_vector_sizes
20776
20777 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
20778 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
20779 aarch64_atomic_assign_expand_fenv
20780
20781 /* Section anchor support. */
20782
20783 #undef TARGET_MIN_ANCHOR_OFFSET
20784 #define TARGET_MIN_ANCHOR_OFFSET -256
20785
20786 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
20787 byte offset; we can do much more for larger data types, but have no way
20788 to determine the size of the access. We assume accesses are aligned. */
20789 #undef TARGET_MAX_ANCHOR_OFFSET
20790 #define TARGET_MAX_ANCHOR_OFFSET 4095
20791
20792 #undef TARGET_VECTOR_ALIGNMENT
20793 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
20794
20795 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
20796 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
20797 aarch64_vectorize_preferred_vector_alignment
20798 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
20799 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
20800 aarch64_simd_vector_alignment_reachable
20801
20802 /* vec_perm support. */
20803
20804 #undef TARGET_VECTORIZE_VEC_PERM_CONST
20805 #define TARGET_VECTORIZE_VEC_PERM_CONST \
20806 aarch64_vectorize_vec_perm_const
20807
20808 #undef TARGET_VECTORIZE_GET_MASK_MODE
20809 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
20810 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
20811 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
20812 aarch64_empty_mask_is_expensive
20813 #undef TARGET_PREFERRED_ELSE_VALUE
20814 #define TARGET_PREFERRED_ELSE_VALUE \
20815 aarch64_preferred_else_value
20816
20817 #undef TARGET_INIT_LIBFUNCS
20818 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
20819
20820 #undef TARGET_FIXED_CONDITION_CODE_REGS
20821 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
20822
20823 #undef TARGET_FLAGS_REGNUM
20824 #define TARGET_FLAGS_REGNUM CC_REGNUM
20825
20826 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
20827 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
20828
20829 #undef TARGET_ASAN_SHADOW_OFFSET
20830 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
20831
20832 #undef TARGET_LEGITIMIZE_ADDRESS
20833 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
20834
20835 #undef TARGET_SCHED_CAN_SPECULATE_INSN
20836 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
20837
20838 #undef TARGET_CAN_USE_DOLOOP_P
20839 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
20840
20841 #undef TARGET_SCHED_ADJUST_PRIORITY
20842 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
20843
20844 #undef TARGET_SCHED_MACRO_FUSION_P
20845 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
20846
20847 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
20848 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
20849
20850 #undef TARGET_SCHED_FUSION_PRIORITY
20851 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
20852
20853 #undef TARGET_UNSPEC_MAY_TRAP_P
20854 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
20855
20856 #undef TARGET_USE_PSEUDO_PIC_REG
20857 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
20858
20859 #undef TARGET_PRINT_OPERAND
20860 #define TARGET_PRINT_OPERAND aarch64_print_operand
20861
20862 #undef TARGET_PRINT_OPERAND_ADDRESS
20863 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
20864
20865 #undef TARGET_OPTAB_SUPPORTED_P
20866 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
20867
20868 #undef TARGET_OMIT_STRUCT_RETURN_REG
20869 #define TARGET_OMIT_STRUCT_RETURN_REG true
20870
20871 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
20872 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
20873 aarch64_dwarf_poly_indeterminate_value
20874
20875 /* The architecture reserves bits 0 and 1, so use bit 2 for descriptors. */
20876 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
20877 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
20878
20879 #undef TARGET_HARD_REGNO_NREGS
20880 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
20881 #undef TARGET_HARD_REGNO_MODE_OK
20882 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
20883
20884 #undef TARGET_MODES_TIEABLE_P
20885 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
20886
20887 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
20888 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
20889 aarch64_hard_regno_call_part_clobbered
20890
20891 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
20892 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
20893 aarch64_remove_extra_call_preserved_regs
20894
20895 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
20896 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
20897 aarch64_return_call_with_max_clobbers
20898
20899 #undef TARGET_CONSTANT_ALIGNMENT
20900 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
20901
20902 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
20903 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
20904 aarch64_stack_clash_protection_alloca_probe_range
20905
20906 #undef TARGET_COMPUTE_PRESSURE_CLASSES
20907 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
20908
20909 #undef TARGET_CAN_CHANGE_MODE_CLASS
20910 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
20911
20912 #undef TARGET_SELECT_EARLY_REMAT_MODES
20913 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
20914
20915 #undef TARGET_SPECULATION_SAFE_VALUE
20916 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
20917
20918 #undef TARGET_ESTIMATED_POLY_VALUE
20919 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
20920
20921 #undef TARGET_ATTRIBUTE_TABLE
20922 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
20923
20924 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
20925 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
20926 aarch64_simd_clone_compute_vecsize_and_simdlen
20927
20928 #undef TARGET_SIMD_CLONE_ADJUST
20929 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
20930
20931 #undef TARGET_SIMD_CLONE_USABLE
20932 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
20933
20934 #undef TARGET_COMP_TYPE_ATTRIBUTES
20935 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
20936
20937 #undef TARGET_GET_MULTILIB_ABI_NAME
20938 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
20939
20940 #if CHECKING_P
20941 #undef TARGET_RUN_TARGET_SELFTESTS
20942 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
20943 #endif /* #if CHECKING_P */
20944
20945 #undef TARGET_ASM_POST_CFI_STARTPROC
20946 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
20947
20948 struct gcc_target targetm = TARGET_INITIALIZER;
20949
20950 #include "gt-aarch64.h"