1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
45 #include "alias.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "output.h"
51 #include "flags.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "reload.h"
55 #include "langhooks.h"
56 #include "opts.h"
57 #include "params.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
74
75 /* This file should be included last. */
76 #include "target-def.h"
77
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
80
81 /* Classifies an address.
82
83 ADDRESS_REG_IMM
84 A simple base register plus immediate offset.
85
86 ADDRESS_REG_WB
87 A base register indexed by immediate offset with writeback.
88
89 ADDRESS_REG_REG
90 A base register indexed by (optionally scaled) register.
91
92 ADDRESS_REG_UXTW
93 A base register indexed by (optionally scaled) zero-extended register.
94
95 ADDRESS_REG_SXTW
96 A base register indexed by (optionally scaled) sign-extended register.
97
98 ADDRESS_LO_SUM
99 A LO_SUM rtx with a base register and "LO12" symbol relocation.
100
101 ADDRESS_SYMBOLIC:
102 A constant symbolic address, in pc-relative literal pool. */
103
104 enum aarch64_address_type {
105 ADDRESS_REG_IMM,
106 ADDRESS_REG_WB,
107 ADDRESS_REG_REG,
108 ADDRESS_REG_UXTW,
109 ADDRESS_REG_SXTW,
110 ADDRESS_LO_SUM,
111 ADDRESS_SYMBOLIC
112 };
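/* For illustration (representative forms only, not an exhaustive list),
   the classifications above correspond to addresses such as the
   following, assuming base register x0 and index register x1/w1:

     ADDRESS_REG_IMM    ldr x2, [x0, #16]
     ADDRESS_REG_WB     ldr x2, [x0, #16]!   or   ldr x2, [x0], #16
     ADDRESS_REG_REG    ldr x2, [x0, x1, lsl #3]
     ADDRESS_REG_UXTW   ldr x2, [x0, w1, uxtw #3]
     ADDRESS_REG_SXTW   ldr x2, [x0, w1, sxtw #3]
     ADDRESS_LO_SUM     ldr x2, [x0, #:lo12:sym]
     ADDRESS_SYMBOLIC   ldr x2, .Lliteral   (pc-relative literal load)  */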
113
114 struct aarch64_address_info {
115 enum aarch64_address_type type;
116 rtx base;
117 rtx offset;
118 poly_int64 const_offset;
119 int shift;
120 enum aarch64_symbol_type symbol_type;
121 };
122
123 /* Information about a legitimate vector immediate operand. */
124 struct simd_immediate_info
125 {
126 enum insn_type { MOV, MVN };
127 enum modifier_type { LSL, MSL };
128
129 simd_immediate_info () {}
130 simd_immediate_info (scalar_float_mode, rtx);
131 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
132 insn_type = MOV, modifier_type = LSL,
133 unsigned int = 0);
134 simd_immediate_info (scalar_mode, rtx, rtx);
135
136 /* The mode of the elements. */
137 scalar_mode elt_mode;
138
139 /* The value of each element if all elements are the same, or the
140 first value if the constant is a series. */
141 rtx value;
142
143 /* The value of the step if the constant is a series, null otherwise. */
144 rtx step;
145
146 /* The instruction to use to move the immediate into a vector. */
147 insn_type insn;
148
149 /* The kind of shift modifier to use, and the number of bits to shift.
150 This is (LSL, 0) if no shift is needed. */
151 modifier_type modifier;
152 unsigned int shift;
153 };
154
155 /* Construct a floating-point immediate in which each element has mode
156 ELT_MODE_IN and value VALUE_IN. */
157 inline simd_immediate_info
158 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
159 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
160 modifier (LSL), shift (0)
161 {}
162
163 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
164 and value VALUE_IN. The other parameters are as for the structure
165 fields. */
166 inline simd_immediate_info
167 ::simd_immediate_info (scalar_int_mode elt_mode_in,
168 unsigned HOST_WIDE_INT value_in,
169 insn_type insn_in, modifier_type modifier_in,
170 unsigned int shift_in)
171 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
172 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
173 {}
174
175 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
176 and where element I is equal to VALUE_IN + I * STEP_IN. */
177 inline simd_immediate_info
178 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
179 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
180 modifier (LSL), shift (0)
181 {}
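/* For illustration (a representative example, not taken from real input):
   the Advanced SIMD constant materialized by "movi v0.4h, #0x2a, lsl #8"
   would be described by elt_mode == HImode, value == 0x2a,
   step == NULL_RTX, insn == MOV, modifier == LSL and shift == 8.  */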
182
183 /* The current code model. */
184 enum aarch64_code_model aarch64_cmodel;
185
186 /* The number of 64-bit elements in an SVE vector. */
187 poly_uint16 aarch64_sve_vg;
188
189 #ifdef HAVE_AS_TLS
190 #undef TARGET_HAVE_TLS
191 #define TARGET_HAVE_TLS 1
192 #endif
193
194 static bool aarch64_composite_type_p (const_tree, machine_mode);
195 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
196 const_tree,
197 machine_mode *, int *,
198 bool *);
199 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
200 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
201 static void aarch64_override_options_after_change (void);
202 static bool aarch64_vector_mode_supported_p (machine_mode);
203 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
204 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
205 const_tree type,
206 int misalignment,
207 bool is_packed);
208 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
209 static bool aarch64_print_ldpstp_address (FILE *, machine_mode, rtx);
210
211 /* Major revision number of the ARM Architecture implemented by the target. */
212 unsigned aarch64_architecture_version;
213
214 /* The processor for which instructions should be scheduled. */
215 enum aarch64_processor aarch64_tune = cortexa53;
216
217 /* Mask to specify which instruction scheduling options should be used. */
218 unsigned long aarch64_tune_flags = 0;
219
220 /* Global flag for PC relative loads. */
221 bool aarch64_pcrelative_literal_loads;
222
223 /* Global flag for whether frame pointer is enabled. */
224 bool aarch64_use_frame_pointer;
225
226 /* Support for command line parsing of boolean flags in the tuning
227 structures. */
228 struct aarch64_flag_desc
229 {
230 const char* name;
231 unsigned int flag;
232 };
233
234 #define AARCH64_FUSION_PAIR(name, internal_name) \
235 { name, AARCH64_FUSE_##internal_name },
236 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
237 {
238 { "none", AARCH64_FUSE_NOTHING },
239 #include "aarch64-fusion-pairs.def"
240 { "all", AARCH64_FUSE_ALL },
241 { NULL, AARCH64_FUSE_NOTHING }
242 };
243
244 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
245 { name, AARCH64_EXTRA_TUNE_##internal_name },
246 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
247 {
248 { "none", AARCH64_EXTRA_TUNE_NONE },
249 #include "aarch64-tuning-flags.def"
250 { "all", AARCH64_EXTRA_TUNE_ALL },
251 { NULL, AARCH64_EXTRA_TUNE_NONE }
252 };
253
254 /* Tuning parameters. */
255
256 static const struct cpu_addrcost_table generic_addrcost_table =
257 {
258 {
259 1, /* hi */
260 0, /* si */
261 0, /* di */
262 1, /* ti */
263 },
264 0, /* pre_modify */
265 0, /* post_modify */
266 0, /* register_offset */
267 0, /* register_sextend */
268 0, /* register_zextend */
269 0 /* imm_offset */
270 };
271
272 static const struct cpu_addrcost_table exynosm1_addrcost_table =
273 {
274 {
275 0, /* hi */
276 0, /* si */
277 0, /* di */
278 2, /* ti */
279 },
280 0, /* pre_modify */
281 0, /* post_modify */
282 1, /* register_offset */
283 1, /* register_sextend */
284 2, /* register_zextend */
285 0, /* imm_offset */
286 };
287
288 static const struct cpu_addrcost_table xgene1_addrcost_table =
289 {
290 {
291 1, /* hi */
292 0, /* si */
293 0, /* di */
294 1, /* ti */
295 },
296 1, /* pre_modify */
297 0, /* post_modify */
298 0, /* register_offset */
299 1, /* register_sextend */
300 1, /* register_zextend */
301 0, /* imm_offset */
302 };
303
304 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
305 {
306 {
307 1, /* hi */
308 1, /* si */
309 1, /* di */
310 2, /* ti */
311 },
312 0, /* pre_modify */
313 0, /* post_modify */
314 2, /* register_offset */
315 3, /* register_sextend */
316 3, /* register_zextend */
317 0, /* imm_offset */
318 };
319
320 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
321 {
322 {
323 1, /* hi */
324 1, /* si */
325 1, /* di */
326 2, /* ti */
327 },
328 1, /* pre_modify */
329 1, /* post_modify */
330 3, /* register_offset */
331 4, /* register_sextend */
332 3, /* register_zextend */
333 2, /* imm_offset */
334 };
335
336 static const struct cpu_regmove_cost generic_regmove_cost =
337 {
338 1, /* GP2GP */
339 /* Avoid the use of slow int<->fp moves for spilling by setting
340 their cost higher than memmov_cost. */
341 5, /* GP2FP */
342 5, /* FP2GP */
343 2 /* FP2FP */
344 };
345
346 static const struct cpu_regmove_cost cortexa57_regmove_cost =
347 {
348 1, /* GP2GP */
349 /* Avoid the use of slow int<->fp moves for spilling by setting
350 their cost higher than memmov_cost. */
351 5, /* GP2FP */
352 5, /* FP2GP */
353 2 /* FP2FP */
354 };
355
356 static const struct cpu_regmove_cost cortexa53_regmove_cost =
357 {
358 1, /* GP2GP */
359 /* Avoid the use of slow int<->fp moves for spilling by setting
360 their cost higher than memmov_cost. */
361 5, /* GP2FP */
362 5, /* FP2GP */
363 2 /* FP2FP */
364 };
365
366 static const struct cpu_regmove_cost exynosm1_regmove_cost =
367 {
368 1, /* GP2GP */
369 /* Avoid the use of slow int<->fp moves for spilling by setting
370 their cost higher than memmov_cost (the actual costs are 4 and 9). */
371 9, /* GP2FP */
372 9, /* FP2GP */
373 1 /* FP2FP */
374 };
375
376 static const struct cpu_regmove_cost thunderx_regmove_cost =
377 {
378 2, /* GP2GP */
379 2, /* GP2FP */
380 6, /* FP2GP */
381 4 /* FP2FP */
382 };
383
384 static const struct cpu_regmove_cost xgene1_regmove_cost =
385 {
386 1, /* GP2GP */
387 /* Avoid the use of slow int<->fp moves for spilling by setting
388 their cost higher than memmov_cost. */
389 8, /* GP2FP */
390 8, /* FP2GP */
391 2 /* FP2FP */
392 };
393
394 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
395 {
396 2, /* GP2GP */
397 /* Avoid the use of int<->fp moves for spilling. */
398 6, /* GP2FP */
399 6, /* FP2GP */
400 4 /* FP2FP */
401 };
402
403 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
404 {
405 1, /* GP2GP */
406 /* Avoid the use of int<->fp moves for spilling. */
407 8, /* GP2FP */
408 8, /* FP2GP */
409 4 /* FP2FP */
410 };
411
412 /* Generic costs for vector insn classes. */
413 static const struct cpu_vector_cost generic_vector_cost =
414 {
415 1, /* scalar_int_stmt_cost */
416 1, /* scalar_fp_stmt_cost */
417 1, /* scalar_load_cost */
418 1, /* scalar_store_cost */
419 1, /* vec_int_stmt_cost */
420 1, /* vec_fp_stmt_cost */
421 2, /* vec_permute_cost */
422 1, /* vec_to_scalar_cost */
423 1, /* scalar_to_vec_cost */
424 1, /* vec_align_load_cost */
425 1, /* vec_unalign_load_cost */
426 1, /* vec_unalign_store_cost */
427 1, /* vec_store_cost */
428 3, /* cond_taken_branch_cost */
429 1 /* cond_not_taken_branch_cost */
430 };
431
432 /* ThunderX costs for vector insn classes. */
433 static const struct cpu_vector_cost thunderx_vector_cost =
434 {
435 1, /* scalar_int_stmt_cost */
436 1, /* scalar_fp_stmt_cost */
437 3, /* scalar_load_cost */
438 1, /* scalar_store_cost */
439 4, /* vec_int_stmt_cost */
440 1, /* vec_fp_stmt_cost */
441 4, /* vec_permute_cost */
442 2, /* vec_to_scalar_cost */
443 2, /* scalar_to_vec_cost */
444 3, /* vec_align_load_cost */
445 5, /* vec_unalign_load_cost */
446 5, /* vec_unalign_store_cost */
447 1, /* vec_store_cost */
448 3, /* cond_taken_branch_cost */
449 3 /* cond_not_taken_branch_cost */
450 };
451
452 /* Costs for vector insn classes for Cortex-A57. */
453 static const struct cpu_vector_cost cortexa57_vector_cost =
454 {
455 1, /* scalar_int_stmt_cost */
456 1, /* scalar_fp_stmt_cost */
457 4, /* scalar_load_cost */
458 1, /* scalar_store_cost */
459 2, /* vec_int_stmt_cost */
460 2, /* vec_fp_stmt_cost */
461 3, /* vec_permute_cost */
462 8, /* vec_to_scalar_cost */
463 8, /* scalar_to_vec_cost */
464 4, /* vec_align_load_cost */
465 4, /* vec_unalign_load_cost */
466 1, /* vec_unalign_store_cost */
467 1, /* vec_store_cost */
468 1, /* cond_taken_branch_cost */
469 1 /* cond_not_taken_branch_cost */
470 };
471
472 static const struct cpu_vector_cost exynosm1_vector_cost =
473 {
474 1, /* scalar_int_stmt_cost */
475 1, /* scalar_fp_stmt_cost */
476 5, /* scalar_load_cost */
477 1, /* scalar_store_cost */
478 3, /* vec_int_stmt_cost */
479 3, /* vec_fp_stmt_cost */
480 3, /* vec_permute_cost */
481 3, /* vec_to_scalar_cost */
482 3, /* scalar_to_vec_cost */
483 5, /* vec_align_load_cost */
484 5, /* vec_unalign_load_cost */
485 1, /* vec_unalign_store_cost */
486 1, /* vec_store_cost */
487 1, /* cond_taken_branch_cost */
488 1 /* cond_not_taken_branch_cost */
489 };
490
491 /* Costs for vector insn classes for X-Gene 1. */
492 static const struct cpu_vector_cost xgene1_vector_cost =
493 {
494 1, /* scalar_int_stmt_cost */
495 1, /* scalar_fp_stmt_cost */
496 5, /* scalar_load_cost */
497 1, /* scalar_store_cost */
498 2, /* vec_int_stmt_cost */
499 2, /* vec_fp_stmt_cost */
500 2, /* vec_permute_cost */
501 4, /* vec_to_scalar_cost */
502 4, /* scalar_to_vec_cost */
503 10, /* vec_align_load_cost */
504 10, /* vec_unalign_load_cost */
505 2, /* vec_unalign_store_cost */
506 2, /* vec_store_cost */
507 2, /* cond_taken_branch_cost */
508 1 /* cond_not_taken_branch_cost */
509 };
510
511 /* Costs for vector insn classes for Vulcan. */
512 static const struct cpu_vector_cost thunderx2t99_vector_cost =
513 {
514 1, /* scalar_int_stmt_cost */
515 6, /* scalar_fp_stmt_cost */
516 4, /* scalar_load_cost */
517 1, /* scalar_store_cost */
518 5, /* vec_int_stmt_cost */
519 6, /* vec_fp_stmt_cost */
520 3, /* vec_permute_cost */
521 6, /* vec_to_scalar_cost */
522 5, /* scalar_to_vec_cost */
523 8, /* vec_align_load_cost */
524 8, /* vec_unalign_load_cost */
525 4, /* vec_unalign_store_cost */
526 4, /* vec_store_cost */
527 2, /* cond_taken_branch_cost */
528 1 /* cond_not_taken_branch_cost */
529 };
530
531 /* Generic costs for branch instructions. */
532 static const struct cpu_branch_cost generic_branch_cost =
533 {
534 1, /* Predictable. */
535 3 /* Unpredictable. */
536 };
537
538 /* Generic approximation modes. */
539 static const cpu_approx_modes generic_approx_modes =
540 {
541 AARCH64_APPROX_NONE, /* division */
542 AARCH64_APPROX_NONE, /* sqrt */
543 AARCH64_APPROX_NONE /* recip_sqrt */
544 };
545
546 /* Approximation modes for Exynos M1. */
547 static const cpu_approx_modes exynosm1_approx_modes =
548 {
549 AARCH64_APPROX_NONE, /* division */
550 AARCH64_APPROX_ALL, /* sqrt */
551 AARCH64_APPROX_ALL /* recip_sqrt */
552 };
553
554 /* Approximation modes for X-Gene 1. */
555 static const cpu_approx_modes xgene1_approx_modes =
556 {
557 AARCH64_APPROX_NONE, /* division */
558 AARCH64_APPROX_NONE, /* sqrt */
559 AARCH64_APPROX_ALL /* recip_sqrt */
560 };
561
562 /* Generic prefetch settings (which disable prefetch). */
563 static const cpu_prefetch_tune generic_prefetch_tune =
564 {
565 0, /* num_slots */
566 -1, /* l1_cache_size */
567 -1, /* l1_cache_line_size */
568 -1, /* l2_cache_size */
569 true, /* prefetch_dynamic_strides */
570 -1, /* minimum_stride */
571 -1 /* default_opt_level */
572 };
573
574 static const cpu_prefetch_tune exynosm1_prefetch_tune =
575 {
576 0, /* num_slots */
577 -1, /* l1_cache_size */
578 64, /* l1_cache_line_size */
579 -1, /* l2_cache_size */
580 true, /* prefetch_dynamic_strides */
581 -1, /* minimum_stride */
582 -1 /* default_opt_level */
583 };
584
585 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
586 {
587 4, /* num_slots */
588 32, /* l1_cache_size */
589 64, /* l1_cache_line_size */
590 512, /* l2_cache_size */
591 false, /* prefetch_dynamic_strides */
592 2048, /* minimum_stride */
593 3 /* default_opt_level */
594 };
595
596 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
597 {
598 8, /* num_slots */
599 32, /* l1_cache_size */
600 128, /* l1_cache_line_size */
601 16*1024, /* l2_cache_size */
602 true, /* prefetch_dynamic_strides */
603 -1, /* minimum_stride */
604 3 /* default_opt_level */
605 };
606
607 static const cpu_prefetch_tune thunderx_prefetch_tune =
608 {
609 8, /* num_slots */
610 32, /* l1_cache_size */
611 128, /* l1_cache_line_size */
612 -1, /* l2_cache_size */
613 true, /* prefetch_dynamic_strides */
614 -1, /* minimum_stride */
615 -1 /* default_opt_level */
616 };
617
618 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
619 {
620 8, /* num_slots */
621 32, /* l1_cache_size */
622 64, /* l1_cache_line_size */
623 256, /* l2_cache_size */
624 true, /* prefetch_dynamic_strides */
625 -1, /* minimum_stride */
626 -1 /* default_opt_level */
627 };
628
629 static const struct tune_params generic_tunings =
630 {
631 &cortexa57_extra_costs,
632 &generic_addrcost_table,
633 &generic_regmove_cost,
634 &generic_vector_cost,
635 &generic_branch_cost,
636 &generic_approx_modes,
637 4, /* memmov_cost */
638 2, /* issue_rate */
639 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
640 "8", /* function_align. */
641 "4", /* jump_align. */
642 "8", /* loop_align. */
643 2, /* int_reassoc_width. */
644 4, /* fp_reassoc_width. */
645 1, /* vec_reassoc_width. */
646 2, /* min_div_recip_mul_sf. */
647 2, /* min_div_recip_mul_df. */
648 0, /* max_case_values. */
649 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
650 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
651 &generic_prefetch_tune
652 };
653
654 static const struct tune_params cortexa35_tunings =
655 {
656 &cortexa53_extra_costs,
657 &generic_addrcost_table,
658 &cortexa53_regmove_cost,
659 &generic_vector_cost,
660 &generic_branch_cost,
661 &generic_approx_modes,
662 4, /* memmov_cost */
663 1, /* issue_rate */
664 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
665 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
666 "16", /* function_align. */
667 "4", /* jump_align. */
668 "8", /* loop_align. */
669 2, /* int_reassoc_width. */
670 4, /* fp_reassoc_width. */
671 1, /* vec_reassoc_width. */
672 2, /* min_div_recip_mul_sf. */
673 2, /* min_div_recip_mul_df. */
674 0, /* max_case_values. */
675 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
676 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
677 &generic_prefetch_tune
678 };
679
680 static const struct tune_params cortexa53_tunings =
681 {
682 &cortexa53_extra_costs,
683 &generic_addrcost_table,
684 &cortexa53_regmove_cost,
685 &generic_vector_cost,
686 &generic_branch_cost,
687 &generic_approx_modes,
688 4, /* memmov_cost */
689 2, /* issue_rate */
690 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
691 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
692 "16", /* function_align. */
693 "4", /* jump_align. */
694 "8", /* loop_align. */
695 2, /* int_reassoc_width. */
696 4, /* fp_reassoc_width. */
697 1, /* vec_reassoc_width. */
698 2, /* min_div_recip_mul_sf. */
699 2, /* min_div_recip_mul_df. */
700 0, /* max_case_values. */
701 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
702 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
703 &generic_prefetch_tune
704 };
705
706 static const struct tune_params cortexa57_tunings =
707 {
708 &cortexa57_extra_costs,
709 &generic_addrcost_table,
710 &cortexa57_regmove_cost,
711 &cortexa57_vector_cost,
712 &generic_branch_cost,
713 &generic_approx_modes,
714 4, /* memmov_cost */
715 3, /* issue_rate */
716 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
717 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
718 "16", /* function_align. */
719 "4", /* jump_align. */
720 "8", /* loop_align. */
721 2, /* int_reassoc_width. */
722 4, /* fp_reassoc_width. */
723 1, /* vec_reassoc_width. */
724 2, /* min_div_recip_mul_sf. */
725 2, /* min_div_recip_mul_df. */
726 0, /* max_case_values. */
727 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
728 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
729 &generic_prefetch_tune
730 };
731
732 static const struct tune_params cortexa72_tunings =
733 {
734 &cortexa57_extra_costs,
735 &generic_addrcost_table,
736 &cortexa57_regmove_cost,
737 &cortexa57_vector_cost,
738 &generic_branch_cost,
739 &generic_approx_modes,
740 4, /* memmov_cost */
741 3, /* issue_rate */
742 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
743 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
744 "16", /* function_align. */
745 "4", /* jump_align. */
746 "8", /* loop_align. */
747 2, /* int_reassoc_width. */
748 4, /* fp_reassoc_width. */
749 1, /* vec_reassoc_width. */
750 2, /* min_div_recip_mul_sf. */
751 2, /* min_div_recip_mul_df. */
752 0, /* max_case_values. */
753 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
754 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
755 &generic_prefetch_tune
756 };
757
758 static const struct tune_params cortexa73_tunings =
759 {
760 &cortexa57_extra_costs,
761 &generic_addrcost_table,
762 &cortexa57_regmove_cost,
763 &cortexa57_vector_cost,
764 &generic_branch_cost,
765 &generic_approx_modes,
766 4, /* memmov_cost. */
767 2, /* issue_rate. */
768 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
769 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
770 "16", /* function_align. */
771 "4", /* jump_align. */
772 "8", /* loop_align. */
773 2, /* int_reassoc_width. */
774 4, /* fp_reassoc_width. */
775 1, /* vec_reassoc_width. */
776 2, /* min_div_recip_mul_sf. */
777 2, /* min_div_recip_mul_df. */
778 0, /* max_case_values. */
779 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
780 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
781 &generic_prefetch_tune
782 };
783
784
785
786 static const struct tune_params exynosm1_tunings =
787 {
788 &exynosm1_extra_costs,
789 &exynosm1_addrcost_table,
790 &exynosm1_regmove_cost,
791 &exynosm1_vector_cost,
792 &generic_branch_cost,
793 &exynosm1_approx_modes,
794 4, /* memmov_cost */
795 3, /* issue_rate */
796 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
797 "4", /* function_align. */
798 "4", /* jump_align. */
799 "4", /* loop_align. */
800 2, /* int_reassoc_width. */
801 4, /* fp_reassoc_width. */
802 1, /* vec_reassoc_width. */
803 2, /* min_div_recip_mul_sf. */
804 2, /* min_div_recip_mul_df. */
805 48, /* max_case_values. */
806 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
807 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
808 &exynosm1_prefetch_tune
809 };
810
811 static const struct tune_params thunderxt88_tunings =
812 {
813 &thunderx_extra_costs,
814 &generic_addrcost_table,
815 &thunderx_regmove_cost,
816 &thunderx_vector_cost,
817 &generic_branch_cost,
818 &generic_approx_modes,
819 6, /* memmov_cost */
820 2, /* issue_rate */
821 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
822 "8", /* function_align. */
823 "8", /* jump_align. */
824 "8", /* loop_align. */
825 2, /* int_reassoc_width. */
826 4, /* fp_reassoc_width. */
827 1, /* vec_reassoc_width. */
828 2, /* min_div_recip_mul_sf. */
829 2, /* min_div_recip_mul_df. */
830 0, /* max_case_values. */
831 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
832 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
833 &thunderxt88_prefetch_tune
834 };
835
836 static const struct tune_params thunderx_tunings =
837 {
838 &thunderx_extra_costs,
839 &generic_addrcost_table,
840 &thunderx_regmove_cost,
841 &thunderx_vector_cost,
842 &generic_branch_cost,
843 &generic_approx_modes,
844 6, /* memmov_cost */
845 2, /* issue_rate */
846 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
847 "8", /* function_align. */
848 "8", /* jump_align. */
849 "8", /* loop_align. */
850 2, /* int_reassoc_width. */
851 4, /* fp_reassoc_width. */
852 1, /* vec_reassoc_width. */
853 2, /* min_div_recip_mul_sf. */
854 2, /* min_div_recip_mul_df. */
855 0, /* max_case_values. */
856 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
857 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
858 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
859 &thunderx_prefetch_tune
860 };
861
862 static const struct tune_params xgene1_tunings =
863 {
864 &xgene1_extra_costs,
865 &xgene1_addrcost_table,
866 &xgene1_regmove_cost,
867 &xgene1_vector_cost,
868 &generic_branch_cost,
869 &xgene1_approx_modes,
870 6, /* memmov_cost */
871 4, /* issue_rate */
872 AARCH64_FUSE_NOTHING, /* fusible_ops */
873 "16", /* function_align. */
874 "8", /* jump_align. */
875 "16", /* loop_align. */
876 2, /* int_reassoc_width. */
877 4, /* fp_reassoc_width. */
878 1, /* vec_reassoc_width. */
879 2, /* min_div_recip_mul_sf. */
880 2, /* min_div_recip_mul_df. */
881 0, /* max_case_values. */
882 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
883 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
884 &generic_prefetch_tune
885 };
886
887 static const struct tune_params qdf24xx_tunings =
888 {
889 &qdf24xx_extra_costs,
890 &qdf24xx_addrcost_table,
891 &qdf24xx_regmove_cost,
892 &generic_vector_cost,
893 &generic_branch_cost,
894 &generic_approx_modes,
895 4, /* memmov_cost */
896 4, /* issue_rate */
897 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
898 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
899 "16", /* function_align. */
900 "8", /* jump_align. */
901 "16", /* loop_align. */
902 2, /* int_reassoc_width. */
903 4, /* fp_reassoc_width. */
904 1, /* vec_reassoc_width. */
905 2, /* min_div_recip_mul_sf. */
906 2, /* min_div_recip_mul_df. */
907 0, /* max_case_values. */
908 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
909 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
910 &qdf24xx_prefetch_tune
911 };
912
913 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
914 for now. */
915 static const struct tune_params saphira_tunings =
916 {
917 &generic_extra_costs,
918 &generic_addrcost_table,
919 &generic_regmove_cost,
920 &generic_vector_cost,
921 &generic_branch_cost,
922 &generic_approx_modes,
923 4, /* memmov_cost */
924 4, /* issue_rate */
925 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
926 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
927 "16", /* function_align. */
928 "8", /* jump_align. */
929 "16", /* loop_align. */
930 2, /* int_reassoc_width. */
931 4, /* fp_reassoc_width. */
932 1, /* vec_reassoc_width. */
933 2, /* min_div_recip_mul_sf. */
934 2, /* min_div_recip_mul_df. */
935 0, /* max_case_values. */
936 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
937 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
938 &generic_prefetch_tune
939 };
940
941 static const struct tune_params thunderx2t99_tunings =
942 {
943 &thunderx2t99_extra_costs,
944 &thunderx2t99_addrcost_table,
945 &thunderx2t99_regmove_cost,
946 &thunderx2t99_vector_cost,
947 &generic_branch_cost,
948 &generic_approx_modes,
949 4, /* memmov_cost. */
950 4, /* issue_rate. */
951 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
952 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
953 "16", /* function_align. */
954 "8", /* jump_align. */
955 "16", /* loop_align. */
956 3, /* int_reassoc_width. */
957 2, /* fp_reassoc_width. */
958 2, /* vec_reassoc_width. */
959 2, /* min_div_recip_mul_sf. */
960 2, /* min_div_recip_mul_df. */
961 0, /* max_case_values. */
962 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
963 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
964 &thunderx2t99_prefetch_tune
965 };
966
967 /* Support for fine-grained override of the tuning structures. */
968 struct aarch64_tuning_override_function
969 {
970 const char* name;
971 void (*parse_override)(const char*, struct tune_params*);
972 };
973
974 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
975 static void aarch64_parse_tune_string (const char*, struct tune_params*);
976
977 static const struct aarch64_tuning_override_function
978 aarch64_tuning_override_functions[] =
979 {
980 { "fuse", aarch64_parse_fuse_string },
981 { "tune", aarch64_parse_tune_string },
982 { NULL, NULL }
983 };
984
985 /* A processor implementing AArch64. */
986 struct processor
987 {
988 const char *const name;
989 enum aarch64_processor ident;
990 enum aarch64_processor sched_core;
991 enum aarch64_arch arch;
992 unsigned architecture_version;
993 const unsigned long flags;
994 const struct tune_params *const tune;
995 };
996
997 /* Architectures implementing AArch64. */
998 static const struct processor all_architectures[] =
999 {
1000 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1001 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1002 #include "aarch64-arches.def"
1003 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1004 };
1005
1006 /* Processor cores implementing AArch64. */
1007 static const struct processor all_cores[] =
1008 {
1009 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1010 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1011 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1012 FLAGS, &COSTS##_tunings},
1013 #include "aarch64-cores.def"
1014 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1015 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1016 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1017 };
1018
1019
1020 /* Target specification. These are populated by the -march, -mtune, -mcpu
1021 handling code or by target attributes. */
1022 static const struct processor *selected_arch;
1023 static const struct processor *selected_cpu;
1024 static const struct processor *selected_tune;
1025
1026 /* The current tuning set. */
1027 struct tune_params aarch64_tune_params = generic_tunings;
1028
1029 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1030
1031 /* An ISA extension in the co-processor and main instruction set space. */
1032 struct aarch64_option_extension
1033 {
1034 const char *const name;
1035 const unsigned long flags_on;
1036 const unsigned long flags_off;
1037 };
1038
1039 typedef enum aarch64_cond_code
1040 {
1041 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1042 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1043 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1044 }
1045 aarch64_cc;
1046
1047 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
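/* The enumeration above lists each condition next to its logical inverse,
   so flipping the low bit maps a condition to its inverse: for example
   AARCH64_EQ (0) <-> AARCH64_NE (1), AARCH64_CS (2) <-> AARCH64_CC (3)
   and AARCH64_GE (10) <-> AARCH64_LT (11).  */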
1048
1049 /* The condition codes of the processor, and the inverse function. */
1050 static const char * const aarch64_condition_codes[] =
1051 {
1052 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1053 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1054 };
1055
1056 /* Generate code to enable conditional branches in functions over 1 MiB. */
1057 const char *
1058 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1059 const char * branch_format)
1060 {
1061 rtx_code_label * tmp_label = gen_label_rtx ();
1062 char label_buf[256];
1063 char buffer[128];
1064 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1065 CODE_LABEL_NUMBER (tmp_label));
1066 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1067 rtx dest_label = operands[pos_label];
1068 operands[pos_label] = tmp_label;
1069
1070 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1071 output_asm_insn (buffer, operands);
1072
1073 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1074 operands[pos_label] = dest_label;
1075 output_asm_insn (buffer, operands);
1076 return "";
1077 }
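/* Sketch of the output (register and label names are illustrative only):
   if the caller passes the inverted short-range branch "cbz\tx0, " as
   BRANCH_FORMAT, the sequence emitted above is roughly

	cbz	x0, .Lfb4	// short-range branch over the far branch
	b	.Ldest		// unconditional branch, +/- 128 MiB range
   .Lfb4:

   so the original branch can reach targets beyond its normal
   (at most +/- 1 MiB) range.  */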
1078
1079 void
1080 aarch64_err_no_fpadvsimd (machine_mode mode)
1081 {
1082 if (TARGET_GENERAL_REGS_ONLY)
1083 if (FLOAT_MODE_P (mode))
1084 error ("%qs is incompatible with the use of floating-point types",
1085 "-mgeneral-regs-only");
1086 else
1087 error ("%qs is incompatible with the use of vector types",
1088 "-mgeneral-regs-only");
1089 else
1090 if (FLOAT_MODE_P (mode))
1091 error ("%qs feature modifier is incompatible with the use of"
1092 " floating-point types", "+nofp");
1093 else
1094 error ("%qs feature modifier is incompatible with the use of"
1095 " vector types", "+nofp");
1096 }
1097
1098 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1099 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1100 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1101 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1102 and GENERAL_REGS is lower than the memory cost (in this case the best class
1103 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1104 cost results in bad allocations with many redundant int<->FP moves which
1105 are expensive on various cores.
1106 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1107 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1108 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1109 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1110 The result of this is that it is no longer inefficient to have a higher
1111 memory move cost than the register move cost.
1112 */
1113
1114 static reg_class_t
1115 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1116 reg_class_t best_class)
1117 {
1118 machine_mode mode;
1119
1120 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1121 || !reg_class_subset_p (FP_REGS, allocno_class))
1122 return allocno_class;
1123
1124 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1125 || !reg_class_subset_p (FP_REGS, best_class))
1126 return best_class;
1127
1128 mode = PSEUDO_REGNO_MODE (regno);
1129 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1130 }
1131
1132 static unsigned int
1133 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1134 {
1135 if (GET_MODE_UNIT_SIZE (mode) == 4)
1136 return aarch64_tune_params.min_div_recip_mul_sf;
1137 return aarch64_tune_params.min_div_recip_mul_df;
1138 }
1139
1140 /* Return the reassociation width of treeop OPC with mode MODE. */
1141 static int
1142 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1143 {
1144 if (VECTOR_MODE_P (mode))
1145 return aarch64_tune_params.vec_reassoc_width;
1146 if (INTEGRAL_MODE_P (mode))
1147 return aarch64_tune_params.int_reassoc_width;
1148 /* Avoid reassociating floating point addition so we emit more FMAs. */
1149 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1150 return aarch64_tune_params.fp_reassoc_width;
1151 return 1;
1152 }
1153
1154 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1155 unsigned
1156 aarch64_dbx_register_number (unsigned regno)
1157 {
1158 if (GP_REGNUM_P (regno))
1159 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1160 else if (regno == SP_REGNUM)
1161 return AARCH64_DWARF_SP;
1162 else if (FP_REGNUM_P (regno))
1163 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1164 else if (PR_REGNUM_P (regno))
1165 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1166 else if (regno == VG_REGNUM)
1167 return AARCH64_DWARF_VG;
1168
1169 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1170 equivalent DWARF register. */
1171 return DWARF_FRAME_REGISTERS;
1172 }
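/* For reference, with the AArch64 DWARF numbering this maps x0-x30 to
   0-30, sp to 31, v0-v31 to 64-95, the SVE predicate registers p0-p15
   to 48-63 and the vector-granule register VG to 46; the exact values
   are supplied by the AARCH64_DWARF_* constants.  */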
1173
1174 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1175 static bool
1176 aarch64_advsimd_struct_mode_p (machine_mode mode)
1177 {
1178 return (TARGET_SIMD
1179 && (mode == OImode || mode == CImode || mode == XImode));
1180 }
1181
1182 /* Return true if MODE is an SVE predicate mode. */
1183 static bool
1184 aarch64_sve_pred_mode_p (machine_mode mode)
1185 {
1186 return (TARGET_SVE
1187 && (mode == VNx16BImode
1188 || mode == VNx8BImode
1189 || mode == VNx4BImode
1190 || mode == VNx2BImode));
1191 }
1192
1193 /* Three mutually-exclusive flags describing a vector or predicate type. */
1194 const unsigned int VEC_ADVSIMD = 1;
1195 const unsigned int VEC_SVE_DATA = 2;
1196 const unsigned int VEC_SVE_PRED = 4;
1197 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1198 a structure of 2, 3 or 4 vectors. */
1199 const unsigned int VEC_STRUCT = 8;
1200 /* Useful combinations of the above. */
1201 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1202 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1203
1204 /* Return a set of flags describing the vector properties of mode MODE.
1205 Ignore modes that are not supported by the current target. */
1206 static unsigned int
1207 aarch64_classify_vector_mode (machine_mode mode)
1208 {
1209 if (aarch64_advsimd_struct_mode_p (mode))
1210 return VEC_ADVSIMD | VEC_STRUCT;
1211
1212 if (aarch64_sve_pred_mode_p (mode))
1213 return VEC_SVE_PRED;
1214
1215 scalar_mode inner = GET_MODE_INNER (mode);
1216 if (VECTOR_MODE_P (mode)
1217 && (inner == QImode
1218 || inner == HImode
1219 || inner == HFmode
1220 || inner == SImode
1221 || inner == SFmode
1222 || inner == DImode
1223 || inner == DFmode))
1224 {
1225 if (TARGET_SVE)
1226 {
1227 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1228 return VEC_SVE_DATA;
1229 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1230 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1231 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1232 return VEC_SVE_DATA | VEC_STRUCT;
1233 }
1234
1235 /* This includes V1DF but not V1DI (which doesn't exist). */
1236 if (TARGET_SIMD
1237 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1238 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1239 return VEC_ADVSIMD;
1240 }
1241
1242 return 0;
1243 }
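/* Rough examples (assuming TARGET_SIMD and a variable-length SVE target):
   V4SImode classifies as VEC_ADVSIMD, OImode (a structure of two 128-bit
   vectors) as VEC_ADVSIMD | VEC_STRUCT, VNx4SImode as VEC_SVE_DATA,
   VNx4BImode as VEC_SVE_PRED, and a scalar mode such as SImode as 0.  */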
1244
1245 /* Return true if MODE is any of the data vector modes, including
1246 structure modes. */
1247 static bool
1248 aarch64_vector_data_mode_p (machine_mode mode)
1249 {
1250 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1251 }
1252
1253 /* Return true if MODE is an SVE data vector mode; either a single vector
1254 or a structure of vectors. */
1255 static bool
1256 aarch64_sve_data_mode_p (machine_mode mode)
1257 {
1258 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1259 }
1260
1261 /* Implement target hook TARGET_ARRAY_MODE. */
1262 static opt_machine_mode
1263 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1264 {
1265 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1266 && IN_RANGE (nelems, 2, 4))
1267 return mode_for_vector (GET_MODE_INNER (mode),
1268 GET_MODE_NUNITS (mode) * nelems);
1269
1270 return opt_machine_mode ();
1271 }
1272
1273 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1274 static bool
1275 aarch64_array_mode_supported_p (machine_mode mode,
1276 unsigned HOST_WIDE_INT nelems)
1277 {
1278 if (TARGET_SIMD
1279 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1280 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1281 && (nelems >= 2 && nelems <= 4))
1282 return true;
1283
1284 return false;
1285 }
1286
1287 /* Return the SVE predicate mode to use for elements that have
1288 ELEM_NBYTES bytes, if such a mode exists. */
1289
1290 opt_machine_mode
1291 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1292 {
1293 if (TARGET_SVE)
1294 {
1295 if (elem_nbytes == 1)
1296 return VNx16BImode;
1297 if (elem_nbytes == 2)
1298 return VNx8BImode;
1299 if (elem_nbytes == 4)
1300 return VNx4BImode;
1301 if (elem_nbytes == 8)
1302 return VNx2BImode;
1303 }
1304 return opt_machine_mode ();
1305 }
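/* For example, predicates for 4-byte elements (SImode or SFmode data)
   use VNx4BImode, i.e. one significant predicate bit per 32-bit lane.  */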
1306
1307 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1308
1309 static opt_machine_mode
1310 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1311 {
1312 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1313 {
1314 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1315 machine_mode pred_mode;
1316 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1317 return pred_mode;
1318 }
1319
1320 return default_get_mask_mode (nunits, nbytes);
1321 }
1322
1323 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1324 prefer to use the first arithmetic operand as the else value if
1325 the else value doesn't matter, since that exactly matches the SVE
1326 destructive merging form. For ternary operations we could either
1327 pick the first operand and use FMAD-like instructions or the last
1328 operand and use FMLA-like instructions; the latter seems more
1329 natural. */
1330
1331 static tree
1332 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1333 {
1334 return nops == 3 ? ops[2] : ops[0];
1335 }
1336
1337 /* Implement TARGET_HARD_REGNO_NREGS. */
1338
1339 static unsigned int
1340 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1341 {
1342 /* ??? Logically we should only need to provide a value when
1343 HARD_REGNO_MODE_OK says that the combination is valid,
1344 but at the moment we need to handle all modes. Just ignore
1345 any runtime parts for registers that can't store them. */
1346 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1347 switch (aarch64_regno_regclass (regno))
1348 {
1349 case FP_REGS:
1350 case FP_LO_REGS:
1351 if (aarch64_sve_data_mode_p (mode))
1352 return exact_div (GET_MODE_SIZE (mode),
1353 BYTES_PER_SVE_VECTOR).to_constant ();
1354 return CEIL (lowest_size, UNITS_PER_VREG);
1355 case PR_REGS:
1356 case PR_LO_REGS:
1357 case PR_HI_REGS:
1358 return 1;
1359 default:
1360 return CEIL (lowest_size, UNITS_PER_WORD);
1361 }
1362 gcc_unreachable ();
1363 }
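/* Worked example (assuming the usual UNITS_PER_VREG == 16 and
   UNITS_PER_WORD == 8): a 16-byte V4SImode value needs
   CEIL (16, 16) == 1 FP/SIMD register but CEIL (16, 8) == 2 GP
   registers, while an SVE data mode always needs
   GET_MODE_SIZE / BYTES_PER_SVE_VECTOR registers (1 for a single
   vector, 2-4 for the structure modes).  */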
1364
1365 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1366
1367 static bool
1368 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1369 {
1370 if (GET_MODE_CLASS (mode) == MODE_CC)
1371 return regno == CC_REGNUM;
1372
1373 if (regno == VG_REGNUM)
1374 /* This must have the same size as _Unwind_Word. */
1375 return mode == DImode;
1376
1377 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1378 if (vec_flags & VEC_SVE_PRED)
1379 return PR_REGNUM_P (regno);
1380
1381 if (PR_REGNUM_P (regno))
1382 return 0;
1383
1384 if (regno == SP_REGNUM)
1385 /* The purpose of comparing with ptr_mode is to support the
1386 global register variable associated with the stack pointer
1387 register via the syntax of asm ("wsp") in ILP32. */
1388 return mode == Pmode || mode == ptr_mode;
1389
1390 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1391 return mode == Pmode;
1392
1393 if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
1394 return true;
1395
1396 if (FP_REGNUM_P (regno))
1397 {
1398 if (vec_flags & VEC_STRUCT)
1399 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1400 else
1401 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1402 }
1403
1404 return false;
1405 }
1406
1407 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1408 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1409 clobbers the top 64 bits when restoring the bottom 64 bits. */
1410
1411 static bool
1412 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1413 {
1414 return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
1415 }
1416
1417 /* Implement REGMODE_NATURAL_SIZE. */
1418 poly_uint64
1419 aarch64_regmode_natural_size (machine_mode mode)
1420 {
1421 /* The natural size for SVE data modes is one SVE data vector,
1422 and similarly for predicates. We can't independently modify
1423 anything smaller than that. */
1424 /* ??? For now, only do this for variable-width SVE registers.
1425 Doing it for constant-sized registers breaks lower-subreg.c. */
1426 /* ??? And once that's fixed, we should probably have similar
1427 code for Advanced SIMD. */
1428 if (!aarch64_sve_vg.is_constant ())
1429 {
1430 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1431 if (vec_flags & VEC_SVE_PRED)
1432 return BYTES_PER_SVE_PRED;
1433 if (vec_flags & VEC_SVE_DATA)
1434 return BYTES_PER_SVE_VECTOR;
1435 }
1436 return UNITS_PER_WORD;
1437 }
1438
1439 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1440 machine_mode
1441 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1442 machine_mode mode)
1443 {
1444 /* The predicate mode determines which bits are significant and
1445 which are "don't care". Decreasing the number of lanes would
1446 lose data while increasing the number of lanes would make bits
1447 unnecessarily significant. */
1448 if (PR_REGNUM_P (regno))
1449 return mode;
1450 if (known_ge (GET_MODE_SIZE (mode), 4))
1451 return mode;
1452 else
1453 return SImode;
1454 }
1455
1456 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1457 that strcpy from constants will be faster. */
1458
1459 static HOST_WIDE_INT
1460 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1461 {
1462 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1463 return MAX (align, BITS_PER_WORD);
1464 return align;
1465 }
1466
1467 /* Return true if calls to DECL should be treated as
1468 long-calls (i.e. called via a register). */
1469 static bool
1470 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1471 {
1472 return false;
1473 }
1474
1475 /* Return true if calls to symbol-ref SYM should be treated as
1476 long-calls (i.e. called via a register). */
1477 bool
1478 aarch64_is_long_call_p (rtx sym)
1479 {
1480 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1481 }
1482
1483 /* Return true if calls to symbol-ref SYM should not go through
1484 plt stubs. */
1485
1486 bool
1487 aarch64_is_noplt_call_p (rtx sym)
1488 {
1489 const_tree decl = SYMBOL_REF_DECL (sym);
1490
1491 if (flag_pic
1492 && decl
1493 && (!flag_plt
1494 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1495 && !targetm.binds_local_p (decl))
1496 return true;
1497
1498 return false;
1499 }
1500
1501 /* Return true if the offsets to a zero/sign-extract operation
1502 represent an expression that matches an extend operation. The
1503 operands represent the parameters from
1504
1505 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1506 bool
1507 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1508 rtx extract_imm)
1509 {
1510 HOST_WIDE_INT mult_val, extract_val;
1511
1512 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1513 return false;
1514
1515 mult_val = INTVAL (mult_imm);
1516 extract_val = INTVAL (extract_imm);
1517
1518 if (extract_val > 8
1519 && extract_val < GET_MODE_BITSIZE (mode)
1520 && exact_log2 (extract_val & ~7) > 0
1521 && (extract_val & 7) <= 4
1522 && mult_val == (1 << (extract_val & 7)))
1523 return true;
1524
1525 return false;
1526 }
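/* Worked example: extract_val == 34 and mult_val == 4 pass every test
   above (34 > 8, 34 & ~7 == 32 is a power of two, 34 & 7 == 2 <= 4 and
   4 == 1 << 2), corresponding to a 32-bit value extended and shifted
   left by two bits, as in an add/sub with an extended register operand.  */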
1527
1528 /* Emit an insn that's a simple single-set. Both the operands must be
1529 known to be valid. */
1530 inline static rtx_insn *
1531 emit_set_insn (rtx x, rtx y)
1532 {
1533 return emit_insn (gen_rtx_SET (x, y));
1534 }
1535
1536 /* X and Y are two things to compare using CODE. Emit the compare insn and
1537 return the rtx for register 0 in the proper mode. */
1538 rtx
1539 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1540 {
1541 machine_mode mode = SELECT_CC_MODE (code, x, y);
1542 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1543
1544 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1545 return cc_reg;
1546 }
1547
1548 /* Build the SYMBOL_REF for __tls_get_addr. */
1549
1550 static GTY(()) rtx tls_get_addr_libfunc;
1551
1552 rtx
1553 aarch64_tls_get_addr (void)
1554 {
1555 if (!tls_get_addr_libfunc)
1556 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1557 return tls_get_addr_libfunc;
1558 }
1559
1560 /* Return the TLS model to use for ADDR. */
1561
1562 static enum tls_model
1563 tls_symbolic_operand_type (rtx addr)
1564 {
1565 enum tls_model tls_kind = TLS_MODEL_NONE;
1566 if (GET_CODE (addr) == CONST)
1567 {
1568 poly_int64 addend;
1569 rtx sym = strip_offset (addr, &addend);
1570 if (GET_CODE (sym) == SYMBOL_REF)
1571 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1572 }
1573 else if (GET_CODE (addr) == SYMBOL_REF)
1574 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1575
1576 return tls_kind;
1577 }
1578
1579 /* We allow lo_sum's in our legitimate addresses so that combine
1580 can take care of combining addresses where necessary, but for
1581 generation purposes we generate the address
1582 as:
1583 RTL Absolute
1584 tmp = hi (symbol_ref); adrp x1, foo
1585 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1586 nop
1587
1588 PIC TLS
1589 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1590 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1591 bl __tls_get_addr
1592 nop
1593
1594 Load TLS symbol, depending on TLS mechanism and TLS access model.
1595
1596 Global Dynamic - Traditional TLS:
1597 adrp tmp, :tlsgd:imm
1598 add dest, tmp, #:tlsgd_lo12:imm
1599 bl __tls_get_addr
1600
1601 Global Dynamic - TLS Descriptors:
1602 adrp dest, :tlsdesc:imm
1603 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1604 add dest, dest, #:tlsdesc_lo12:imm
1605 blr tmp
1606 mrs tp, tpidr_el0
1607 add dest, dest, tp
1608
1609 Initial Exec:
1610 mrs tp, tpidr_el0
1611 adrp tmp, :gottprel:imm
1612 ldr dest, [tmp, #:gottprel_lo12:imm]
1613 add dest, dest, tp
1614
1615 Local Exec:
1616 mrs tp, tpidr_el0
1617 add t0, tp, #:tprel_hi12:imm, lsl #12
1618 add t0, t0, #:tprel_lo12_nc:imm
1619 */
1620
1621 static void
1622 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1623 enum aarch64_symbol_type type)
1624 {
1625 switch (type)
1626 {
1627 case SYMBOL_SMALL_ABSOLUTE:
1628 {
1629 /* In ILP32, the mode of dest can be either SImode or DImode. */
1630 rtx tmp_reg = dest;
1631 machine_mode mode = GET_MODE (dest);
1632
1633 gcc_assert (mode == Pmode || mode == ptr_mode);
1634
1635 if (can_create_pseudo_p ())
1636 tmp_reg = gen_reg_rtx (mode);
1637
1638 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1639 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1640 return;
1641 }
1642
1643 case SYMBOL_TINY_ABSOLUTE:
1644 emit_insn (gen_rtx_SET (dest, imm));
1645 return;
1646
1647 case SYMBOL_SMALL_GOT_28K:
1648 {
1649 machine_mode mode = GET_MODE (dest);
1650 rtx gp_rtx = pic_offset_table_rtx;
1651 rtx insn;
1652 rtx mem;
1653
1654 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1655 here before rtl expansion. Tree IVOPTS will generate rtl patterns to
1656 decide rtx costs, in which case pic_offset_table_rtx is not
1657 initialized. In that case there is no need to generate the first
1658 adrp instruction, as the final cost for global variable access is
1659 one instruction. */
1660 if (gp_rtx != NULL)
1661 {
1662 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1663 use the page base as the GOT base, the first page may be wasted;
1664 in the worst case there is only 28K of space for the GOT).
1665
1666 The generated instruction sequence for accessing a global variable
1667 is:
1668
1669 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1670
1671 Only one instruction is needed. But we must initialize
1672 pic_offset_table_rtx properly. We generate an initialization insn
1673 for every global access, and allow CSE to remove all redundant ones.
1674
1675 The final instruction sequence will look like the following
1676 for multiple global variable accesses.
1677
1678 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1679
1680 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1681 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1682 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1683 ... */
1684
1685 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1686 crtl->uses_pic_offset_table = 1;
1687 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1688
1689 if (mode != GET_MODE (gp_rtx))
1690 gp_rtx = gen_lowpart (mode, gp_rtx);
1691
1692 }
1693
1694 if (mode == ptr_mode)
1695 {
1696 if (mode == DImode)
1697 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1698 else
1699 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1700
1701 mem = XVECEXP (SET_SRC (insn), 0, 0);
1702 }
1703 else
1704 {
1705 gcc_assert (mode == Pmode);
1706
1707 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1708 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1709 }
1710
1711 /* The operand is expected to be a MEM. Whenever the related insn
1712 pattern changes, the above code which calculates mem should be
1713 updated. */
1714 gcc_assert (GET_CODE (mem) == MEM);
1715 MEM_READONLY_P (mem) = 1;
1716 MEM_NOTRAP_P (mem) = 1;
1717 emit_insn (insn);
1718 return;
1719 }
1720
1721 case SYMBOL_SMALL_GOT_4G:
1722 {
1723 /* In ILP32, the mode of dest can be either SImode or DImode,
1724 while the got entry is always of SImode size. The mode of
1725 dest depends on how dest is used: if dest is assigned to a
1726 pointer (e.g. in the memory), it has SImode; it may have
1727 DImode if dest is dereferenced to access the memory.
1728 This is why we have to handle three different ldr_got_small
1729 patterns here (two patterns for ILP32). */
1730
1731 rtx insn;
1732 rtx mem;
1733 rtx tmp_reg = dest;
1734 machine_mode mode = GET_MODE (dest);
1735
1736 if (can_create_pseudo_p ())
1737 tmp_reg = gen_reg_rtx (mode);
1738
1739 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1740 if (mode == ptr_mode)
1741 {
1742 if (mode == DImode)
1743 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1744 else
1745 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1746
1747 mem = XVECEXP (SET_SRC (insn), 0, 0);
1748 }
1749 else
1750 {
1751 gcc_assert (mode == Pmode);
1752
1753 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1754 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1755 }
1756
1757 gcc_assert (GET_CODE (mem) == MEM);
1758 MEM_READONLY_P (mem) = 1;
1759 MEM_NOTRAP_P (mem) = 1;
1760 emit_insn (insn);
1761 return;
1762 }
1763
1764 case SYMBOL_SMALL_TLSGD:
1765 {
1766 rtx_insn *insns;
1767 machine_mode mode = GET_MODE (dest);
1768 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1769
1770 start_sequence ();
1771 if (TARGET_ILP32)
1772 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1773 else
1774 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1775 insns = get_insns ();
1776 end_sequence ();
1777
1778 RTL_CONST_CALL_P (insns) = 1;
1779 emit_libcall_block (insns, dest, result, imm);
1780 return;
1781 }
1782
1783 case SYMBOL_SMALL_TLSDESC:
1784 {
1785 machine_mode mode = GET_MODE (dest);
1786 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1787 rtx tp;
1788
1789 gcc_assert (mode == Pmode || mode == ptr_mode);
1790
1791 /* In ILP32, the got entry is always of SImode size. Unlike
1792 small GOT, the dest is fixed at reg 0. */
1793 if (TARGET_ILP32)
1794 emit_insn (gen_tlsdesc_small_si (imm));
1795 else
1796 emit_insn (gen_tlsdesc_small_di (imm));
1797 tp = aarch64_load_tp (NULL);
1798
1799 if (mode != Pmode)
1800 tp = gen_lowpart (mode, tp);
1801
1802 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1803 if (REG_P (dest))
1804 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1805 return;
1806 }
1807
1808 case SYMBOL_SMALL_TLSIE:
1809 {
1810 /* In ILP32, the mode of dest can be either SImode or DImode,
1811 while the got entry is always of SImode size. The mode of
1812 dest depends on how dest is used: if dest is assigned to a
1813 pointer (e.g. in the memory), it has SImode; it may have
1814 DImode if dest is dereferenced to access the memory.
1815 This is why we have to handle three different tlsie_small
1816 patterns here (two patterns for ILP32). */
1817 machine_mode mode = GET_MODE (dest);
1818 rtx tmp_reg = gen_reg_rtx (mode);
1819 rtx tp = aarch64_load_tp (NULL);
1820
1821 if (mode == ptr_mode)
1822 {
1823 if (mode == DImode)
1824 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1825 else
1826 {
1827 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1828 tp = gen_lowpart (mode, tp);
1829 }
1830 }
1831 else
1832 {
1833 gcc_assert (mode == Pmode);
1834 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1835 }
1836
1837 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1838 if (REG_P (dest))
1839 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1840 return;
1841 }
1842
1843 case SYMBOL_TLSLE12:
1844 case SYMBOL_TLSLE24:
1845 case SYMBOL_TLSLE32:
1846 case SYMBOL_TLSLE48:
1847 {
1848 machine_mode mode = GET_MODE (dest);
1849 rtx tp = aarch64_load_tp (NULL);
1850
1851 if (mode != Pmode)
1852 tp = gen_lowpart (mode, tp);
1853
1854 switch (type)
1855 {
1856 case SYMBOL_TLSLE12:
1857 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1858 (dest, tp, imm));
1859 break;
1860 case SYMBOL_TLSLE24:
1861 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1862 (dest, tp, imm));
1863 break;
1864 case SYMBOL_TLSLE32:
1865 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1866 (dest, imm));
1867 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1868 (dest, dest, tp));
1869 break;
1870 case SYMBOL_TLSLE48:
1871 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1872 (dest, imm));
1873 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1874 (dest, dest, tp));
1875 break;
1876 default:
1877 gcc_unreachable ();
1878 }
1879
1880 if (REG_P (dest))
1881 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1882 return;
1883 }
1884
1885 case SYMBOL_TINY_GOT:
1886 emit_insn (gen_ldr_got_tiny (dest, imm));
1887 return;
1888
1889 case SYMBOL_TINY_TLSIE:
1890 {
1891 machine_mode mode = GET_MODE (dest);
1892 rtx tp = aarch64_load_tp (NULL);
1893
1894 if (mode == ptr_mode)
1895 {
1896 if (mode == DImode)
1897 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1898 else
1899 {
1900 tp = gen_lowpart (mode, tp);
1901 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1902 }
1903 }
1904 else
1905 {
1906 gcc_assert (mode == Pmode);
1907 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1908 }
1909
1910 if (REG_P (dest))
1911 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1912 return;
1913 }
1914
1915 default:
1916 gcc_unreachable ();
1917 }
1918 }
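
/* For reference (an illustrative sketch; the exact relocation syntax is
   defined by the corresponding patterns in aarch64.md): the
   SYMBOL_SMALL_GOT_4G case above ultimately expands to a sequence of
   the form

     adrp x0, :got:sym
     ldr  x0, [x0, #:got_lo12:sym]

   i.e. the page address of the GOT slot followed by a load of the slot
   itself, which the code marks as read-only and non-trapping.  */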
1919
1920 /* Emit a move from SRC to DEST. Assume that the move expanders can
1921 handle all moves if !can_create_pseudo_p (). The distinction is
1922 important because, unlike emit_move_insn, the move expanders know
1923 how to force Pmode objects into the constant pool even when the
1924 constant pool address is not itself legitimate. */
1925 static rtx
1926 aarch64_emit_move (rtx dest, rtx src)
1927 {
1928 return (can_create_pseudo_p ()
1929 ? emit_move_insn (dest, src)
1930 : emit_move_insn_1 (dest, src));
1931 }
1932
1933 /* Apply UNOPTAB to OP and store the result in DEST. */
1934
1935 static void
1936 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
1937 {
1938 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
1939 if (dest != tmp)
1940 emit_move_insn (dest, tmp);
1941 }
1942
1943 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
1944
1945 static void
1946 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
1947 {
1948 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
1949 OPTAB_DIRECT);
1950 if (dest != tmp)
1951 emit_move_insn (dest, tmp);
1952 }
1953
1954 /* Split a 128-bit move operation into two 64-bit move operations,
1955 taking care to handle partial overlap of register to register
1956 copies. Special cases are needed when moving between GP regs and
1957 FP regs. SRC can be a register, constant or memory; DST a register
1958 or memory. If either operand is memory it must not have any side
1959 effects. */
1960 void
1961 aarch64_split_128bit_move (rtx dst, rtx src)
1962 {
1963 rtx dst_lo, dst_hi;
1964 rtx src_lo, src_hi;
1965
1966 machine_mode mode = GET_MODE (dst);
1967
1968 gcc_assert (mode == TImode || mode == TFmode);
1969 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1970 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1971
1972 if (REG_P (dst) && REG_P (src))
1973 {
1974 int src_regno = REGNO (src);
1975 int dst_regno = REGNO (dst);
1976
1977 /* Handle FP <-> GP regs. */
1978 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1979 {
1980 src_lo = gen_lowpart (word_mode, src);
1981 src_hi = gen_highpart (word_mode, src);
1982
1983 if (mode == TImode)
1984 {
1985 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1986 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1987 }
1988 else
1989 {
1990 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1991 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1992 }
1993 return;
1994 }
1995 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1996 {
1997 dst_lo = gen_lowpart (word_mode, dst);
1998 dst_hi = gen_highpart (word_mode, dst);
1999
2000 if (mode == TImode)
2001 {
2002 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
2003 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
2004 }
2005 else
2006 {
2007 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
2008 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
2009 }
2010 return;
2011 }
2012 }
2013
2014 dst_lo = gen_lowpart (word_mode, dst);
2015 dst_hi = gen_highpart (word_mode, dst);
2016 src_lo = gen_lowpart (word_mode, src);
2017 src_hi = gen_highpart_mode (word_mode, mode, src);
2018
2019 /* At most one pairing may overlap. */
2020 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2021 {
2022 aarch64_emit_move (dst_hi, src_hi);
2023 aarch64_emit_move (dst_lo, src_lo);
2024 }
2025 else
2026 {
2027 aarch64_emit_move (dst_lo, src_lo);
2028 aarch64_emit_move (dst_hi, src_hi);
2029 }
2030 }
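
/* As a rough illustration of the overlap handling above (assuming a
   little-endian target, with the low half of the 128-bit value in the
   lower-numbered register): copying x1:x2 into x2:x3 must move the
   high halves first,

     mov x3, x2
     mov x2, x1

   because the low destination register x2 overlaps the high source
   register x2; a non-overlapping copy is emitted low half first.  */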
2031
2032 bool
2033 aarch64_split_128bit_move_p (rtx dst, rtx src)
2034 {
2035 return (! REG_P (src)
2036 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2037 }
2038
2039 /* Split a complex SIMD combine. */
2040
2041 void
2042 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2043 {
2044 machine_mode src_mode = GET_MODE (src1);
2045 machine_mode dst_mode = GET_MODE (dst);
2046
2047 gcc_assert (VECTOR_MODE_P (dst_mode));
2048 gcc_assert (register_operand (dst, dst_mode)
2049 && register_operand (src1, src_mode)
2050 && register_operand (src2, src_mode));
2051
2052 rtx (*gen) (rtx, rtx, rtx);
2053
2054 switch (src_mode)
2055 {
2056 case E_V8QImode:
2057 gen = gen_aarch64_simd_combinev8qi;
2058 break;
2059 case E_V4HImode:
2060 gen = gen_aarch64_simd_combinev4hi;
2061 break;
2062 case E_V2SImode:
2063 gen = gen_aarch64_simd_combinev2si;
2064 break;
2065 case E_V4HFmode:
2066 gen = gen_aarch64_simd_combinev4hf;
2067 break;
2068 case E_V2SFmode:
2069 gen = gen_aarch64_simd_combinev2sf;
2070 break;
2071 case E_DImode:
2072 gen = gen_aarch64_simd_combinedi;
2073 break;
2074 case E_DFmode:
2075 gen = gen_aarch64_simd_combinedf;
2076 break;
2077 default:
2078 gcc_unreachable ();
2079 }
2080
2081 emit_insn (gen (dst, src1, src2));
2082 return;
2083 }
2084
2085 /* Split a complex SIMD move. */
2086
2087 void
2088 aarch64_split_simd_move (rtx dst, rtx src)
2089 {
2090 machine_mode src_mode = GET_MODE (src);
2091 machine_mode dst_mode = GET_MODE (dst);
2092
2093 gcc_assert (VECTOR_MODE_P (dst_mode));
2094
2095 if (REG_P (dst) && REG_P (src))
2096 {
2097 rtx (*gen) (rtx, rtx);
2098
2099 gcc_assert (VECTOR_MODE_P (src_mode));
2100
2101 switch (src_mode)
2102 {
2103 case E_V16QImode:
2104 gen = gen_aarch64_split_simd_movv16qi;
2105 break;
2106 case E_V8HImode:
2107 gen = gen_aarch64_split_simd_movv8hi;
2108 break;
2109 case E_V4SImode:
2110 gen = gen_aarch64_split_simd_movv4si;
2111 break;
2112 case E_V2DImode:
2113 gen = gen_aarch64_split_simd_movv2di;
2114 break;
2115 case E_V8HFmode:
2116 gen = gen_aarch64_split_simd_movv8hf;
2117 break;
2118 case E_V4SFmode:
2119 gen = gen_aarch64_split_simd_movv4sf;
2120 break;
2121 case E_V2DFmode:
2122 gen = gen_aarch64_split_simd_movv2df;
2123 break;
2124 default:
2125 gcc_unreachable ();
2126 }
2127
2128 emit_insn (gen (dst, src));
2129 return;
2130 }
2131 }
2132
2133 bool
2134 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2135 machine_mode ymode, rtx y)
2136 {
2137 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2138 gcc_assert (r != NULL);
2139 return rtx_equal_p (x, r);
2140 }
2141
2142
2143 static rtx
2144 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2145 {
2146 if (can_create_pseudo_p ())
2147 return force_reg (mode, value);
2148 else
2149 {
2150 gcc_assert (x);
2151 aarch64_emit_move (x, value);
2152 return x;
2153 }
2154 }
2155
2156 /* Return true if we can move VALUE into a register using a single
2157 CNT[BHWD] instruction. */
2158
2159 static bool
2160 aarch64_sve_cnt_immediate_p (poly_int64 value)
2161 {
2162 HOST_WIDE_INT factor = value.coeffs[0];
2163 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2164 return (value.coeffs[1] == factor
2165 && IN_RANGE (factor, 2, 16 * 16)
2166 && (factor & 1) == 0
2167 && factor <= 16 * (factor & -factor));
2168 }
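
/* For example (an illustrative sketch of the condition above): the
   values 2 + 2x, 4 + 4x, 8 + 8x and 16 + 16x, where x is the number of
   128-bit blocks beyond the first, correspond to plain CNTD, CNTW,
   CNTH and CNTB; 6 + 6x can be emitted as "cntd ..., all, mul #3" and
   32 + 32x as "cntb ..., all, mul #2".  A value such as 34 + 34x is
   rejected, since it would need a multiplier of 17 on CNTD.  */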
2169
2170 /* Likewise for rtx X. */
2171
2172 bool
2173 aarch64_sve_cnt_immediate_p (rtx x)
2174 {
2175 poly_int64 value;
2176 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2177 }
2178
2179 /* Return the asm string for an instruction with a CNT-like vector size
2180 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2181 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2182 first part of the operands template (the part that comes before the
2183 vector size itself). FACTOR is the number of quadwords.
2184 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2185 If it is zero, we can use any element size. */
2186
2187 static char *
2188 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2189 unsigned int factor,
2190 unsigned int nelts_per_vq)
2191 {
2192 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2193
2194 if (nelts_per_vq == 0)
2195 /* There is some overlap in the ranges of the four CNT instructions.
2196 Here we always use the smallest possible element size, so that the
2197 multiplier is 1 wherever possible.  */
2198 nelts_per_vq = factor & -factor;
2199 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2200 gcc_assert (IN_RANGE (shift, 1, 4));
2201 char suffix = "dwhb"[shift - 1];
2202
2203 factor >>= shift;
2204 unsigned int written;
2205 if (factor == 1)
2206 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2207 prefix, suffix, operands);
2208 else
2209 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2210 prefix, suffix, operands, factor);
2211 gcc_assert (written < sizeof (buffer));
2212 return buffer;
2213 }
2214
2215 /* Return the asm string for an instruction with a CNT-like vector size
2216 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2217 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2218 first part of the operands template (the part that comes before the
2219 vector size itself). X is the value of the vector size operand,
2220 as a polynomial integer rtx. */
2221
2222 char *
2223 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2224 rtx x)
2225 {
2226 poly_int64 value = rtx_to_poly_int64 (x);
2227 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2228 return aarch64_output_sve_cnt_immediate (prefix, operands,
2229 value.coeffs[1], 0);
2230 }
2231
2232 /* Return true if we can add VALUE to a register using a single ADDVL
2233 or ADDPL instruction. */
2234
2235 static bool
2236 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2237 {
2238 HOST_WIDE_INT factor = value.coeffs[0];
2239 if (factor == 0 || value.coeffs[1] != factor)
2240 return false;
2241 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2242 and a value of 16 is one vector width. */
2243 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2244 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2245 }
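
/* As an illustration of the ranges above: an offset of 16 + 16x bytes
   is one vector length and maps to "addvl ..., #1", an offset of
   2 + 2x bytes is one predicate length and maps to "addpl ..., #1",
   and 32 + 32x maps to "addvl ..., #2".  Offsets outside [-32, 31]
   vector lengths (or [-32, 31] predicate lengths for ADDPL) are
   rejected here and handled by the more general code elsewhere.  */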
2246
2247 /* Likewise for rtx X. */
2248
2249 bool
2250 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2251 {
2252 poly_int64 value;
2253 return (poly_int_rtx_p (x, &value)
2254 && aarch64_sve_addvl_addpl_immediate_p (value));
2255 }
2256
2257 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2258 and storing the result in operand 0. */
2259
2260 char *
2261 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2262 {
2263 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2264 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2265 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2266
2267 /* Use INC or DEC if possible. */
2268 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2269 {
2270 if (aarch64_sve_cnt_immediate_p (offset_value))
2271 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2272 offset_value.coeffs[1], 0);
2273 if (aarch64_sve_cnt_immediate_p (-offset_value))
2274 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2275 -offset_value.coeffs[1], 0);
2276 }
2277
2278 int factor = offset_value.coeffs[1];
2279 if ((factor & 15) == 0)
2280 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2281 else
2282 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2283 return buffer;
2284 }
2285
2286 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2287 instruction. If it is, store the number of elements in each vector
2288 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2289 factor in *FACTOR_OUT (if nonnull). */
2290
2291 bool
2292 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2293 unsigned int *nelts_per_vq_out)
2294 {
2295 rtx elt;
2296 poly_int64 value;
2297
2298 if (!const_vec_duplicate_p (x, &elt)
2299 || !poly_int_rtx_p (elt, &value))
2300 return false;
2301
2302 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2303 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2304 /* There's no vector INCB. */
2305 return false;
2306
2307 HOST_WIDE_INT factor = value.coeffs[0];
2308 if (value.coeffs[1] != factor)
2309 return false;
2310
2311 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2312 if ((factor % nelts_per_vq) != 0
2313 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2314 return false;
2315
2316 if (factor_out)
2317 *factor_out = factor;
2318 if (nelts_per_vq_out)
2319 *nelts_per_vq_out = nelts_per_vq;
2320 return true;
2321 }
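
/* For example (an illustrative sketch of the tests above): a VNx4SI
   constant in which every element is 4 + 4x (the number of 32-bit
   elements in the vector) passes the checks with FACTOR 4 and
   NELTS_PER_VQ 4 and can be emitted as "incw"; the negated constant
   corresponds to "decw".  A duplicated value of 3 + 3x is rejected
   because it is not a multiple of the element count per quadword.  */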
2322
2323 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2324 instruction. */
2325
2326 bool
2327 aarch64_sve_inc_dec_immediate_p (rtx x)
2328 {
2329 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2330 }
2331
2332 /* Return the asm template for an SVE vector INC or DEC instruction.
2333 OPERANDS gives the operands before the vector count and X is the
2334 value of the vector count operand itself. */
2335
2336 char *
2337 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2338 {
2339 int factor;
2340 unsigned int nelts_per_vq;
2341 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2342 gcc_unreachable ();
2343 if (factor < 0)
2344 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2345 nelts_per_vq);
2346 else
2347 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2348 nelts_per_vq);
2349 }
2350
2351 static int
2352 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2353 scalar_int_mode mode)
2354 {
2355 int i;
2356 unsigned HOST_WIDE_INT val, val2, mask;
2357 int one_match, zero_match;
2358 int num_insns;
2359
2360 val = INTVAL (imm);
2361
2362 if (aarch64_move_imm (val, mode))
2363 {
2364 if (generate)
2365 emit_insn (gen_rtx_SET (dest, imm));
2366 return 1;
2367 }
2368
2369 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2370 (with XXXX non-zero). In that case check to see if the move can be done in
2371 a smaller mode. */
2372 val2 = val & 0xffffffff;
2373 if (mode == DImode
2374 && aarch64_move_imm (val2, SImode)
2375 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2376 {
2377 if (generate)
2378 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2379
2380 /* Check if we have to emit a second instruction by checking to see
2381 if any of the upper 32 bits of the original DI mode value is set. */
2382 if (val == val2)
2383 return 1;
2384
2385 i = (val >> 48) ? 48 : 32;
2386
2387 if (generate)
2388 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2389 GEN_INT ((val >> i) & 0xffff)));
2390
2391 return 2;
2392 }
2393
2394 if ((val >> 32) == 0 || mode == SImode)
2395 {
2396 if (generate)
2397 {
2398 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2399 if (mode == SImode)
2400 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2401 GEN_INT ((val >> 16) & 0xffff)));
2402 else
2403 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2404 GEN_INT ((val >> 16) & 0xffff)));
2405 }
2406 return 2;
2407 }
2408
2409 /* Remaining cases are all for DImode. */
2410
2411 mask = 0xffff;
2412 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2413 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2414 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2415 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2416
2417 if (zero_match != 2 && one_match != 2)
2418 {
2419 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2420 For a 64-bit bitmask try whether changing 16 bits to all ones or
2421 zeroes creates a valid bitmask. To check any repeated bitmask,
2422 try using 16 bits from the other 32-bit half of val. */
2423
2424 for (i = 0; i < 64; i += 16, mask <<= 16)
2425 {
2426 val2 = val & ~mask;
2427 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2428 break;
2429 val2 = val | mask;
2430 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2431 break;
2432 val2 = val2 & ~mask;
2433 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2434 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2435 break;
2436 }
2437 if (i != 64)
2438 {
2439 if (generate)
2440 {
2441 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2442 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2443 GEN_INT ((val >> i) & 0xffff)));
2444 }
2445 return 2;
2446 }
2447 }
2448
2449 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2450 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2451 otherwise skip zero bits. */
2452
2453 num_insns = 1;
2454 mask = 0xffff;
2455 val2 = one_match > zero_match ? ~val : val;
2456 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2457
2458 if (generate)
2459 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2460 ? (val | ~(mask << i))
2461 : (val & (mask << i)))));
2462 for (i += 16; i < 64; i += 16)
2463 {
2464 if ((val2 & (mask << i)) == 0)
2465 continue;
2466 if (generate)
2467 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2468 GEN_INT ((val >> i) & 0xffff)));
2469 num_insns ++;
2470 }
2471
2472 return num_insns;
2473 }
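
/* As a rough illustration of the sequences chosen above (the exact
   output depends on which of the earlier special cases match): a value
   such as 0x12345678 typically becomes

     mov  x0, 0x5678
     movk x0, 0x1234, lsl 16

   while a value with four unrelated non-zero, non-0xffff halfwords
   that is not close to a bitmask immediate needs the full
   mov + three-movk sequence, the four-instruction worst case.  */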
2474
2475 /* Return whether imm is a 128-bit immediate which is simple enough to
2476 expand inline. */
2477 bool
2478 aarch64_mov128_immediate (rtx imm)
2479 {
2480 if (GET_CODE (imm) == CONST_INT)
2481 return true;
2482
2483 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2484
2485 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2486 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2487
2488 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2489 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2490 }
2491
2492
2493 /* Return the number of temporary registers that aarch64_add_offset_1
2494 would need to add OFFSET to a register. */
2495
2496 static unsigned int
2497 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2498 {
2499 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2500 }
2501
2502 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2503 a non-polynomial OFFSET. MODE is the mode of the addition.
2504 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2505 be set and CFA adjustments added to the generated instructions.
2506
2507 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2508 temporary if register allocation is already complete. This temporary
2509 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2510 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2511 the immediate again.
2512
2513 Since this function may be used to adjust the stack pointer, we must
2514 ensure that it cannot cause transient stack deallocation (for example
2515 by first incrementing SP and then decrementing when adjusting by a
2516 large immediate). */
2517
2518 static void
2519 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2520 rtx src, HOST_WIDE_INT offset, rtx temp1,
2521 bool frame_related_p, bool emit_move_imm)
2522 {
2523 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2524 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2525
2526 HOST_WIDE_INT moffset = abs_hwi (offset);
2527 rtx_insn *insn;
2528
2529 if (!moffset)
2530 {
2531 if (!rtx_equal_p (dest, src))
2532 {
2533 insn = emit_insn (gen_rtx_SET (dest, src));
2534 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2535 }
2536 return;
2537 }
2538
2539 /* Single instruction adjustment. */
2540 if (aarch64_uimm12_shift (moffset))
2541 {
2542 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2543 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2544 return;
2545 }
2546
2547 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2548 and either:
2549
2550 a) the offset cannot be loaded by a 16-bit move or
2551 b) there is no spare register into which we can move it. */
2552 if (moffset < 0x1000000
2553 && ((!temp1 && !can_create_pseudo_p ())
2554 || !aarch64_move_imm (moffset, mode)))
2555 {
2556 HOST_WIDE_INT low_off = moffset & 0xfff;
2557
2558 low_off = offset < 0 ? -low_off : low_off;
2559 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2560 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2561 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2562 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2563 return;
2564 }
2565
2566 /* Emit a move immediate if required and an addition/subtraction. */
2567 if (emit_move_imm)
2568 {
2569 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2570 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2571 }
2572 insn = emit_insn (offset < 0
2573 ? gen_sub3_insn (dest, src, temp1)
2574 : gen_add3_insn (dest, src, temp1));
2575 if (frame_related_p)
2576 {
2577 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2578 rtx adj = plus_constant (mode, src, offset);
2579 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2580 }
2581 }
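
/* For example (an illustrative sketch): adding 0x123456 to SP is split
   by the code above into two immediate additions,

     add sp, sp, #0x456
     add sp, sp, #0x123000

   since both parts fit the 12-bit (optionally shifted by 12) ADD
   immediate range, whereas a larger or less regular offset falls back
   to a move-immediate into a temporary followed by a register ADD or
   SUB.  */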
2582
2583 /* Return the number of temporary registers that aarch64_add_offset
2584 would need to move OFFSET into a register or add OFFSET to a register;
2585 ADD_P is true if we want the latter rather than the former. */
2586
2587 static unsigned int
2588 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2589 {
2590 /* This follows the same structure as aarch64_add_offset. */
2591 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2592 return 0;
2593
2594 unsigned int count = 0;
2595 HOST_WIDE_INT factor = offset.coeffs[1];
2596 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2597 poly_int64 poly_offset (factor, factor);
2598 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2599 /* Need one register for the ADDVL/ADDPL result. */
2600 count += 1;
2601 else if (factor != 0)
2602 {
2603 factor = abs (factor);
2604 if (factor > 16 * (factor & -factor))
2605 /* Need one register for the CNT result and one for the multiplication
2606 factor. If necessary, the second temporary can be reused for the
2607 constant part of the offset. */
2608 return 2;
2609 /* Need one register for the CNT result (which might then
2610 be shifted). */
2611 count += 1;
2612 }
2613 return count + aarch64_add_offset_1_temporaries (constant);
2614 }
2615
2616 /* If X can be represented as a poly_int64, return the number
2617 of temporaries that are required to add it to a register.
2618 Return -1 otherwise. */
2619
2620 int
2621 aarch64_add_offset_temporaries (rtx x)
2622 {
2623 poly_int64 offset;
2624 if (!poly_int_rtx_p (x, &offset))
2625 return -1;
2626 return aarch64_offset_temporaries (true, offset);
2627 }
2628
2629 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2630 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2631 be set and CFA adjustments added to the generated instructions.
2632
2633 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2634 temporary if register allocation is already complete. This temporary
2635 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2636 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2637 false to avoid emitting the immediate again.
2638
2639 TEMP2, if nonnull, is a second temporary register that doesn't
2640 overlap either DEST or SRC.
2641
2642 Since this function may be used to adjust the stack pointer, we must
2643 ensure that it cannot cause transient stack deallocation (for example
2644 by first incrementing SP and then decrementing when adjusting by a
2645 large immediate). */
2646
2647 static void
2648 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2649 poly_int64 offset, rtx temp1, rtx temp2,
2650 bool frame_related_p, bool emit_move_imm = true)
2651 {
2652 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2653 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2654 gcc_assert (temp1 == NULL_RTX
2655 || !frame_related_p
2656 || !reg_overlap_mentioned_p (temp1, dest));
2657 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2658
2659 /* Try using ADDVL or ADDPL to add the whole value. */
2660 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2661 {
2662 rtx offset_rtx = gen_int_mode (offset, mode);
2663 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2664 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2665 return;
2666 }
2667
2668 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2669 SVE vector register, over and above the minimum size of 128 bits.
2670 This is equivalent to half the value returned by CNTD with a
2671 vector shape of ALL. */
2672 HOST_WIDE_INT factor = offset.coeffs[1];
2673 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2674
2675 /* Try using ADDVL or ADDPL to add the VG-based part. */
2676 poly_int64 poly_offset (factor, factor);
2677 if (src != const0_rtx
2678 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2679 {
2680 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2681 if (frame_related_p)
2682 {
2683 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2684 RTX_FRAME_RELATED_P (insn) = true;
2685 src = dest;
2686 }
2687 else
2688 {
2689 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2690 src = aarch64_force_temporary (mode, temp1, addr);
2691 temp1 = temp2;
2692 temp2 = NULL_RTX;
2693 }
2694 }
2695 /* Otherwise use a CNT-based sequence. */
2696 else if (factor != 0)
2697 {
2698 /* Use a subtraction if we have a negative factor. */
2699 rtx_code code = PLUS;
2700 if (factor < 0)
2701 {
2702 factor = -factor;
2703 code = MINUS;
2704 }
2705
2706 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2707 into the multiplication. */
2708 rtx val;
2709 int shift = 0;
2710 if (factor & 1)
2711 /* Use a right shift by 1. */
2712 shift = -1;
2713 else
2714 factor /= 2;
2715 HOST_WIDE_INT low_bit = factor & -factor;
2716 if (factor <= 16 * low_bit)
2717 {
2718 if (factor > 16 * 8)
2719 {
2720 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2721 the value with the minimum multiplier and shift it into
2722 position. */
2723 int extra_shift = exact_log2 (low_bit);
2724 shift += extra_shift;
2725 factor >>= extra_shift;
2726 }
2727 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2728 }
2729 else
2730 {
2731 /* Use CNTD, then multiply it by FACTOR. */
2732 val = gen_int_mode (poly_int64 (2, 2), mode);
2733 val = aarch64_force_temporary (mode, temp1, val);
2734
2735 /* Go back to using a negative multiplication factor if we have
2736 no register from which to subtract. */
2737 if (code == MINUS && src == const0_rtx)
2738 {
2739 factor = -factor;
2740 code = PLUS;
2741 }
2742 rtx coeff1 = gen_int_mode (factor, mode);
2743 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2744 val = gen_rtx_MULT (mode, val, coeff1);
2745 }
2746
2747 if (shift > 0)
2748 {
2749 /* Multiply by 1 << SHIFT. */
2750 val = aarch64_force_temporary (mode, temp1, val);
2751 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2752 }
2753 else if (shift == -1)
2754 {
2755 /* Divide by 2. */
2756 val = aarch64_force_temporary (mode, temp1, val);
2757 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2758 }
2759
2760 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2761 if (src != const0_rtx)
2762 {
2763 val = aarch64_force_temporary (mode, temp1, val);
2764 val = gen_rtx_fmt_ee (code, mode, src, val);
2765 }
2766 else if (code == MINUS)
2767 {
2768 val = aarch64_force_temporary (mode, temp1, val);
2769 val = gen_rtx_NEG (mode, val);
2770 }
2771
2772 if (constant == 0 || frame_related_p)
2773 {
2774 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2775 if (frame_related_p)
2776 {
2777 RTX_FRAME_RELATED_P (insn) = true;
2778 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2779 gen_rtx_SET (dest, plus_constant (Pmode, src,
2780 poly_offset)));
2781 }
2782 src = dest;
2783 if (constant == 0)
2784 return;
2785 }
2786 else
2787 {
2788 src = aarch64_force_temporary (mode, temp1, val);
2789 temp1 = temp2;
2790 temp2 = NULL_RTX;
2791 }
2792
2793 emit_move_imm = true;
2794 }
2795
2796 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
2797 frame_related_p, emit_move_imm);
2798 }
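
/* As a rough illustration of the decomposition above (register choices
   and exact instructions are hypothetical): adding the poly_int64
   offset (48, 32) -- that is, 16 bytes plus two vector lengths -- to a
   register could be emitted as

     addvl x0, x1, #2
     add   x0, x0, #16

   using the ADDVL path for the VG-based part and aarch64_add_offset_1
   for the remaining constant, while a VG multiple that is out of
   ADDVL/ADDPL range instead goes through the CNT-based sequence using
   a temporary register.  */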
2799
2800 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2801 than a poly_int64. */
2802
2803 void
2804 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2805 rtx offset_rtx, rtx temp1, rtx temp2)
2806 {
2807 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
2808 temp1, temp2, false);
2809 }
2810
2811 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2812 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
2813 if TEMP1 already contains abs (DELTA). */
2814
2815 static inline void
2816 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
2817 {
2818 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
2819 temp1, temp2, true, emit_move_imm);
2820 }
2821
2822 /* Subtract DELTA from the stack pointer, marking the instructions
2823 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
2824 if nonnull. */
2825
2826 static inline void
2827 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
2828 {
2829 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
2830 temp1, temp2, frame_related_p);
2831 }
2832
2833 /* Set DEST to (vec_series BASE STEP). */
2834
2835 static void
2836 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
2837 {
2838 machine_mode mode = GET_MODE (dest);
2839 scalar_mode inner = GET_MODE_INNER (mode);
2840
2841 /* Each operand can be a register or an immediate in the range [-16, 15]. */
2842 if (!aarch64_sve_index_immediate_p (base))
2843 base = force_reg (inner, base);
2844 if (!aarch64_sve_index_immediate_p (step))
2845 step = force_reg (inner, step);
2846
2847 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
2848 }
2849
2850 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2851 integer of mode SRC_MODE.  Return true on success.  */
2852
2853 static bool
2854 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
2855 rtx src)
2856 {
2857 /* If the constant is smaller than 128 bits, we can do the move
2858 using a vector of SRC_MODEs. */
2859 if (src_mode != TImode)
2860 {
2861 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
2862 GET_MODE_SIZE (src_mode));
2863 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
2864 emit_move_insn (gen_lowpart (dup_mode, dest),
2865 gen_const_vec_duplicate (dup_mode, src));
2866 return true;
2867 }
2868
2869 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
2870 src = force_const_mem (src_mode, src);
2871 if (!src)
2872 return false;
2873
2874 /* Make sure that the address is legitimate. */
2875 if (!aarch64_sve_ld1r_operand_p (src))
2876 {
2877 rtx addr = force_reg (Pmode, XEXP (src, 0));
2878 src = replace_equiv_address (src, addr);
2879 }
2880
2881 machine_mode mode = GET_MODE (dest);
2882 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
2883 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
2884 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
2885 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
2886 emit_insn (gen_rtx_SET (dest, src));
2887 return true;
2888 }
2889
2890 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2891 isn't a simple duplicate or series. */
2892
2893 static void
2894 aarch64_expand_sve_const_vector (rtx dest, rtx src)
2895 {
2896 machine_mode mode = GET_MODE (src);
2897 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
2898 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
2899 gcc_assert (npatterns > 1);
2900
2901 if (nelts_per_pattern == 1)
2902 {
2903 /* The constant is a repeating sequence of at least two elements,
2904 where the repeating elements occupy no more than 128 bits.
2905 Get an integer representation of the replicated value. */
2906 scalar_int_mode int_mode;
2907 if (BYTES_BIG_ENDIAN)
2908 /* For now, always use LD1RQ to load the value on big-endian
2909 targets, since the handling of smaller integers includes a
2910 subreg that is semantically an element reverse. */
2911 int_mode = TImode;
2912 else
2913 {
2914 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
2915 gcc_assert (int_bits <= 128);
2916 int_mode = int_mode_for_size (int_bits, 0).require ();
2917 }
2918 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
2919 if (int_value
2920 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
2921 return;
2922 }
2923
2924 /* Expand each pattern individually. */
2925 rtx_vector_builder builder;
2926 auto_vec<rtx, 16> vectors (npatterns);
2927 for (unsigned int i = 0; i < npatterns; ++i)
2928 {
2929 builder.new_vector (mode, 1, nelts_per_pattern);
2930 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
2931 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
2932 vectors.quick_push (force_reg (mode, builder.build ()));
2933 }
2934
2935 /* Use permutes to interleave the separate vectors. */
2936 while (npatterns > 1)
2937 {
2938 npatterns /= 2;
2939 for (unsigned int i = 0; i < npatterns; ++i)
2940 {
2941 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
2942 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
2943 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
2944 vectors[i] = tmp;
2945 }
2946 }
2947 gcc_assert (vectors[0] == dest);
2948 }
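
/* For example (a hypothetical constant, to illustrate the permute step
   above): a VNx4SI constant { 0, 100, 1, 101, 2, 102, ... } has two
   interleaved patterns, { 0, 1, 2, ... } and { 100, 101, 102, ... }.
   Each pattern is built separately (here as a linear series with step
   1) and the two vectors are then interleaved into the destination
   with a single ZIP1.  */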
2949
2950 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
2951 is a pattern that can be used to set DEST to a replicated scalar
2952 element. */
2953
2954 void
2955 aarch64_expand_mov_immediate (rtx dest, rtx imm,
2956 rtx (*gen_vec_duplicate) (rtx, rtx))
2957 {
2958 machine_mode mode = GET_MODE (dest);
2959
2960 /* Check on what type of symbol it is. */
2961 scalar_int_mode int_mode;
2962 if ((GET_CODE (imm) == SYMBOL_REF
2963 || GET_CODE (imm) == LABEL_REF
2964 || GET_CODE (imm) == CONST
2965 || GET_CODE (imm) == CONST_POLY_INT)
2966 && is_a <scalar_int_mode> (mode, &int_mode))
2967 {
2968 rtx mem;
2969 poly_int64 offset;
2970 HOST_WIDE_INT const_offset;
2971 enum aarch64_symbol_type sty;
2972
2973 /* If we have (const (plus symbol offset)), separate out the offset
2974 before we start classifying the symbol. */
2975 rtx base = strip_offset (imm, &offset);
2976
2977 /* We must always add an offset involving VL separately, rather than
2978 folding it into the relocation. */
2979 if (!offset.is_constant (&const_offset))
2980 {
2981 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
2982 emit_insn (gen_rtx_SET (dest, imm));
2983 else
2984 {
2985 /* Do arithmetic on 32-bit values if the result is smaller
2986 than that. */
2987 if (partial_subreg_p (int_mode, SImode))
2988 {
2989 /* It is invalid to do symbol calculations in modes
2990 narrower than SImode. */
2991 gcc_assert (base == const0_rtx);
2992 dest = gen_lowpart (SImode, dest);
2993 int_mode = SImode;
2994 }
2995 if (base != const0_rtx)
2996 {
2997 base = aarch64_force_temporary (int_mode, dest, base);
2998 aarch64_add_offset (int_mode, dest, base, offset,
2999 NULL_RTX, NULL_RTX, false);
3000 }
3001 else
3002 aarch64_add_offset (int_mode, dest, base, offset,
3003 dest, NULL_RTX, false);
3004 }
3005 return;
3006 }
3007
3008 sty = aarch64_classify_symbol (base, const_offset);
3009 switch (sty)
3010 {
3011 case SYMBOL_FORCE_TO_MEM:
3012 if (const_offset != 0
3013 && targetm.cannot_force_const_mem (int_mode, imm))
3014 {
3015 gcc_assert (can_create_pseudo_p ());
3016 base = aarch64_force_temporary (int_mode, dest, base);
3017 aarch64_add_offset (int_mode, dest, base, const_offset,
3018 NULL_RTX, NULL_RTX, false);
3019 return;
3020 }
3021
3022 mem = force_const_mem (ptr_mode, imm);
3023 gcc_assert (mem);
3024
3025 /* If we aren't generating PC relative literals, then
3026 we need to expand the literal pool access carefully.
3027 This is something that needs to be done in a number
3028 of places, so could well live as a separate function. */
3029 if (!aarch64_pcrelative_literal_loads)
3030 {
3031 gcc_assert (can_create_pseudo_p ());
3032 base = gen_reg_rtx (ptr_mode);
3033 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3034 if (ptr_mode != Pmode)
3035 base = convert_memory_address (Pmode, base);
3036 mem = gen_rtx_MEM (ptr_mode, base);
3037 }
3038
3039 if (int_mode != ptr_mode)
3040 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3041
3042 emit_insn (gen_rtx_SET (dest, mem));
3043
3044 return;
3045
3046 case SYMBOL_SMALL_TLSGD:
3047 case SYMBOL_SMALL_TLSDESC:
3048 case SYMBOL_SMALL_TLSIE:
3049 case SYMBOL_SMALL_GOT_28K:
3050 case SYMBOL_SMALL_GOT_4G:
3051 case SYMBOL_TINY_GOT:
3052 case SYMBOL_TINY_TLSIE:
3053 if (const_offset != 0)
3054 {
3055 gcc_assert (can_create_pseudo_p ());
3056 base = aarch64_force_temporary (int_mode, dest, base);
3057 aarch64_add_offset (int_mode, dest, base, const_offset,
3058 NULL_RTX, NULL_RTX, false);
3059 return;
3060 }
3061 /* FALLTHRU */
3062
3063 case SYMBOL_SMALL_ABSOLUTE:
3064 case SYMBOL_TINY_ABSOLUTE:
3065 case SYMBOL_TLSLE12:
3066 case SYMBOL_TLSLE24:
3067 case SYMBOL_TLSLE32:
3068 case SYMBOL_TLSLE48:
3069 aarch64_load_symref_appropriately (dest, imm, sty);
3070 return;
3071
3072 default:
3073 gcc_unreachable ();
3074 }
3075 }
3076
3077 if (!CONST_INT_P (imm))
3078 {
3079 rtx base, step, value;
3080 if (GET_CODE (imm) == HIGH
3081 || aarch64_simd_valid_immediate (imm, NULL))
3082 emit_insn (gen_rtx_SET (dest, imm));
3083 else if (const_vec_series_p (imm, &base, &step))
3084 aarch64_expand_vec_series (dest, base, step);
3085 else if (const_vec_duplicate_p (imm, &value))
3086 {
3087 /* If the constant is out of range of an SVE vector move,
3088 load it from memory if we can, otherwise move it into
3089 a register and use a DUP. */
3090 scalar_mode inner_mode = GET_MODE_INNER (mode);
3091 rtx op = force_const_mem (inner_mode, value);
3092 if (!op)
3093 op = force_reg (inner_mode, value);
3094 else if (!aarch64_sve_ld1r_operand_p (op))
3095 {
3096 rtx addr = force_reg (Pmode, XEXP (op, 0));
3097 op = replace_equiv_address (op, addr);
3098 }
3099 emit_insn (gen_vec_duplicate (dest, op));
3100 }
3101 else if (GET_CODE (imm) == CONST_VECTOR
3102 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3103 aarch64_expand_sve_const_vector (dest, imm);
3104 else
3105 {
3106 rtx mem = force_const_mem (mode, imm);
3107 gcc_assert (mem);
3108 emit_move_insn (dest, mem);
3109 }
3110
3111 return;
3112 }
3113
3114 aarch64_internal_mov_immediate (dest, imm, true,
3115 as_a <scalar_int_mode> (mode));
3116 }
3117
3118 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3119 that is known to contain PTRUE. */
3120
3121 void
3122 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3123 {
3124 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3125 gen_rtvec (2, pred, src),
3126 UNSPEC_MERGE_PTRUE)));
3127 }
3128
3129 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3130 operand is in memory. In this case we need to use the predicated LD1
3131 and ST1 instead of LDR and STR, both for correctness on big-endian
3132 targets and because LD1 and ST1 support a wider range of addressing modes.
3133 PRED_MODE is the mode of the predicate.
3134
3135 See the comment at the head of aarch64-sve.md for details about the
3136 big-endian handling. */
3137
3138 void
3139 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3140 {
3141 machine_mode mode = GET_MODE (dest);
3142 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3143 if (!register_operand (src, mode)
3144 && !register_operand (dest, mode))
3145 {
3146 rtx tmp = gen_reg_rtx (mode);
3147 if (MEM_P (src))
3148 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3149 else
3150 emit_move_insn (tmp, src);
3151 src = tmp;
3152 }
3153 aarch64_emit_sve_pred_move (dest, ptrue, src);
3154 }
3155
3156 /* Called only on big-endian targets. See whether an SVE vector move
3157 from SRC to DEST is effectively a REV[BHW] instruction, because at
3158 least one operand is a subreg of an SVE vector that has wider or
3159 narrower elements. Return true and emit the instruction if so.
3160
3161 For example:
3162
3163 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3164
3165 represents a VIEW_CONVERT between the following vectors, viewed
3166 in memory order:
3167
3168 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3169 R1: { [0], [1], [2], [3], ... }
3170
3171 The high part of lane X in R2 should therefore correspond to lane X*2
3172 of R1, but the register representations are:
3173
3174 msb lsb
3175 R2: ...... [1].high [1].low [0].high [0].low
3176 R1: ...... [3] [2] [1] [0]
3177
3178 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3179 We therefore need a reverse operation to swap the high and low values
3180 around.
3181
3182 This is purely an optimization. Without it we would spill the
3183 subreg operand to the stack in one mode and reload it in the
3184 other mode, which has the same effect as the REV. */
3185
3186 bool
3187 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3188 {
3189 gcc_assert (BYTES_BIG_ENDIAN);
3190 if (GET_CODE (dest) == SUBREG)
3191 dest = SUBREG_REG (dest);
3192 if (GET_CODE (src) == SUBREG)
3193 src = SUBREG_REG (src);
3194
3195 /* The optimization handles two single SVE REGs with different element
3196 sizes. */
3197 if (!REG_P (dest)
3198 || !REG_P (src)
3199 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3200 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3201 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3202 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3203 return false;
3204
3205 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3206 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3207 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3208 UNSPEC_REV_SUBREG);
3209 emit_insn (gen_rtx_SET (dest, unspec));
3210 return true;
3211 }
3212
3213 /* Return a copy of X with mode MODE, without changing its other
3214 attributes. Unlike gen_lowpart, this doesn't care whether the
3215 mode change is valid. */
3216
3217 static rtx
3218 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3219 {
3220 if (GET_MODE (x) == mode)
3221 return x;
3222
3223 x = shallow_copy_rtx (x);
3224 set_mode_and_regno (x, mode, REGNO (x));
3225 return x;
3226 }
3227
3228 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3229 operands. */
3230
3231 void
3232 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3233 {
3234 /* Decide which REV operation we need. The mode with narrower elements
3235 determines the mode of the operands and the mode with the wider
3236 elements determines the reverse width. */
3237 machine_mode mode_with_wider_elts = GET_MODE (dest);
3238 machine_mode mode_with_narrower_elts = GET_MODE (src);
3239 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3240 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3241 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3242
3243 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3244 unsigned int unspec;
3245 if (wider_bytes == 8)
3246 unspec = UNSPEC_REV64;
3247 else if (wider_bytes == 4)
3248 unspec = UNSPEC_REV32;
3249 else if (wider_bytes == 2)
3250 unspec = UNSPEC_REV16;
3251 else
3252 gcc_unreachable ();
3253 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3254
3255 /* Emit:
3256
3257 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3258 UNSPEC_MERGE_PTRUE))
3259
3260 with the appropriate modes. */
3261 ptrue = gen_lowpart (pred_mode, ptrue);
3262 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3263 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3264 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3265 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3266 UNSPEC_MERGE_PTRUE);
3267 emit_insn (gen_rtx_SET (dest, src));
3268 }
3269
3270 static bool
3271 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3272 tree exp ATTRIBUTE_UNUSED)
3273 {
3274 /* Currently, always true. */
3275 return true;
3276 }
3277
3278 /* Implement TARGET_PASS_BY_REFERENCE. */
3279
3280 static bool
3281 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3282 machine_mode mode,
3283 const_tree type,
3284 bool named ATTRIBUTE_UNUSED)
3285 {
3286 HOST_WIDE_INT size;
3287 machine_mode dummymode;
3288 int nregs;
3289
3290 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3291 if (mode == BLKmode && type)
3292 size = int_size_in_bytes (type);
3293 else
3294 /* No frontends can create types with variable-sized modes, so we
3295 shouldn't be asked to pass or return them. */
3296 size = GET_MODE_SIZE (mode).to_constant ();
3297
3298 /* Aggregates are passed by reference based on their size. */
3299 if (type && AGGREGATE_TYPE_P (type))
3300 {
3301 size = int_size_in_bytes (type);
3302 }
3303
3304 /* Variable sized arguments are always returned by reference. */
3305 if (size < 0)
3306 return true;
3307
3308 /* Can this be a candidate to be passed in fp/simd register(s)? */
3309 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3310 &dummymode, &nregs,
3311 NULL))
3312 return false;
3313
3314 /* Arguments that are variable-sized or larger than 2 registers are
3315 passed by reference unless they are a homogeneous floating-point
3316 aggregate.  */
3317 return size > 2 * UNITS_PER_WORD;
3318 }
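
/* For example (illustrative only): a 24-byte structure of three
   64-bit integers is larger than two registers and not a candidate
   for the SIMD/FP registers, so it is passed by reference, whereas a
   32-byte homogeneous aggregate of four doubles is such a candidate
   and is passed by value in v0-v3.  */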
3319
3320 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3321 static bool
3322 aarch64_return_in_msb (const_tree valtype)
3323 {
3324 machine_mode dummy_mode;
3325 int dummy_int;
3326
3327 /* Never happens in little-endian mode. */
3328 if (!BYTES_BIG_ENDIAN)
3329 return false;
3330
3331 /* Only composite types smaller than or equal to 16 bytes can
3332 be potentially returned in registers. */
3333 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3334 || int_size_in_bytes (valtype) <= 0
3335 || int_size_in_bytes (valtype) > 16)
3336 return false;
3337
3338 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3339 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3340 is always passed/returned in the least significant bits of fp/simd
3341 register(s). */
3342 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3343 &dummy_mode, &dummy_int, NULL))
3344 return false;
3345
3346 return true;
3347 }
3348
3349 /* Implement TARGET_FUNCTION_VALUE.
3350 Define how to find the value returned by a function. */
3351
3352 static rtx
3353 aarch64_function_value (const_tree type, const_tree func,
3354 bool outgoing ATTRIBUTE_UNUSED)
3355 {
3356 machine_mode mode;
3357 int unsignedp;
3358 int count;
3359 machine_mode ag_mode;
3360
3361 mode = TYPE_MODE (type);
3362 if (INTEGRAL_TYPE_P (type))
3363 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3364
3365 if (aarch64_return_in_msb (type))
3366 {
3367 HOST_WIDE_INT size = int_size_in_bytes (type);
3368
3369 if (size % UNITS_PER_WORD != 0)
3370 {
3371 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3372 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3373 }
3374 }
3375
3376 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3377 &ag_mode, &count, NULL))
3378 {
3379 if (!aarch64_composite_type_p (type, mode))
3380 {
3381 gcc_assert (count == 1 && mode == ag_mode);
3382 return gen_rtx_REG (mode, V0_REGNUM);
3383 }
3384 else
3385 {
3386 int i;
3387 rtx par;
3388
3389 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3390 for (i = 0; i < count; i++)
3391 {
3392 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3393 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3394 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3395 XVECEXP (par, 0, i) = tmp;
3396 }
3397 return par;
3398 }
3399 }
3400 else
3401 return gen_rtx_REG (mode, R0_REGNUM);
3402 }
3403
3404 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3405 Return true if REGNO is the number of a hard register in which the values
3406 of called function may come back. */
3407
3408 static bool
3409 aarch64_function_value_regno_p (const unsigned int regno)
3410 {
3411 /* A maximum of 16 bytes can be returned in the general registers.  Examples
3412 of 16-byte return values are: 128-bit integers and 16-byte small
3413 structures (excluding homogeneous floating-point aggregates). */
3414 if (regno == R0_REGNUM || regno == R1_REGNUM)
3415 return true;
3416
3417 /* Up to four fp/simd registers can return a function value, e.g. a
3418 homogeneous floating-point aggregate having four members. */
3419 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3420 return TARGET_FLOAT;
3421
3422 return false;
3423 }
3424
3425 /* Implement TARGET_RETURN_IN_MEMORY.
3426
3427 If the type T of the result of a function is such that
3428 void func (T arg)
3429 would require that arg be passed as a value in a register (or set of
3430 registers) according to the parameter passing rules, then the result
3431 is returned in the same registers as would be used for such an
3432 argument. */
3433
3434 static bool
3435 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3436 {
3437 HOST_WIDE_INT size;
3438 machine_mode ag_mode;
3439 int count;
3440
3441 if (!AGGREGATE_TYPE_P (type)
3442 && TREE_CODE (type) != COMPLEX_TYPE
3443 && TREE_CODE (type) != VECTOR_TYPE)
3444 /* Simple scalar types are always returned in registers.  */
3445 return false;
3446
3447 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3448 type,
3449 &ag_mode,
3450 &count,
3451 NULL))
3452 return false;
3453
3454 /* Types larger than 2 registers are returned in memory.  */
3455 size = int_size_in_bytes (type);
3456 return (size < 0 || size > 2 * UNITS_PER_WORD);
3457 }
3458
3459 static bool
3460 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3461 const_tree type, int *nregs)
3462 {
3463 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3464 return aarch64_vfp_is_call_or_return_candidate (mode,
3465 type,
3466 &pcum->aapcs_vfp_rmode,
3467 nregs,
3468 NULL);
3469 }
3470
3471 /* Given MODE and TYPE of a function argument, return the alignment in
3472 bits. The idea is to suppress any stronger alignment requested by
3473 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3474 This is a helper function for local use only. */
3475
3476 static unsigned int
3477 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3478 {
3479 if (!type)
3480 return GET_MODE_ALIGNMENT (mode);
3481
3482 if (integer_zerop (TYPE_SIZE (type)))
3483 return 0;
3484
3485 gcc_assert (TYPE_MODE (type) == mode);
3486
3487 if (!AGGREGATE_TYPE_P (type))
3488 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3489
3490 if (TREE_CODE (type) == ARRAY_TYPE)
3491 return TYPE_ALIGN (TREE_TYPE (type));
3492
3493 unsigned int alignment = 0;
3494 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3495 if (TREE_CODE (field) == FIELD_DECL)
3496 alignment = std::max (alignment, DECL_ALIGN (field));
3497
3498 return alignment;
3499 }
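
/* For instance (an illustrative example of the rule above): for a
   hypothetical struct containing an int and a double, the alignment
   used for argument layout is that of its most-aligned field, i.e.
   64 bits, rather than any larger alignment that may have been
   requested on the struct type itself.  */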
3500
3501 /* Layout a function argument according to the AAPCS64 rules. The rule
3502 numbers refer to the rule numbers in the AAPCS64. */
3503
3504 static void
3505 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3506 const_tree type,
3507 bool named ATTRIBUTE_UNUSED)
3508 {
3509 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3510 int ncrn, nvrn, nregs;
3511 bool allocate_ncrn, allocate_nvrn;
3512 HOST_WIDE_INT size;
3513
3514 /* We need to do this once per argument. */
3515 if (pcum->aapcs_arg_processed)
3516 return;
3517
3518 pcum->aapcs_arg_processed = true;
3519
3520 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
3521 if (type)
3522 size = int_size_in_bytes (type);
3523 else
3524 /* No frontends can create types with variable-sized modes, so we
3525 shouldn't be asked to pass or return them. */
3526 size = GET_MODE_SIZE (mode).to_constant ();
3527 size = ROUND_UP (size, UNITS_PER_WORD);
3528
3529 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3530 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3531 mode,
3532 type,
3533 &nregs);
3534
3535 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3536 The following code thus handles passing by SIMD/FP registers first. */
3537
3538 nvrn = pcum->aapcs_nvrn;
3539
3540 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3541 and homogeneous short-vector aggregates (HVA).  */
3542 if (allocate_nvrn)
3543 {
3544 if (!TARGET_FLOAT)
3545 aarch64_err_no_fpadvsimd (mode);
3546
3547 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3548 {
3549 pcum->aapcs_nextnvrn = nvrn + nregs;
3550 if (!aarch64_composite_type_p (type, mode))
3551 {
3552 gcc_assert (nregs == 1);
3553 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3554 }
3555 else
3556 {
3557 rtx par;
3558 int i;
3559 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3560 for (i = 0; i < nregs; i++)
3561 {
3562 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3563 V0_REGNUM + nvrn + i);
3564 rtx offset = gen_int_mode
3565 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3566 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3567 XVECEXP (par, 0, i) = tmp;
3568 }
3569 pcum->aapcs_reg = par;
3570 }
3571 return;
3572 }
3573 else
3574 {
3575 /* C.3 NSRN is set to 8. */
3576 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3577 goto on_stack;
3578 }
3579 }
3580
3581 ncrn = pcum->aapcs_ncrn;
3582 nregs = size / UNITS_PER_WORD;
3583
3584 /* C6 - C9, though the sign- and zero-extension semantics are
3585 handled elsewhere.  This is the case where the argument fits
3586 entirely in general registers.  */
3587 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3588 {
3589
3590 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3591
3592 /* C.8: if the argument has an alignment of 16 bytes, then the NGRN is
3593 rounded up to the next even number.  */
3594 if (nregs == 2
3595 && ncrn % 2
3596 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3597 comparison is there because for alignment > 16 * BITS_PER_UNIT,
3598 nregs should be > 2, and the argument should therefore be
3599 passed by reference rather than by value.  */
3600 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3601 {
3602 ++ncrn;
3603 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3604 }
3605
3606 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3607 A reg is still generated for it, but the caller should be smart
3608 enough not to use it. */
3609 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3610 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3611 else
3612 {
3613 rtx par;
3614 int i;
3615
3616 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3617 for (i = 0; i < nregs; i++)
3618 {
3619 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3620 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3621 GEN_INT (i * UNITS_PER_WORD));
3622 XVECEXP (par, 0, i) = tmp;
3623 }
3624 pcum->aapcs_reg = par;
3625 }
3626
3627 pcum->aapcs_nextncrn = ncrn + nregs;
3628 return;
3629 }
3630
3631 /* C.11 */
3632 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3633
3634   /* The argument is passed on the stack; record the needed number of words for
3635 this argument and align the total size if necessary. */
3636 on_stack:
3637 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3638
3639 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3640 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3641 16 / UNITS_PER_WORD);
3642 return;
3643 }
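
/* Illustrative sketch (not part of the build): the general-register
   bookkeeping above, reduced to plain arithmetic.  The name
   ngrn_example and its parameters are hypothetical; 8-byte words and
   8 integer argument registers are assumed, as on AArch64.  */
#if 0
static int
ngrn_example (int ncrn, long size_in_bytes, int align_is_16_bytes)
{
  /* Round the argument size up to a whole number of 8-byte words.  */
  long nregs = (size_in_bytes + 7) / 8;

  /* C.8: a 16-byte-aligned two-word argument starts at an even NGRN.  */
  if (nregs == 2 && align_is_16_bytes && (ncrn & 1))
    ncrn++;

  /* The argument then occupies NREGS registers starting at NCRN,
     provided ncrn + nregs <= 8; otherwise it goes to the stack.  */
  return ncrn;
}
#endif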
3644
3645 /* Implement TARGET_FUNCTION_ARG. */
3646
3647 static rtx
3648 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3649 const_tree type, bool named)
3650 {
3651 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3652 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3653
3654 if (mode == VOIDmode)
3655 return NULL_RTX;
3656
3657 aarch64_layout_arg (pcum_v, mode, type, named);
3658 return pcum->aapcs_reg;
3659 }
3660
3661 void
3662 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3663 const_tree fntype ATTRIBUTE_UNUSED,
3664 rtx libname ATTRIBUTE_UNUSED,
3665 const_tree fndecl ATTRIBUTE_UNUSED,
3666 unsigned n_named ATTRIBUTE_UNUSED)
3667 {
3668 pcum->aapcs_ncrn = 0;
3669 pcum->aapcs_nvrn = 0;
3670 pcum->aapcs_nextncrn = 0;
3671 pcum->aapcs_nextnvrn = 0;
3672 pcum->pcs_variant = ARM_PCS_AAPCS64;
3673 pcum->aapcs_reg = NULL_RTX;
3674 pcum->aapcs_arg_processed = false;
3675 pcum->aapcs_stack_words = 0;
3676 pcum->aapcs_stack_size = 0;
3677
3678 if (!TARGET_FLOAT
3679 && fndecl && TREE_PUBLIC (fndecl)
3680 && fntype && fntype != error_mark_node)
3681 {
3682 const_tree type = TREE_TYPE (fntype);
3683 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
3684 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
3685 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3686 &mode, &nregs, NULL))
3687 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
3688 }
3689 return;
3690 }
3691
3692 static void
3693 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3694 machine_mode mode,
3695 const_tree type,
3696 bool named)
3697 {
3698 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3699 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3700 {
3701 aarch64_layout_arg (pcum_v, mode, type, named);
3702 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3703 != (pcum->aapcs_stack_words != 0));
3704 pcum->aapcs_arg_processed = false;
3705 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3706 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3707 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3708 pcum->aapcs_stack_words = 0;
3709 pcum->aapcs_reg = NULL_RTX;
3710 }
3711 }
3712
3713 bool
3714 aarch64_function_arg_regno_p (unsigned regno)
3715 {
3716 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3717 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3718 }
3719
3720 /* Implement TARGET_FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
3721 PARM_BOUNDARY bits of alignment, but will be given anything up
3722 to STACK_BOUNDARY bits if the type requires it. This makes sure
3723 that both before and after the layout of each argument, the Next
3724 Stacked Argument Address (NSAA) will have a minimum alignment of
3725 8 bytes. */
3726
3727 static unsigned int
3728 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3729 {
3730 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3731 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
3732 }
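
/* Illustrative sketch (not part of the build) of the clamping done by
   aarch64_function_arg_boundary, using the usual AArch64 values of
   PARM_BOUNDARY (64 bits) and STACK_BOUNDARY (128 bits); the function
   name is hypothetical.  An 8-bit type is promoted to 64 bits of
   alignment, a 128-bit vector keeps 128, and anything larger is capped
   at 128.  */
#if 0
static unsigned int
arg_boundary_example (unsigned int type_alignment_in_bits)
{
  unsigned int lower = 64;	/* PARM_BOUNDARY on AArch64.  */
  unsigned int upper = 128;	/* STACK_BOUNDARY on AArch64.  */
  unsigned int a = type_alignment_in_bits;
  if (a < lower)
    a = lower;
  if (a > upper)
    a = upper;
  return a;			/* MIN (MAX (alignment, 64), 128).  */
}
#endif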
3733
3734 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3735
3736 static fixed_size_mode
3737 aarch64_get_reg_raw_mode (int regno)
3738 {
3739 if (TARGET_SVE && FP_REGNUM_P (regno))
3740 /* Don't use the SVE part of the register for __builtin_apply and
3741 __builtin_return. The SVE registers aren't used by the normal PCS,
3742 so using them there would be a waste of time. The PCS extensions
3743 for SVE types are fundamentally incompatible with the
3744 __builtin_return/__builtin_apply interface. */
3745 return as_a <fixed_size_mode> (V16QImode);
3746 return default_get_reg_raw_mode (regno);
3747 }
3748
3749 /* Implement TARGET_FUNCTION_ARG_PADDING.
3750
3751 Small aggregate types are placed in the lowest memory address.
3752
3753 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3754
3755 static pad_direction
3756 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3757 {
3758 /* On little-endian targets, the least significant byte of every stack
3759 argument is passed at the lowest byte address of the stack slot. */
3760 if (!BYTES_BIG_ENDIAN)
3761 return PAD_UPWARD;
3762
3763 /* Otherwise, integral, floating-point and pointer types are padded downward:
3764 the least significant byte of a stack argument is passed at the highest
3765 byte address of the stack slot. */
3766 if (type
3767 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3768 || POINTER_TYPE_P (type))
3769 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3770 return PAD_DOWNWARD;
3771
3772 /* Everything else padded upward, i.e. data in first byte of stack slot. */
3773 return PAD_UPWARD;
3774 }
3775
3776 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3777
3778    It specifies the padding for the last (and possibly the only)
3779    element of a block move between registers and memory.  Viewing
3780    the block as it sits in memory, upward padding means that the
3781    last element is padded after its most significant byte, while
3782    with downward padding the last element is padded on its least
3783    significant byte side.
3784
3785 Small aggregates and small complex types are always padded
3786 upwards.
3787
3788 We don't need to worry about homogeneous floating-point or
3789 short-vector aggregates; their move is not affected by the
3790 padding direction determined here. Regardless of endianness,
3791 each element of such an aggregate is put in the least
3792 significant bits of a fp/simd register.
3793
3794 Return !BYTES_BIG_ENDIAN if the least significant byte of the
3795 register has useful data, and return the opposite if the most
3796 significant byte does. */
3797
3798 bool
3799 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
3800 bool first ATTRIBUTE_UNUSED)
3801 {
3802
3803 /* Small composite types are always padded upward. */
3804 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
3805 {
3806 HOST_WIDE_INT size;
3807 if (type)
3808 size = int_size_in_bytes (type);
3809 else
3810 /* No frontends can create types with variable-sized modes, so we
3811 shouldn't be asked to pass or return them. */
3812 size = GET_MODE_SIZE (mode).to_constant ();
3813 if (size < 2 * UNITS_PER_WORD)
3814 return true;
3815 }
3816
3817 /* Otherwise, use the default padding. */
3818 return !BYTES_BIG_ENDIAN;
3819 }
3820
3821 static scalar_int_mode
3822 aarch64_libgcc_cmp_return_mode (void)
3823 {
3824 return SImode;
3825 }
3826
3827 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3828
3829 /* We use the 12-bit shifted immediate arithmetic instructions so values
3830    must be a multiple of (1 << 12), i.e. 4096.  */
3831 #define ARITH_FACTOR 4096
3832
3833 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3834 #error Cannot use simple address calculation for stack probing
3835 #endif
3836
3837 /* The pair of scratch registers used for stack probing. */
3838 #define PROBE_STACK_FIRST_REG 9
3839 #define PROBE_STACK_SECOND_REG 10
3840
3841 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3842 inclusive. These are offsets from the current stack pointer. */
3843
3844 static void
3845 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
3846 {
3847 HOST_WIDE_INT size;
3848 if (!poly_size.is_constant (&size))
3849 {
3850 sorry ("stack probes for SVE frames");
3851 return;
3852 }
3853
3854 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
3855
3856 /* See the same assertion on PROBE_INTERVAL above. */
3857 gcc_assert ((first % ARITH_FACTOR) == 0);
3858
3859 /* See if we have a constant small number of probes to generate. If so,
3860 that's the easy case. */
3861 if (size <= PROBE_INTERVAL)
3862 {
3863 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
3864
3865 emit_set_insn (reg1,
3866 plus_constant (Pmode,
3867 stack_pointer_rtx, -(first + base)));
3868 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
3869 }
3870
3871 /* The run-time loop is made up of 8 insns in the generic case while the
3872      compile-time loop is made up of 4+2*(n-2) insns for n intervals.  */
3873 else if (size <= 4 * PROBE_INTERVAL)
3874 {
3875 HOST_WIDE_INT i, rem;
3876
3877 emit_set_insn (reg1,
3878 plus_constant (Pmode,
3879 stack_pointer_rtx,
3880 -(first + PROBE_INTERVAL)));
3881 emit_stack_probe (reg1);
3882
3883 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3884 it exceeds SIZE. If only two probes are needed, this will not
3885 generate any code. Then probe at FIRST + SIZE. */
3886 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
3887 {
3888 emit_set_insn (reg1,
3889 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
3890 emit_stack_probe (reg1);
3891 }
3892
3893 rem = size - (i - PROBE_INTERVAL);
3894 if (rem > 256)
3895 {
3896 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3897
3898 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
3899 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
3900 }
3901 else
3902 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
3903 }
3904
3905 /* Otherwise, do the same as above, but in a loop. Note that we must be
3906 extra careful with variables wrapping around because we might be at
3907 the very top (or the very bottom) of the address space and we have
3908 to be able to handle this case properly; in particular, we use an
3909 equality test for the loop condition. */
3910 else
3911 {
3912 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
3913
3914 /* Step 1: round SIZE to the previous multiple of the interval. */
3915
3916 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
3917
3918
3919 /* Step 2: compute initial and final value of the loop counter. */
3920
3921 /* TEST_ADDR = SP + FIRST. */
3922 emit_set_insn (reg1,
3923 plus_constant (Pmode, stack_pointer_rtx, -first));
3924
3925 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
3926 HOST_WIDE_INT adjustment = - (first + rounded_size);
3927 if (! aarch64_uimm12_shift (adjustment))
3928 {
3929 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
3930 true, Pmode);
3931 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
3932 }
3933 else
3934 emit_set_insn (reg2,
3935 plus_constant (Pmode, stack_pointer_rtx, adjustment));
3936
3937 /* Step 3: the loop
3938
3939 do
3940 {
3941 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3942 probe at TEST_ADDR
3943 }
3944 while (TEST_ADDR != LAST_ADDR)
3945
3946 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3947 until it is equal to ROUNDED_SIZE. */
3948
3949 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
3950
3951
3952 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3953 that SIZE is equal to ROUNDED_SIZE. */
3954
3955 if (size != rounded_size)
3956 {
3957 HOST_WIDE_INT rem = size - rounded_size;
3958
3959 if (rem > 256)
3960 {
3961 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3962
3963 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
3964 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
3965 }
3966 else
3967 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
3968 }
3969 }
3970
3971 /* Make sure nothing is scheduled before we are done. */
3972 emit_insn (gen_blockage ());
3973 }
3974
3975 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
3976 absolute addresses. */
3977
3978 const char *
3979 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
3980 {
3981 static int labelno = 0;
3982 char loop_lab[32];
3983 rtx xops[2];
3984
3985 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
3986
3987 /* Loop. */
3988 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
3989
3990 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
3991 xops[0] = reg1;
3992 xops[1] = GEN_INT (PROBE_INTERVAL);
3993 output_asm_insn ("sub\t%0, %0, %1", xops);
3994
3995 /* Probe at TEST_ADDR. */
3996 output_asm_insn ("str\txzr, [%0]", xops);
3997
3998 /* Test if TEST_ADDR == LAST_ADDR. */
3999 xops[1] = reg2;
4000 output_asm_insn ("cmp\t%0, %1", xops);
4001
4002 /* Branch. */
4003 fputs ("\tb.ne\t", asm_out_file);
4004 assemble_name_raw (asm_out_file, loop_lab);
4005 fputc ('\n', asm_out_file);
4006
4007 return "";
4008 }
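
/* For reference, the loop emitted by aarch64_output_probe_stack_range
   looks like the following with the default 4096-byte PROBE_INTERVAL
   (x9 and x10 correspond to PROBE_STACK_FIRST_REG and
   PROBE_STACK_SECOND_REG above):

	.LPSRL0:
		sub	x9, x9, 4096
		str	xzr, [x9]
		cmp	x9, x10
		b.ne	.LPSRL0
   */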
4009
4010 /* Determine whether a frame chain needs to be generated. */
4011 static bool
4012 aarch64_needs_frame_chain (void)
4013 {
4014 /* Force a frame chain for EH returns so the return address is at FP+8. */
4015 if (frame_pointer_needed || crtl->calls_eh_return)
4016 return true;
4017
4018 /* A leaf function cannot have calls or write LR. */
4019 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4020
4021 /* Don't use a frame chain in leaf functions if leaf frame pointers
4022 are disabled. */
4023 if (flag_omit_leaf_frame_pointer && is_leaf)
4024 return false;
4025
4026 return aarch64_use_frame_pointer;
4027 }
4028
4029 /* Mark the registers that need to be saved by the callee and calculate
4030 the size of the callee-saved registers area and frame record (both FP
4031 and LR may be omitted). */
4032 static void
4033 aarch64_layout_frame (void)
4034 {
4035 HOST_WIDE_INT offset = 0;
4036 int regno, last_fp_reg = INVALID_REGNUM;
4037
4038 if (reload_completed && cfun->machine->frame.laid_out)
4039 return;
4040
4041 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4042
4043 #define SLOT_NOT_REQUIRED (-2)
4044 #define SLOT_REQUIRED (-1)
4045
4046 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4047 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4048
4049 /* First mark all the registers that really need to be saved... */
4050 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4051 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4052
4053 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4054 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4055
4056 /* ... that includes the eh data registers (if needed)... */
4057 if (crtl->calls_eh_return)
4058 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4059 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4060 = SLOT_REQUIRED;
4061
4062 /* ... and any callee saved register that dataflow says is live. */
4063 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4064 if (df_regs_ever_live_p (regno)
4065 && (regno == R30_REGNUM
4066 || !call_used_regs[regno]))
4067 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4068
4069 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4070 if (df_regs_ever_live_p (regno)
4071 && !call_used_regs[regno])
4072 {
4073 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4074 last_fp_reg = regno;
4075 }
4076
4077 if (cfun->machine->frame.emit_frame_chain)
4078 {
4079 /* FP and LR are placed in the linkage record. */
4080 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4081 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4082 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4083 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4084 offset = 2 * UNITS_PER_WORD;
4085 }
4086
4087 /* Now assign stack slots for them. */
4088 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4089 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4090 {
4091 cfun->machine->frame.reg_offset[regno] = offset;
4092 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4093 cfun->machine->frame.wb_candidate1 = regno;
4094 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4095 cfun->machine->frame.wb_candidate2 = regno;
4096 offset += UNITS_PER_WORD;
4097 }
4098
4099 HOST_WIDE_INT max_int_offset = offset;
4100 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4101 bool has_align_gap = offset != max_int_offset;
4102
4103 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4104 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4105 {
4106 /* If there is an alignment gap between integer and fp callee-saves,
4107 allocate the last fp register to it if possible. */
4108 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
4109 {
4110 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4111 break;
4112 }
4113
4114 cfun->machine->frame.reg_offset[regno] = offset;
4115 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4116 cfun->machine->frame.wb_candidate1 = regno;
4117 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4118 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4119 cfun->machine->frame.wb_candidate2 = regno;
4120 offset += UNITS_PER_WORD;
4121 }
4122
4123 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4124
4125 cfun->machine->frame.saved_regs_size = offset;
4126
4127 HOST_WIDE_INT varargs_and_saved_regs_size
4128 = offset + cfun->machine->frame.saved_varargs_size;
4129
4130 cfun->machine->frame.hard_fp_offset
4131 = aligned_upper_bound (varargs_and_saved_regs_size
4132 + get_frame_size (),
4133 STACK_BOUNDARY / BITS_PER_UNIT);
4134
4135 /* Both these values are already aligned. */
4136 gcc_assert (multiple_p (crtl->outgoing_args_size,
4137 STACK_BOUNDARY / BITS_PER_UNIT));
4138 cfun->machine->frame.frame_size
4139 = (cfun->machine->frame.hard_fp_offset
4140 + crtl->outgoing_args_size);
4141
4142 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4143
4144 cfun->machine->frame.initial_adjust = 0;
4145 cfun->machine->frame.final_adjust = 0;
4146 cfun->machine->frame.callee_adjust = 0;
4147 cfun->machine->frame.callee_offset = 0;
4148
4149 HOST_WIDE_INT max_push_offset = 0;
4150 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4151 max_push_offset = 512;
4152 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4153 max_push_offset = 256;
4154
4155 HOST_WIDE_INT const_size, const_fp_offset;
4156 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4157 && const_size < max_push_offset
4158 && known_eq (crtl->outgoing_args_size, 0))
4159 {
4160 /* Simple, small frame with no outgoing arguments:
4161 stp reg1, reg2, [sp, -frame_size]!
4162 stp reg3, reg4, [sp, 16] */
4163 cfun->machine->frame.callee_adjust = const_size;
4164 }
4165 else if (known_lt (crtl->outgoing_args_size
4166 + cfun->machine->frame.saved_regs_size, 512)
4167 && !(cfun->calls_alloca
4168 && known_lt (cfun->machine->frame.hard_fp_offset,
4169 max_push_offset)))
4170 {
4171 /* Frame with small outgoing arguments:
4172 sub sp, sp, frame_size
4173 stp reg1, reg2, [sp, outgoing_args_size]
4174 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4175 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4176 cfun->machine->frame.callee_offset
4177 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4178 }
4179 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4180 && const_fp_offset < max_push_offset)
4181 {
4182 /* Frame with large outgoing arguments but a small local area:
4183 stp reg1, reg2, [sp, -hard_fp_offset]!
4184 stp reg3, reg4, [sp, 16]
4185 sub sp, sp, outgoing_args_size */
4186 cfun->machine->frame.callee_adjust = const_fp_offset;
4187 cfun->machine->frame.final_adjust
4188 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4189 }
4190 else
4191 {
4192 /* Frame with large local area and outgoing arguments using frame pointer:
4193 sub sp, sp, hard_fp_offset
4194 stp x29, x30, [sp, 0]
4195 add x29, sp, 0
4196 stp reg3, reg4, [sp, 16]
4197 sub sp, sp, outgoing_args_size */
4198 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4199 cfun->machine->frame.final_adjust
4200 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4201 }
4202
4203 cfun->machine->frame.laid_out = true;
4204 }
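
/* Illustrative sketch (not part of the build): the choice between the
   four frame shapes made above, with all sizes assumed to be known
   compile-time constants in bytes.  frame_shape_example and its
   parameters are hypothetical names; max_push is 512 when two
   write-back candidates exist, 256 with one and 0 with none, matching
   max_push_offset above.  */
#if 0
static int
frame_shape_example (long frame_size, long outgoing_args, long saved_regs,
		     long hard_fp_offset, long max_push, int calls_alloca)
{
  if (frame_size < max_push && outgoing_args == 0)
    return 1;	/* stp reg1, reg2, [sp, -frame_size]!  */
  if (outgoing_args + saved_regs < 512
      && !(calls_alloca && hard_fp_offset < max_push))
    return 2;	/* sub sp, sp, frame_size; saves at [sp, outgoing_args]  */
  if (hard_fp_offset < max_push)
    return 3;	/* stp reg1, reg2, [sp, -hard_fp_offset]!; then sub sp  */
  return 4;	/* sub sp, sp, hard_fp_offset; saves; then sub sp again  */
}
#endif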
4205
4206 /* Return true if the register REGNO is saved on entry to
4207 the current function. */
4208
4209 static bool
4210 aarch64_register_saved_on_entry (int regno)
4211 {
4212 return cfun->machine->frame.reg_offset[regno] >= 0;
4213 }
4214
4215 /* Return the next register from REGNO up to LIMIT that the callee
4216    needs to save.  */
4217
4218 static unsigned
4219 aarch64_next_callee_save (unsigned regno, unsigned limit)
4220 {
4221 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4222 regno ++;
4223 return regno;
4224 }
4225
4226 /* Push the register number REGNO of mode MODE to the stack with write-back
4227 adjusting the stack by ADJUSTMENT. */
4228
4229 static void
4230 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4231 HOST_WIDE_INT adjustment)
4232 {
4233 rtx base_rtx = stack_pointer_rtx;
4234 rtx insn, reg, mem;
4235
4236 reg = gen_rtx_REG (mode, regno);
4237 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4238 plus_constant (Pmode, base_rtx, -adjustment));
4239 mem = gen_frame_mem (mode, mem);
4240
4241 insn = emit_move_insn (mem, reg);
4242 RTX_FRAME_RELATED_P (insn) = 1;
4243 }
4244
4245 /* Generate and return an instruction to store the pair of registers
4246 REG and REG2 of mode MODE to location BASE with write-back adjusting
4247 the stack location BASE by ADJUSTMENT. */
4248
4249 static rtx
4250 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4251 HOST_WIDE_INT adjustment)
4252 {
4253 switch (mode)
4254 {
4255 case E_DImode:
4256 return gen_storewb_pairdi_di (base, base, reg, reg2,
4257 GEN_INT (-adjustment),
4258 GEN_INT (UNITS_PER_WORD - adjustment));
4259 case E_DFmode:
4260 return gen_storewb_pairdf_di (base, base, reg, reg2,
4261 GEN_INT (-adjustment),
4262 GEN_INT (UNITS_PER_WORD - adjustment));
4263 default:
4264 gcc_unreachable ();
4265 }
4266 }
4267
4268 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4269 stack pointer by ADJUSTMENT. */
4270
4271 static void
4272 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4273 {
4274 rtx_insn *insn;
4275 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4276
4277 if (regno2 == INVALID_REGNUM)
4278 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4279
4280 rtx reg1 = gen_rtx_REG (mode, regno1);
4281 rtx reg2 = gen_rtx_REG (mode, regno2);
4282
4283 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4284 reg2, adjustment));
4285 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4286 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4287 RTX_FRAME_RELATED_P (insn) = 1;
4288 }
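
/* For example, in a typical prologue aarch64_push_regs (R29_REGNUM,
   R30_REGNUM, 32) emits the single write-back store

	stp	x29, x30, [sp, -32]!

   and marks both stores in the parallel as frame-related, so the
   unwinder sees the stack adjustment and both save slots.  */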
4289
4290 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
4291 adjusting it by ADJUSTMENT afterwards. */
4292
4293 static rtx
4294 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4295 HOST_WIDE_INT adjustment)
4296 {
4297 switch (mode)
4298 {
4299 case E_DImode:
4300 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4301 GEN_INT (UNITS_PER_WORD));
4302 case E_DFmode:
4303 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4304 GEN_INT (UNITS_PER_WORD));
4305 default:
4306 gcc_unreachable ();
4307 }
4308 }
4309
4310 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4311 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4312 into CFI_OPS. */
4313
4314 static void
4315 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4316 rtx *cfi_ops)
4317 {
4318 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4319 rtx reg1 = gen_rtx_REG (mode, regno1);
4320
4321 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4322
4323 if (regno2 == INVALID_REGNUM)
4324 {
4325 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4326 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4327 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4328 }
4329 else
4330 {
4331 rtx reg2 = gen_rtx_REG (mode, regno2);
4332 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4333 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4334 reg2, adjustment));
4335 }
4336 }
4337
4338 /* Generate and return a store pair instruction of mode MODE to store
4339 register REG1 to MEM1 and register REG2 to MEM2. */
4340
4341 static rtx
4342 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4343 rtx reg2)
4344 {
4345 switch (mode)
4346 {
4347 case E_DImode:
4348 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4349
4350 case E_DFmode:
4351 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4352
4353 default:
4354 gcc_unreachable ();
4355 }
4356 }
4357
4358 /* Generate and return a load pair instruction of mode MODE to load register
4359 REG1 from MEM1 and register REG2 from MEM2. */
4360
4361 static rtx
4362 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4363 rtx mem2)
4364 {
4365 switch (mode)
4366 {
4367 case E_DImode:
4368 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4369
4370 case E_DFmode:
4371 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4372
4373 default:
4374 gcc_unreachable ();
4375 }
4376 }
4377
4378 /* Return TRUE if return address signing should be enabled for the current
4379 function, otherwise return FALSE. */
4380
4381 bool
4382 aarch64_return_address_signing_enabled (void)
4383 {
4384   /* This function should only be called after the frame is laid out.  */
4385 gcc_assert (cfun->machine->frame.laid_out);
4386
4387 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4388      if its LR is pushed onto the stack.  */
4389 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4390 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4391 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4392 }
4393
4394 /* Emit code to save the callee-saved registers from register number START
4395 to LIMIT to the stack at the location starting at offset START_OFFSET,
4396 skipping any write-back candidates if SKIP_WB is true. */
4397
4398 static void
4399 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4400 unsigned start, unsigned limit, bool skip_wb)
4401 {
4402 rtx_insn *insn;
4403 unsigned regno;
4404 unsigned regno2;
4405
4406 for (regno = aarch64_next_callee_save (start, limit);
4407 regno <= limit;
4408 regno = aarch64_next_callee_save (regno + 1, limit))
4409 {
4410 rtx reg, mem;
4411 poly_int64 offset;
4412
4413 if (skip_wb
4414 && (regno == cfun->machine->frame.wb_candidate1
4415 || regno == cfun->machine->frame.wb_candidate2))
4416 continue;
4417
4418 if (cfun->machine->reg_is_wrapped_separately[regno])
4419 continue;
4420
4421 reg = gen_rtx_REG (mode, regno);
4422 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4423 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4424 offset));
4425
4426 regno2 = aarch64_next_callee_save (regno + 1, limit);
4427
4428 if (regno2 <= limit
4429 && !cfun->machine->reg_is_wrapped_separately[regno2]
4430 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4431 == cfun->machine->frame.reg_offset[regno2]))
4432
4433 {
4434 rtx reg2 = gen_rtx_REG (mode, regno2);
4435 rtx mem2;
4436
4437 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4438 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4439 offset));
4440 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4441 reg2));
4442
4443 /* The first part of a frame-related parallel insn is
4444 always assumed to be relevant to the frame
4445	     calculations; subsequent parts are only
4446 frame-related if explicitly marked. */
4447 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4448 regno = regno2;
4449 }
4450 else
4451 insn = emit_move_insn (mem, reg);
4452
4453 RTX_FRAME_RELATED_P (insn) = 1;
4454 }
4455 }
4456
4457 /* Emit code to restore the callee-saved registers of mode MODE from register
4458 number START up to and including LIMIT. Restore from the stack offset
4459 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4460 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4461
4462 static void
4463 aarch64_restore_callee_saves (machine_mode mode,
4464 poly_int64 start_offset, unsigned start,
4465 unsigned limit, bool skip_wb, rtx *cfi_ops)
4466 {
4467 rtx base_rtx = stack_pointer_rtx;
4468 unsigned regno;
4469 unsigned regno2;
4470 poly_int64 offset;
4471
4472 for (regno = aarch64_next_callee_save (start, limit);
4473 regno <= limit;
4474 regno = aarch64_next_callee_save (regno + 1, limit))
4475 {
4476 if (cfun->machine->reg_is_wrapped_separately[regno])
4477 continue;
4478
4479 rtx reg, mem;
4480
4481 if (skip_wb
4482 && (regno == cfun->machine->frame.wb_candidate1
4483 || regno == cfun->machine->frame.wb_candidate2))
4484 continue;
4485
4486 reg = gen_rtx_REG (mode, regno);
4487 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4488 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4489
4490 regno2 = aarch64_next_callee_save (regno + 1, limit);
4491
4492 if (regno2 <= limit
4493 && !cfun->machine->reg_is_wrapped_separately[regno2]
4494 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4495 == cfun->machine->frame.reg_offset[regno2]))
4496 {
4497 rtx reg2 = gen_rtx_REG (mode, regno2);
4498 rtx mem2;
4499
4500 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4501 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4502 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4503
4504 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4505 regno = regno2;
4506 }
4507 else
4508 emit_move_insn (reg, mem);
4509 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4510 }
4511 }
4512
4513 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4514 of MODE. */
4515
4516 static inline bool
4517 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4518 {
4519 HOST_WIDE_INT multiple;
4520 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4521 && IN_RANGE (multiple, -8, 7));
4522 }
4523
4524 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
4525 of MODE. */
4526
4527 static inline bool
4528 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4529 {
4530 HOST_WIDE_INT multiple;
4531 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4532 && IN_RANGE (multiple, 0, 63));
4533 }
4534
4535 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4536 of MODE. */
4537
4538 bool
4539 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4540 {
4541 HOST_WIDE_INT multiple;
4542 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4543 && IN_RANGE (multiple, -64, 63));
4544 }
4545
4546 /* Return true if OFFSET is a signed 9-bit value. */
4547
4548 static inline bool
4549 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4550 poly_int64 offset)
4551 {
4552 HOST_WIDE_INT const_offset;
4553 return (offset.is_constant (&const_offset)
4554 && IN_RANGE (const_offset, -256, 255));
4555 }
4556
4557 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4558 of MODE. */
4559
4560 static inline bool
4561 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4562 {
4563 HOST_WIDE_INT multiple;
4564 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4565 && IN_RANGE (multiple, -256, 255));
4566 }
4567
4568 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4569 of MODE. */
4570
4571 static inline bool
4572 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4573 {
4574 HOST_WIDE_INT multiple;
4575 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4576 && IN_RANGE (multiple, 0, 4095));
4577 }
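
/* For the common case of 8-byte (DImode) accesses, the predicates above
   accept the following byte offsets (derived directly from the IN_RANGE
   bounds):

     offset_4bit_signed_scaled_p		  -64 ... 56, multiples of 8
     offset_6bit_unsigned_scaled_p		    0 ... 504, multiples of 8
     aarch64_offset_7bit_signed_scaled_p	 -512 ... 504, multiples of 8
     offset_9bit_signed_unscaled_p		 -256 ... 255, any byte offset
     offset_9bit_signed_scaled_p		-2048 ... 2040, multiples of 8
     offset_12bit_unsigned_scaled_p		    0 ... 32760, multiples of 8  */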
4578
4579 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4580
4581 static sbitmap
4582 aarch64_get_separate_components (void)
4583 {
4584 aarch64_layout_frame ();
4585
4586 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4587 bitmap_clear (components);
4588
4589 /* The registers we need saved to the frame. */
4590 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4591 if (aarch64_register_saved_on_entry (regno))
4592 {
4593 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4594 if (!frame_pointer_needed)
4595 offset += cfun->machine->frame.frame_size
4596 - cfun->machine->frame.hard_fp_offset;
4597 /* Check that we can access the stack slot of the register with one
4598 direct load with no adjustments needed. */
4599 if (offset_12bit_unsigned_scaled_p (DImode, offset))
4600 bitmap_set_bit (components, regno);
4601 }
4602
4603 /* Don't mess with the hard frame pointer. */
4604 if (frame_pointer_needed)
4605 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4606
4607 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4608 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4609 /* If aarch64_layout_frame has chosen registers to store/restore with
4610 writeback don't interfere with them to avoid having to output explicit
4611 stack adjustment instructions. */
4612 if (reg2 != INVALID_REGNUM)
4613 bitmap_clear_bit (components, reg2);
4614 if (reg1 != INVALID_REGNUM)
4615 bitmap_clear_bit (components, reg1);
4616
4617 bitmap_clear_bit (components, LR_REGNUM);
4618 bitmap_clear_bit (components, SP_REGNUM);
4619
4620 return components;
4621 }
4622
4623 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4624
4625 static sbitmap
4626 aarch64_components_for_bb (basic_block bb)
4627 {
4628 bitmap in = DF_LIVE_IN (bb);
4629 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4630 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4631
4632 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4633 bitmap_clear (components);
4634
4635 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
4636 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4637 if ((!call_used_regs[regno])
4638 && (bitmap_bit_p (in, regno)
4639 || bitmap_bit_p (gen, regno)
4640 || bitmap_bit_p (kill, regno)))
4641 {
4642 unsigned regno2, offset, offset2;
4643 bitmap_set_bit (components, regno);
4644
4645 /* If there is a callee-save at an adjacent offset, add it too
4646 to increase the use of LDP/STP. */
4647 offset = cfun->machine->frame.reg_offset[regno];
4648 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
4649
4650 if (regno2 <= LAST_SAVED_REGNUM)
4651 {
4652 offset2 = cfun->machine->frame.reg_offset[regno2];
4653 if ((offset & ~8) == (offset2 & ~8))
4654 bitmap_set_bit (components, regno2);
4655 }
4656 }
4657
4658 return components;
4659 }
4660
4661 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4662 Nothing to do for aarch64. */
4663
4664 static void
4665 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
4666 {
4667 }
4668
4669 /* Return the next set bit in BMP from START onwards. Return the total number
4670 of bits in BMP if no set bit is found at or after START. */
4671
4672 static unsigned int
4673 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
4674 {
4675 unsigned int nbits = SBITMAP_SIZE (bmp);
4676 if (start == nbits)
4677 return start;
4678
4679 gcc_assert (start < nbits);
4680 for (unsigned int i = start; i < nbits; i++)
4681 if (bitmap_bit_p (bmp, i))
4682 return i;
4683
4684 return nbits;
4685 }
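
/* Illustrative sketch (not part of the build): the iteration idiom that
   aarch64_process_components below builds on, shown with a plain bool
   array standing in for an sbitmap; next_set_bit_example and
   walk_components_example are hypothetical stand-ins.  */
#if 0
static unsigned int
next_set_bit_example (const bool *bmp, unsigned int nbits, unsigned int start)
{
  for (unsigned int i = start; i < nbits; i++)
    if (bmp[i])
      return i;
  return nbits;
}

static void
walk_components_example (const bool *bmp, unsigned int nbits)
{
  unsigned int regno = next_set_bit_example (bmp, nbits, 0);
  while (regno != nbits)
    {
      /* ... save or restore REGNO here ...  */
      regno = next_set_bit_example (bmp, nbits, regno + 1);
    }
}
#endif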
4686
4687 /* Do the work for aarch64_emit_prologue_components and
4688 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
4689 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4690 for these components or the epilogue sequence. That is, it determines
4691 whether we should emit stores or loads and what kind of CFA notes to attach
4692 to the insns. Otherwise the logic for the two sequences is very
4693 similar. */
4694
4695 static void
4696 aarch64_process_components (sbitmap components, bool prologue_p)
4697 {
4698 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
4699 ? HARD_FRAME_POINTER_REGNUM
4700 : STACK_POINTER_REGNUM);
4701
4702 unsigned last_regno = SBITMAP_SIZE (components);
4703 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
4704 rtx_insn *insn = NULL;
4705
4706 while (regno != last_regno)
4707 {
4708 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
4709 so DFmode for the vector registers is enough. */
4710 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
4711 rtx reg = gen_rtx_REG (mode, regno);
4712 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4713 if (!frame_pointer_needed)
4714 offset += cfun->machine->frame.frame_size
4715 - cfun->machine->frame.hard_fp_offset;
4716 rtx addr = plus_constant (Pmode, ptr_reg, offset);
4717 rtx mem = gen_frame_mem (mode, addr);
4718
4719 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
4720 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
4721 /* No more registers to handle after REGNO.
4722 Emit a single save/restore and exit. */
4723 if (regno2 == last_regno)
4724 {
4725 insn = emit_insn (set);
4726 RTX_FRAME_RELATED_P (insn) = 1;
4727 if (prologue_p)
4728 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4729 else
4730 add_reg_note (insn, REG_CFA_RESTORE, reg);
4731 break;
4732 }
4733
4734 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
4735 /* The next register is not of the same class or its offset is not
4736 mergeable with the current one into a pair. */
4737 if (!satisfies_constraint_Ump (mem)
4738 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
4739 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
4740 GET_MODE_SIZE (mode)))
4741 {
4742 insn = emit_insn (set);
4743 RTX_FRAME_RELATED_P (insn) = 1;
4744 if (prologue_p)
4745 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4746 else
4747 add_reg_note (insn, REG_CFA_RESTORE, reg);
4748
4749 regno = regno2;
4750 continue;
4751 }
4752
4753 /* REGNO2 can be saved/restored in a pair with REGNO. */
4754 rtx reg2 = gen_rtx_REG (mode, regno2);
4755 if (!frame_pointer_needed)
4756 offset2 += cfun->machine->frame.frame_size
4757 - cfun->machine->frame.hard_fp_offset;
4758 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
4759 rtx mem2 = gen_frame_mem (mode, addr2);
4760 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
4761 : gen_rtx_SET (reg2, mem2);
4762
4763 if (prologue_p)
4764 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
4765 else
4766 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4767
4768 RTX_FRAME_RELATED_P (insn) = 1;
4769 if (prologue_p)
4770 {
4771 add_reg_note (insn, REG_CFA_OFFSET, set);
4772 add_reg_note (insn, REG_CFA_OFFSET, set2);
4773 }
4774 else
4775 {
4776 add_reg_note (insn, REG_CFA_RESTORE, reg);
4777 add_reg_note (insn, REG_CFA_RESTORE, reg2);
4778 }
4779
4780 regno = aarch64_get_next_set_bit (components, regno2 + 1);
4781 }
4782 }
4783
4784 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
4785
4786 static void
4787 aarch64_emit_prologue_components (sbitmap components)
4788 {
4789 aarch64_process_components (components, true);
4790 }
4791
4792 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
4793
4794 static void
4795 aarch64_emit_epilogue_components (sbitmap components)
4796 {
4797 aarch64_process_components (components, false);
4798 }
4799
4800 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
4801
4802 static void
4803 aarch64_set_handled_components (sbitmap components)
4804 {
4805 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4806 if (bitmap_bit_p (components, regno))
4807 cfun->machine->reg_is_wrapped_separately[regno] = true;
4808 }
4809
4810 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4811 is saved at BASE + OFFSET. */
4812
4813 static void
4814 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
4815 rtx base, poly_int64 offset)
4816 {
4817 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
4818 add_reg_note (insn, REG_CFA_EXPRESSION,
4819 gen_rtx_SET (mem, regno_reg_rtx[reg]));
4820 }
4821
4822 /* AArch64 stack frames generated by this compiler look like:
4823
4824 +-------------------------------+
4825 | |
4826 | incoming stack arguments |
4827 | |
4828 +-------------------------------+
4829 | | <-- incoming stack pointer (aligned)
4830 | callee-allocated save area |
4831 | for register varargs |
4832 | |
4833 +-------------------------------+
4834 | local variables | <-- frame_pointer_rtx
4835 | |
4836 +-------------------------------+
4837 | padding0 | \
4838 +-------------------------------+ |
4839 | callee-saved registers | | frame.saved_regs_size
4840 +-------------------------------+ |
4841 | LR' | |
4842 +-------------------------------+ |
4843 | FP' | / <- hard_frame_pointer_rtx (aligned)
4844 +-------------------------------+
4845 | dynamic allocation |
4846 +-------------------------------+
4847 | padding |
4848 +-------------------------------+
4849 | outgoing stack arguments | <-- arg_pointer
4850 | |
4851 +-------------------------------+
4852 | | <-- stack_pointer_rtx (aligned)
4853
4854 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4855 but leave frame_pointer_rtx and hard_frame_pointer_rtx
4856 unchanged. */
4857
4858 /* Generate the prologue instructions for entry into a function.
4859 Establish the stack frame by decreasing the stack pointer with a
4860 properly calculated size and, if necessary, create a frame record
4861 filled with the values of LR and previous frame pointer. The
4862 current FP is also set up if it is in use. */
4863
4864 void
4865 aarch64_expand_prologue (void)
4866 {
4867 aarch64_layout_frame ();
4868
4869 poly_int64 frame_size = cfun->machine->frame.frame_size;
4870 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4871 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4872 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4873 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4874 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4875 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4876 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
4877 rtx_insn *insn;
4878
4879 /* Sign return address for functions. */
4880 if (aarch64_return_address_signing_enabled ())
4881 {
4882 insn = emit_insn (gen_pacisp ());
4883 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4884 RTX_FRAME_RELATED_P (insn) = 1;
4885 }
4886
4887 if (flag_stack_usage_info)
4888 current_function_static_stack_size = constant_lower_bound (frame_size);
4889
4890 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4891 {
4892 if (crtl->is_leaf && !cfun->calls_alloca)
4893 {
4894 if (maybe_gt (frame_size, PROBE_INTERVAL)
4895 && maybe_gt (frame_size, get_stack_check_protect ()))
4896 aarch64_emit_probe_stack_range (get_stack_check_protect (),
4897 (frame_size
4898 - get_stack_check_protect ()));
4899 }
4900 else if (maybe_gt (frame_size, 0))
4901 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
4902 }
4903
4904 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4905 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4906
4907 aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);
4908
4909 if (callee_adjust != 0)
4910 aarch64_push_regs (reg1, reg2, callee_adjust);
4911
4912 if (emit_frame_chain)
4913 {
4914 poly_int64 reg_offset = callee_adjust;
4915 if (callee_adjust == 0)
4916 {
4917 reg1 = R29_REGNUM;
4918 reg2 = R30_REGNUM;
4919 reg_offset = callee_offset;
4920 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
4921 }
4922 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
4923 stack_pointer_rtx, callee_offset,
4924 ip1_rtx, ip0_rtx, frame_pointer_needed);
4925 if (frame_pointer_needed && !frame_size.is_constant ())
4926 {
4927 /* Variable-sized frames need to describe the save slot
4928 address using DW_CFA_expression rather than DW_CFA_offset.
4929 This means that, without taking further action, the
4930 locations of the registers that we've already saved would
4931 remain based on the stack pointer even after we redefine
4932 the CFA based on the frame pointer. We therefore need new
4933 DW_CFA_expressions to re-express the save slots with addresses
4934 based on the frame pointer. */
4935 rtx_insn *insn = get_last_insn ();
4936 gcc_assert (RTX_FRAME_RELATED_P (insn));
4937
4938 /* Add an explicit CFA definition if this was previously
4939 implicit. */
4940 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
4941 {
4942 rtx src = plus_constant (Pmode, stack_pointer_rtx,
4943 callee_offset);
4944 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4945 gen_rtx_SET (hard_frame_pointer_rtx, src));
4946 }
4947
4948 /* Change the save slot expressions for the registers that
4949 we've already saved. */
4950 reg_offset -= callee_offset;
4951 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
4952 reg_offset + UNITS_PER_WORD);
4953 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
4954 reg_offset);
4955 }
4956 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
4957 }
4958
4959 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4960 callee_adjust != 0 || emit_frame_chain);
4961 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4962 callee_adjust != 0 || emit_frame_chain);
4963 aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
4964 }
4965
4966 /* Return TRUE if we can use a simple_return insn.
4967
4968    This function checks whether the callee-saved stack is empty, which
4969    means no restore actions are needed.  The pro_and_epilogue pass will use
4970    this to check whether the shrink-wrapping optimization is feasible.  */
4971
4972 bool
4973 aarch64_use_return_insn_p (void)
4974 {
4975 if (!reload_completed)
4976 return false;
4977
4978 if (crtl->profile)
4979 return false;
4980
4981 aarch64_layout_frame ();
4982
4983 return known_eq (cfun->machine->frame.frame_size, 0);
4984 }
4985
4986 /* Generate the epilogue instructions for returning from a function.
4987    This is almost exactly the reverse of the prologue sequence, except
4988 that we need to insert barriers to avoid scheduling loads that read
4989 from a deallocated stack, and we optimize the unwind records by
4990 emitting them all together if possible. */
4991 void
4992 aarch64_expand_epilogue (bool for_sibcall)
4993 {
4994 aarch64_layout_frame ();
4995
4996 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4997 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4998 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4999 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5000 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5001 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5002 rtx cfi_ops = NULL;
5003 rtx_insn *insn;
5004 /* A stack clash protection prologue may not have left IP0_REGNUM or
5005 IP1_REGNUM in a usable state. The same is true for allocations
5006 with an SVE component, since we then need both temporary registers
5007 for each allocation. */
5008 bool can_inherit_p = (initial_adjust.is_constant ()
5009 && final_adjust.is_constant ()
5010 && !flag_stack_clash_protection);
5011
5012   /* We need to add a memory barrier to prevent reads from the deallocated stack.  */
5013 bool need_barrier_p
5014 = maybe_ne (get_frame_size ()
5015 + cfun->machine->frame.saved_varargs_size, 0);
5016
5017 /* Emit a barrier to prevent loads from a deallocated stack. */
5018 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
5019 || cfun->calls_alloca
5020 || crtl->calls_eh_return)
5021 {
5022 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5023 need_barrier_p = false;
5024 }
5025
5026 /* Restore the stack pointer from the frame pointer if it may not
5027 be the same as the stack pointer. */
5028 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
5029 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
5030 if (frame_pointer_needed
5031 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
5032 /* If writeback is used when restoring callee-saves, the CFA
5033 is restored on the instruction doing the writeback. */
5034 aarch64_add_offset (Pmode, stack_pointer_rtx,
5035 hard_frame_pointer_rtx, -callee_offset,
5036 ip1_rtx, ip0_rtx, callee_adjust == 0);
5037 else
5038 aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
5039 !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));
5040
5041 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5042 callee_adjust != 0, &cfi_ops);
5043 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5044 callee_adjust != 0, &cfi_ops);
5045
5046 if (need_barrier_p)
5047 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5048
5049 if (callee_adjust != 0)
5050 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
5051
5052 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
5053 {
5054 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
5055 insn = get_last_insn ();
5056 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5057 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5058 RTX_FRAME_RELATED_P (insn) = 1;
5059 cfi_ops = NULL;
5060 }
5061
5062 aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
5063 !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
5064
5065 if (cfi_ops)
5066 {
5067 /* Emit delayed restores and reset the CFA to be SP. */
5068 insn = get_last_insn ();
5069 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5070 REG_NOTES (insn) = cfi_ops;
5071 RTX_FRAME_RELATED_P (insn) = 1;
5072 }
5073
5074   /* We prefer to emit the combined return/authenticate instruction RETAA;
5075      however, there are three cases in which we must instead emit an explicit
5076 authentication instruction.
5077
5078 1) Sibcalls don't return in a normal way, so if we're about to call one
5079 we must authenticate.
5080
5081 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5082 generating code for !TARGET_ARMV8_3 we can't use it and must
5083 explicitly authenticate.
5084
5085 3) On an eh_return path we make extra stack adjustments to update the
5086 canonical frame address to be the exception handler's CFA. We want
5087 to authenticate using the CFA of the function which calls eh_return.
5088 */
5089 if (aarch64_return_address_signing_enabled ()
5090 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5091 {
5092 insn = emit_insn (gen_autisp ());
5093 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5094 RTX_FRAME_RELATED_P (insn) = 1;
5095 }
5096
5097 /* Stack adjustment for exception handler. */
5098 if (crtl->calls_eh_return)
5099 {
5100 /* We need to unwind the stack by the offset computed by
5101 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5102 to be SP; letting the CFA move during this adjustment
5103 is just as correct as retaining the CFA from the body
5104 of the function. Therefore, do nothing special. */
5105 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5106 }
5107
5108 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5109 if (!for_sibcall)
5110 emit_jump_insn (ret_rtx);
5111 }
5112
5113 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5114 normally or return to a previous frame after unwinding.
5115
5116 An EH return uses a single shared return sequence. The epilogue is
5117 exactly like a normal epilogue except that it has an extra input
5118 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5119 that must be applied after the frame has been destroyed. An extra label
5120 is inserted before the epilogue which initializes this register to zero,
5121 and this is the entry point for a normal return.
5122
5123 An actual EH return updates the return address, initializes the stack
5124 adjustment and jumps directly into the epilogue (bypassing the zeroing
5125 of the adjustment). Since the return address is typically saved on the
5126 stack when a function makes a call, the saved LR must be updated outside
5127 the epilogue.
5128
5129 This poses problems as the store is generated well before the epilogue,
5130 so the offset of LR is not known yet. Also optimizations will remove the
5131 store as it appears dead, even after the epilogue is generated (as the
5132 base or offset for loading LR is different in many cases).
5133
5134 To avoid these problems this implementation forces the frame pointer
5135 in eh_return functions so that the location of LR is fixed and known early.
5136 It also marks the store volatile, so no optimization is permitted to
5137 remove the store. */
5138 rtx
5139 aarch64_eh_return_handler_rtx (void)
5140 {
5141 rtx tmp = gen_frame_mem (Pmode,
5142 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5143
5144 /* Mark the store volatile, so no optimization is permitted to remove it. */
5145 MEM_VOLATILE_P (tmp) = true;
5146 return tmp;
5147 }
5148
5149 /* Output code to add DELTA to the first argument, and then jump
5150 to FUNCTION. Used for C++ multiple inheritance. */
5151 static void
5152 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5153 HOST_WIDE_INT delta,
5154 HOST_WIDE_INT vcall_offset,
5155 tree function)
5156 {
5157 /* The this pointer is always in x0. Note that this differs from
5158      Arm where the this pointer may be bumped to r1 if r0 is required
5159 to return a pointer to an aggregate. On AArch64 a result value
5160 pointer will be in x8. */
5161 int this_regno = R0_REGNUM;
5162 rtx this_rtx, temp0, temp1, addr, funexp;
5163 rtx_insn *insn;
5164
5165 reload_completed = 1;
5166 emit_note (NOTE_INSN_PROLOGUE_END);
5167
5168 this_rtx = gen_rtx_REG (Pmode, this_regno);
5169 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
5170 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
5171
5172 if (vcall_offset == 0)
5173 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5174 else
5175 {
5176 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5177
5178 addr = this_rtx;
5179 if (delta != 0)
5180 {
5181 if (delta >= -256 && delta < 256)
5182 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5183 plus_constant (Pmode, this_rtx, delta));
5184 else
5185 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5186 temp1, temp0, false);
5187 }
5188
5189 if (Pmode == ptr_mode)
5190 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5191 else
5192 aarch64_emit_move (temp0,
5193 gen_rtx_ZERO_EXTEND (Pmode,
5194 gen_rtx_MEM (ptr_mode, addr)));
5195
5196 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5197 addr = plus_constant (Pmode, temp0, vcall_offset);
5198 else
5199 {
5200 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5201 Pmode);
5202 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5203 }
5204
5205 if (Pmode == ptr_mode)
5206 	aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
5207 else
5208 aarch64_emit_move (temp1,
5209 gen_rtx_SIGN_EXTEND (Pmode,
5210 gen_rtx_MEM (ptr_mode, addr)));
5211
5212 emit_insn (gen_add2_insn (this_rtx, temp1));
5213 }
5214
5215 /* Generate a tail call to the target function. */
5216 if (!TREE_USED (function))
5217 {
5218 assemble_external (function);
5219 TREE_USED (function) = 1;
5220 }
5221 funexp = XEXP (DECL_RTL (function), 0);
5222 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5223 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5224 SIBLING_CALL_P (insn) = 1;
5225
5226 insn = get_insns ();
5227 shorten_branches (insn);
5228 final_start_function (insn, file, 1);
5229 final (insn, file, 1);
5230 final_end_function ();
5231
5232 /* Stop pretending to be a post-reload pass. */
5233 reload_completed = 0;
5234 }
5235
5236 static bool
5237 aarch64_tls_referenced_p (rtx x)
5238 {
5239 if (!TARGET_HAVE_TLS)
5240 return false;
5241 subrtx_iterator::array_type array;
5242 FOR_EACH_SUBRTX (iter, array, x, ALL)
5243 {
5244 const_rtx x = *iter;
5245 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5246 return true;
5247 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5248 TLS offsets, not real symbol references. */
5249 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5250 iter.skip_subrtxes ();
5251 }
5252 return false;
5253 }
5254
5255
5256 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5257 a left shift of 0 or 12 bits. */
5258 bool
5259 aarch64_uimm12_shift (HOST_WIDE_INT val)
5260 {
5261 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5262 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
5263 );
5264 }
5265
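/* Worked examples:
     aarch64_uimm12_shift (0xabc)    -> true  (fits in the low 12 bits)
     aarch64_uimm12_shift (0xabc000) -> true  (0xabc shifted left by 12)
     aarch64_uimm12_shift (0xabc001) -> false (needs bits in both halves)
   matching the "#imm12, LSL #0/#12" forms accepted by ADD/SUB (immediate).  */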
5266
5267 /* Return true if val is an immediate that can be loaded into a
5268 register by a MOVZ instruction. */
5269 static bool
5270 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5271 {
5272 if (GET_MODE_SIZE (mode) > 4)
5273 {
5274 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5275 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5276 return true;
5277 }
5278 else
5279 {
5280 /* Ignore sign extension. */
5281 val &= (HOST_WIDE_INT) 0xffffffff;
5282 }
5283 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5284 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
5285 }
5286
5287 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
5288 64-bit (DImode) integer. */
5289
5290 static unsigned HOST_WIDE_INT
5291 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5292 {
5293 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5294 while (size < 64)
5295 {
5296 val &= (HOST_WIDE_INT_1U << size) - 1;
5297 val |= val << size;
5298 size *= 2;
5299 }
5300 return val;
5301 }
5302
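/* For example, replicating the QImode value 0xa5 gives
   0xa5a5a5a5a5a5a5a5 and the HImode value 0x1234 gives
   0x1234123412341234.  */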
5303 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5304
5305 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5306 {
5307 0x0000000100000001ull,
5308 0x0001000100010001ull,
5309 0x0101010101010101ull,
5310 0x1111111111111111ull,
5311 0x5555555555555555ull,
5312 };
5313
5314
5315 /* Return true if val is a valid bitmask immediate. */
5316
5317 bool
5318 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
5319 {
5320 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
5321 int bits;
5322
5323 /* Check for a single sequence of one bits and return quickly if so.
5324 The special cases of all ones and all zeroes return false. */
5325 val = aarch64_replicate_bitmask_imm (val_in, mode);
5326 tmp = val + (val & -val);
5327
5328 if (tmp == (tmp & -tmp))
5329 return (val + 1) > 1;
5330
5331 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
5332 if (mode == SImode)
5333 val = (val << 32) | (val & 0xffffffff);
5334
5335 /* Invert if the immediate doesn't start with a zero bit - this means we
5336 only need to search for sequences of one bits. */
5337 if (val & 1)
5338 val = ~val;
5339
5340 /* Find the first set bit and set tmp to val with the first sequence of one
5341 bits removed. Return success if there is a single sequence of ones. */
5342 first_one = val & -val;
5343 tmp = val & (val + first_one);
5344
5345 if (tmp == 0)
5346 return true;
5347
5348 /* Find the next set bit and compute the difference in bit position. */
5349 next_one = tmp & -tmp;
5350 bits = clz_hwi (first_one) - clz_hwi (next_one);
5351 mask = val ^ tmp;
5352
5353 /* Check the bit position difference is a power of 2, and that the first
5354 sequence of one bits fits within 'bits' bits. */
5355 if ((mask >> bits) != 0 || bits != (bits & -bits))
5356 return false;
5357
5358 /* Check the sequence of one bits is repeated 64/bits times. */
5359 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
5360 }
5361
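/* Worked examples (DImode):
     0x00000000000003fc -> true, a single contiguous run of ones
                           (caught by the quick check above);
     0x0f0f0f0f0f0f0f0f -> true, the 8-bit element 0x0f repeated to
                           fill 64 bits;
     0x0000000000000005 -> false, the pattern 101 does not repeat to
                           fill 64 bits.  */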
5362 /* Create a mask of ones covering the range from the lowest to the highest
5363 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
5364
5365 unsigned HOST_WIDE_INT
5366 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5367 {
5368 int lowest_bit_set = ctz_hwi (val_in);
5369 int highest_bit_set = floor_log2 (val_in);
5370 gcc_assert (val_in != 0);
5371
5372 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5373 (HOST_WIDE_INT_1U << lowest_bit_set));
5374 }
5375
5376 /* Create a constant in which all bits outside the range from the lowest to
5377 the highest bit set in VAL_IN are set to 1. */
5378
5379 unsigned HOST_WIDE_INT
5380 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5381 {
5382 return val_in | ~aarch64_and_split_imm1 (val_in);
5383 }
5384
5385 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
5386
5387 bool
5388 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5389 {
5390 scalar_int_mode int_mode;
5391 if (!is_a <scalar_int_mode> (mode, &int_mode))
5392 return false;
5393
5394 if (aarch64_bitmask_imm (val_in, int_mode))
5395 return false;
5396
5397 if (aarch64_move_imm (val_in, int_mode))
5398 return false;
5399
5400 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5401
5402 return aarch64_bitmask_imm (imm2, int_mode);
5403 }
5404
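/* Worked example (DImode): VAL_IN = 0x00ff0f00 is neither a bitmask nor
   a MOV immediate, but
     aarch64_and_split_imm1 (0x00ff0f00) == 0x0000000000ffff00
     aarch64_and_split_imm2 (0x00ff0f00) == 0xffffffffffff0fff
   are both valid bitmask immediates and imm1 & imm2 == VAL_IN, so
   aarch64_and_bitmask_imm returns true and the AND can be done as two
   AND-immediate instructions.  */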
5405 /* Return true if val is an immediate that can be loaded into a
5406 register in a single instruction. */
5407 bool
5408 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
5409 {
5410 scalar_int_mode int_mode;
5411 if (!is_a <scalar_int_mode> (mode, &int_mode))
5412 return false;
5413
5414 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
5415 return true;
5416 return aarch64_bitmask_imm (val, int_mode);
5417 }
5418
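/* Examples (DImode):
     0x0000000000001234 -> true, a single MOVZ;
     0xffffffffffffaaaa -> true, a single MOVN (of 0x5555);
     0x5555555555555555 -> true, a bitmask immediate (MOV alias of ORR);
     0x0000000012345678 -> false, would need MOVZ + MOVK.  */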
5419 static bool
5420 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
5421 {
5422 rtx base, offset;
5423
5424 if (GET_CODE (x) == HIGH)
5425 return true;
5426
5427 /* There's no way to calculate VL-based values using relocations. */
5428 subrtx_iterator::array_type array;
5429 FOR_EACH_SUBRTX (iter, array, x, ALL)
5430 if (GET_CODE (*iter) == CONST_POLY_INT)
5431 return true;
5432
5433 split_const (x, &base, &offset);
5434 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
5435 {
5436 if (aarch64_classify_symbol (base, INTVAL (offset))
5437 != SYMBOL_FORCE_TO_MEM)
5438 return true;
5439 else
5440 /* Avoid generating a 64-bit relocation in ILP32; leave
5441 to aarch64_expand_mov_immediate to handle it properly. */
5442 return mode != ptr_mode;
5443 }
5444
5445 return aarch64_tls_referenced_p (x);
5446 }
5447
5448 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5449 The expansion of a table switch is quite expensive due to the number
5450 of instructions, the table lookup and the hard-to-predict indirect jump.
5451 When optimizing for speed at -O3 and above, use the per-core tuning if
5452 it is set; otherwise use tables for more than 16 cases as a tradeoff
5453 between size and performance. When optimizing for size, use the default setting. */
5454
5455 static unsigned int
5456 aarch64_case_values_threshold (void)
5457 {
5458 /* Use the specified limit for the number of cases before using jump
5459 tables at higher optimization levels. */
5460 if (optimize > 2
5461 && selected_cpu->tune->max_case_values != 0)
5462 return selected_cpu->tune->max_case_values;
5463 else
5464 return optimize_size ? default_case_values_threshold () : 17;
5465 }
5466
5467 /* Return true if register REGNO is a valid index register.
5468 STRICT_P is true if REG_OK_STRICT is in effect. */
5469
5470 bool
5471 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
5472 {
5473 if (!HARD_REGISTER_NUM_P (regno))
5474 {
5475 if (!strict_p)
5476 return true;
5477
5478 if (!reg_renumber)
5479 return false;
5480
5481 regno = reg_renumber[regno];
5482 }
5483 return GP_REGNUM_P (regno);
5484 }
5485
5486 /* Return true if register REGNO is a valid base register.
5487 STRICT_P is true if REG_OK_STRICT is in effect. */
5488
5489 bool
5490 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
5491 {
5492 if (!HARD_REGISTER_NUM_P (regno))
5493 {
5494 if (!strict_p)
5495 return true;
5496
5497 if (!reg_renumber)
5498 return false;
5499
5500 regno = reg_renumber[regno];
5501 }
5502
5503 /* The fake registers will be eliminated to either the stack or
5504 hard frame pointer, both of which are usually valid base registers.
5505 Reload deals with the cases where the eliminated form isn't valid. */
5506 return (GP_REGNUM_P (regno)
5507 || regno == SP_REGNUM
5508 || regno == FRAME_POINTER_REGNUM
5509 || regno == ARG_POINTER_REGNUM);
5510 }
5511
5512 /* Return true if X is a valid base register.
5513 STRICT_P is true if REG_OK_STRICT is in effect. */
5514
5515 static bool
5516 aarch64_base_register_rtx_p (rtx x, bool strict_p)
5517 {
5518 if (!strict_p
5519 && GET_CODE (x) == SUBREG
5520 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
5521 x = SUBREG_REG (x);
5522
5523 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
5524 }
5525
5526 /* Return true if address offset is a valid index. If it is, fill in INFO
5527 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
5528
5529 static bool
5530 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
5531 machine_mode mode, bool strict_p)
5532 {
5533 enum aarch64_address_type type;
5534 rtx index;
5535 int shift;
5536
5537 /* (reg:P) */
5538 if ((REG_P (x) || GET_CODE (x) == SUBREG)
5539 && GET_MODE (x) == Pmode)
5540 {
5541 type = ADDRESS_REG_REG;
5542 index = x;
5543 shift = 0;
5544 }
5545 /* (sign_extend:DI (reg:SI)) */
5546 else if ((GET_CODE (x) == SIGN_EXTEND
5547 || GET_CODE (x) == ZERO_EXTEND)
5548 && GET_MODE (x) == DImode
5549 && GET_MODE (XEXP (x, 0)) == SImode)
5550 {
5551 type = (GET_CODE (x) == SIGN_EXTEND)
5552 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5553 index = XEXP (x, 0);
5554 shift = 0;
5555 }
5556 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5557 else if (GET_CODE (x) == MULT
5558 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5559 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5560 && GET_MODE (XEXP (x, 0)) == DImode
5561 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5562 && CONST_INT_P (XEXP (x, 1)))
5563 {
5564 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5565 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5566 index = XEXP (XEXP (x, 0), 0);
5567 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5568 }
5569 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5570 else if (GET_CODE (x) == ASHIFT
5571 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5572 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5573 && GET_MODE (XEXP (x, 0)) == DImode
5574 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5575 && CONST_INT_P (XEXP (x, 1)))
5576 {
5577 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5578 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5579 index = XEXP (XEXP (x, 0), 0);
5580 shift = INTVAL (XEXP (x, 1));
5581 }
5582 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5583 else if ((GET_CODE (x) == SIGN_EXTRACT
5584 || GET_CODE (x) == ZERO_EXTRACT)
5585 && GET_MODE (x) == DImode
5586 && GET_CODE (XEXP (x, 0)) == MULT
5587 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5588 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5589 {
5590 type = (GET_CODE (x) == SIGN_EXTRACT)
5591 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5592 index = XEXP (XEXP (x, 0), 0);
5593 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5594 if (INTVAL (XEXP (x, 1)) != 32 + shift
5595 || INTVAL (XEXP (x, 2)) != 0)
5596 shift = -1;
5597 }
5598 /* (and:DI (mult:DI (reg:DI) (const_int scale))
5599 (const_int 0xffffffff<<shift)) */
5600 else if (GET_CODE (x) == AND
5601 && GET_MODE (x) == DImode
5602 && GET_CODE (XEXP (x, 0)) == MULT
5603 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5604 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5605 && CONST_INT_P (XEXP (x, 1)))
5606 {
5607 type = ADDRESS_REG_UXTW;
5608 index = XEXP (XEXP (x, 0), 0);
5609 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5610 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5611 shift = -1;
5612 }
5613 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5614 else if ((GET_CODE (x) == SIGN_EXTRACT
5615 || GET_CODE (x) == ZERO_EXTRACT)
5616 && GET_MODE (x) == DImode
5617 && GET_CODE (XEXP (x, 0)) == ASHIFT
5618 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5619 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5620 {
5621 type = (GET_CODE (x) == SIGN_EXTRACT)
5622 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5623 index = XEXP (XEXP (x, 0), 0);
5624 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5625 if (INTVAL (XEXP (x, 1)) != 32 + shift
5626 || INTVAL (XEXP (x, 2)) != 0)
5627 shift = -1;
5628 }
5629 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5630 (const_int 0xffffffff<<shift)) */
5631 else if (GET_CODE (x) == AND
5632 && GET_MODE (x) == DImode
5633 && GET_CODE (XEXP (x, 0)) == ASHIFT
5634 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5635 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5636 && CONST_INT_P (XEXP (x, 1)))
5637 {
5638 type = ADDRESS_REG_UXTW;
5639 index = XEXP (XEXP (x, 0), 0);
5640 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5641 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5642 shift = -1;
5643 }
5644 /* (mult:P (reg:P) (const_int scale)) */
5645 else if (GET_CODE (x) == MULT
5646 && GET_MODE (x) == Pmode
5647 && GET_MODE (XEXP (x, 0)) == Pmode
5648 && CONST_INT_P (XEXP (x, 1)))
5649 {
5650 type = ADDRESS_REG_REG;
5651 index = XEXP (x, 0);
5652 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5653 }
5654 /* (ashift:P (reg:P) (const_int shift)) */
5655 else if (GET_CODE (x) == ASHIFT
5656 && GET_MODE (x) == Pmode
5657 && GET_MODE (XEXP (x, 0)) == Pmode
5658 && CONST_INT_P (XEXP (x, 1)))
5659 {
5660 type = ADDRESS_REG_REG;
5661 index = XEXP (x, 0);
5662 shift = INTVAL (XEXP (x, 1));
5663 }
5664 else
5665 return false;
5666
5667 if (!strict_p
5668 && GET_CODE (index) == SUBREG
5669 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
5670 index = SUBREG_REG (index);
5671
5672 if (aarch64_sve_data_mode_p (mode))
5673 {
5674 if (type != ADDRESS_REG_REG
5675 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
5676 return false;
5677 }
5678 else
5679 {
5680 if (shift != 0
5681 && !(IN_RANGE (shift, 1, 3)
5682 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
5683 return false;
5684 }
5685
5686 if (REG_P (index)
5687 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
5688 {
5689 info->type = type;
5690 info->offset = index;
5691 info->shift = shift;
5692 return true;
5693 }
5694
5695 return false;
5696 }
5697
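/* For example, for a DImode access the index expression
     (ashift:DI (sign_extend:DI (reg:SI <w2>)) (const_int 3))
   (register numbers purely illustrative) is classified as
   ADDRESS_REG_SXTW with shift 3, i.e. an address operand of the form
   [<xN>, <w2>, sxtw 3].  */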
5698 /* Return true if MODE is one of the modes for which we
5699 support LDP/STP operations. */
5700
5701 static bool
5702 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
5703 {
5704 return mode == SImode || mode == DImode
5705 || mode == SFmode || mode == DFmode
5706 || (aarch64_vector_mode_supported_p (mode)
5707 && (known_eq (GET_MODE_SIZE (mode), 8)
5708 || (known_eq (GET_MODE_SIZE (mode), 16)
5709 && (aarch64_tune_params.extra_tuning_flags
5710 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
5711 }
5712
5713 /* Return true if REGNO is a virtual pointer register, or an eliminable
5714 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
5715 include stack_pointer or hard_frame_pointer. */
5716 static bool
5717 virt_or_elim_regno_p (unsigned regno)
5718 {
5719 return ((regno >= FIRST_VIRTUAL_REGISTER
5720 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
5721 || regno == FRAME_POINTER_REGNUM
5722 || regno == ARG_POINTER_REGNUM);
5723 }
5724
5725 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5726 If it is, fill in INFO appropriately. STRICT_P is true if
5727 REG_OK_STRICT is in effect. */
5728
5729 static bool
5730 aarch64_classify_address (struct aarch64_address_info *info,
5731 rtx x, machine_mode mode, bool strict_p,
5732 aarch64_addr_query_type type = ADDR_QUERY_M)
5733 {
5734 enum rtx_code code = GET_CODE (x);
5735 rtx op0, op1;
5736 poly_int64 offset;
5737
5738 HOST_WIDE_INT const_size;
5739
5740 /* On BE, we use load/store pair for all large int mode load/stores.
5741 TI/TFmode may also use a load/store pair. */
5742 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5743 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
5744 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
5745 || mode == TImode
5746 || mode == TFmode
5747 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
5748
5749 bool allow_reg_index_p = (!load_store_pair_p
5750 && (known_lt (GET_MODE_SIZE (mode), 16)
5751 || vec_flags == VEC_ADVSIMD
5752 || vec_flags == VEC_SVE_DATA));
5753
5754 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5755 [Rn, #offset, MUL VL]. */
5756 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
5757 && (code != REG && code != PLUS))
5758 return false;
5759
5760 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5761 REG addressing. */
5762 if (advsimd_struct_p
5763 && !BYTES_BIG_ENDIAN
5764 && (code != POST_INC && code != REG))
5765 return false;
5766
5767 gcc_checking_assert (GET_MODE (x) == VOIDmode
5768 || SCALAR_INT_MODE_P (GET_MODE (x)));
5769
5770 switch (code)
5771 {
5772 case REG:
5773 case SUBREG:
5774 info->type = ADDRESS_REG_IMM;
5775 info->base = x;
5776 info->offset = const0_rtx;
5777 info->const_offset = 0;
5778 return aarch64_base_register_rtx_p (x, strict_p);
5779
5780 case PLUS:
5781 op0 = XEXP (x, 0);
5782 op1 = XEXP (x, 1);
5783
5784 if (! strict_p
5785 && REG_P (op0)
5786 && virt_or_elim_regno_p (REGNO (op0))
5787 && poly_int_rtx_p (op1, &offset))
5788 {
5789 info->type = ADDRESS_REG_IMM;
5790 info->base = op0;
5791 info->offset = op1;
5792 info->const_offset = offset;
5793
5794 return true;
5795 }
5796
5797 if (maybe_ne (GET_MODE_SIZE (mode), 0)
5798 && aarch64_base_register_rtx_p (op0, strict_p)
5799 && poly_int_rtx_p (op1, &offset))
5800 {
5801 info->type = ADDRESS_REG_IMM;
5802 info->base = op0;
5803 info->offset = op1;
5804 info->const_offset = offset;
5805
5806 /* TImode and TFmode values are allowed in both pairs of X
5807 registers and individual Q registers. The available
5808 address modes are:
5809 X,X: 7-bit signed scaled offset
5810 Q: 9-bit signed offset
5811 We conservatively require an offset representable in either mode.
5812 When performing the check for pairs of X registers i.e. LDP/STP
5813 pass down DImode since that is the natural size of the LDP/STP
5814 instruction memory accesses. */
5815 if (mode == TImode || mode == TFmode)
5816 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
5817 && (offset_9bit_signed_unscaled_p (mode, offset)
5818 || offset_12bit_unsigned_scaled_p (mode, offset)));
5819
5820 /* A 7-bit offset check because OImode will emit an ldp/stp
5821 instruction (only big-endian will get here).
5822 For ldp/stp instructions, the offset is scaled for the size of a
5823 single element of the pair. */
5824 if (mode == OImode)
5825 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
5826
5827 /* Three 9/12-bit offset checks because CImode will emit three
5828 ldr/str instructions (only big-endian will get here). */
5829 if (mode == CImode)
5830 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5831 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
5832 || offset_12bit_unsigned_scaled_p (V16QImode,
5833 offset + 32)));
5834
5835 /* Two 7-bit offset checks because XImode will emit two ldp/stp
5836 instructions (only big-endian will get here). */
5837 if (mode == XImode)
5838 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5839 && aarch64_offset_7bit_signed_scaled_p (TImode,
5840 offset + 32));
5841
5842 /* Make "m" use the LD1 offset range for SVE data modes, so
5843 that pre-RTL optimizers like ivopts will work to that range
5844 instead of the wider LDR/STR range. */
5845 if (vec_flags == VEC_SVE_DATA)
5846 return (type == ADDR_QUERY_M
5847 ? offset_4bit_signed_scaled_p (mode, offset)
5848 : offset_9bit_signed_scaled_p (mode, offset));
5849
5850 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
5851 {
5852 poly_int64 end_offset = (offset
5853 + GET_MODE_SIZE (mode)
5854 - BYTES_PER_SVE_VECTOR);
5855 return (type == ADDR_QUERY_M
5856 ? offset_4bit_signed_scaled_p (mode, offset)
5857 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
5858 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
5859 end_offset)));
5860 }
5861
5862 if (vec_flags == VEC_SVE_PRED)
5863 return offset_9bit_signed_scaled_p (mode, offset);
5864
5865 if (load_store_pair_p)
5866 return ((known_eq (GET_MODE_SIZE (mode), 4)
5867 || known_eq (GET_MODE_SIZE (mode), 8)
5868 || known_eq (GET_MODE_SIZE (mode), 16))
5869 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5870 else
5871 return (offset_9bit_signed_unscaled_p (mode, offset)
5872 || offset_12bit_unsigned_scaled_p (mode, offset));
5873 }
5874
5875 if (allow_reg_index_p)
5876 {
5877 /* Look for base + (scaled/extended) index register. */
5878 if (aarch64_base_register_rtx_p (op0, strict_p)
5879 && aarch64_classify_index (info, op1, mode, strict_p))
5880 {
5881 info->base = op0;
5882 return true;
5883 }
5884 if (aarch64_base_register_rtx_p (op1, strict_p)
5885 && aarch64_classify_index (info, op0, mode, strict_p))
5886 {
5887 info->base = op1;
5888 return true;
5889 }
5890 }
5891
5892 return false;
5893
5894 case POST_INC:
5895 case POST_DEC:
5896 case PRE_INC:
5897 case PRE_DEC:
5898 info->type = ADDRESS_REG_WB;
5899 info->base = XEXP (x, 0);
5900 info->offset = NULL_RTX;
5901 return aarch64_base_register_rtx_p (info->base, strict_p);
5902
5903 case POST_MODIFY:
5904 case PRE_MODIFY:
5905 info->type = ADDRESS_REG_WB;
5906 info->base = XEXP (x, 0);
5907 if (GET_CODE (XEXP (x, 1)) == PLUS
5908 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
5909 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
5910 && aarch64_base_register_rtx_p (info->base, strict_p))
5911 {
5912 info->offset = XEXP (XEXP (x, 1), 1);
5913 info->const_offset = offset;
5914
5915 /* TImode and TFmode values are allowed in both pairs of X
5916 registers and individual Q registers. The available
5917 address modes are:
5918 X,X: 7-bit signed scaled offset
5919 Q: 9-bit signed offset
5920 We conservatively require an offset representable in either mode.
5921 */
5922 if (mode == TImode || mode == TFmode)
5923 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
5924 && offset_9bit_signed_unscaled_p (mode, offset));
5925
5926 if (load_store_pair_p)
5927 return ((known_eq (GET_MODE_SIZE (mode), 4)
5928 || known_eq (GET_MODE_SIZE (mode), 8)
5929 || known_eq (GET_MODE_SIZE (mode), 16))
5930 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5931 else
5932 return offset_9bit_signed_unscaled_p (mode, offset);
5933 }
5934 return false;
5935
5936 case CONST:
5937 case SYMBOL_REF:
5938 case LABEL_REF:
5939 /* Load literal: a pc-relative constant pool entry. Only supported
5940 for SImode or larger. */
5941 info->type = ADDRESS_SYMBOLIC;
5942
5943 if (!load_store_pair_p
5944 && GET_MODE_SIZE (mode).is_constant (&const_size)
5945 && const_size >= 4)
5946 {
5947 rtx sym, addend;
5948
5949 split_const (x, &sym, &addend);
5950 return ((GET_CODE (sym) == LABEL_REF
5951 || (GET_CODE (sym) == SYMBOL_REF
5952 && CONSTANT_POOL_ADDRESS_P (sym)
5953 && aarch64_pcrelative_literal_loads)));
5954 }
5955 return false;
5956
5957 case LO_SUM:
5958 info->type = ADDRESS_LO_SUM;
5959 info->base = XEXP (x, 0);
5960 info->offset = XEXP (x, 1);
5961 if (allow_reg_index_p
5962 && aarch64_base_register_rtx_p (info->base, strict_p))
5963 {
5964 rtx sym, offs;
5965 split_const (info->offset, &sym, &offs);
5966 if (GET_CODE (sym) == SYMBOL_REF
5967 && (aarch64_classify_symbol (sym, INTVAL (offs))
5968 == SYMBOL_SMALL_ABSOLUTE))
5969 {
5970 /* The symbol and offset must be aligned to the access size. */
5971 unsigned int align;
5972
5973 if (CONSTANT_POOL_ADDRESS_P (sym))
5974 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
5975 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
5976 {
5977 tree exp = SYMBOL_REF_DECL (sym);
5978 align = TYPE_ALIGN (TREE_TYPE (exp));
5979 align = aarch64_constant_alignment (exp, align);
5980 }
5981 else if (SYMBOL_REF_DECL (sym))
5982 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
5983 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
5984 && SYMBOL_REF_BLOCK (sym) != NULL)
5985 align = SYMBOL_REF_BLOCK (sym)->alignment;
5986 else
5987 align = BITS_PER_UNIT;
5988
5989 poly_int64 ref_size = GET_MODE_SIZE (mode);
5990 if (known_eq (ref_size, 0))
5991 ref_size = GET_MODE_SIZE (DImode);
5992
5993 return (multiple_p (INTVAL (offs), ref_size)
5994 && multiple_p (align / BITS_PER_UNIT, ref_size));
5995 }
5996 }
5997 return false;
5998
5999 default:
6000 return false;
6001 }
6002 }
6003
6004 /* Return true if the address X is valid for a PRFM instruction.
6005 STRICT_P is true if we should do strict checking with
6006 aarch64_classify_address. */
6007
6008 bool
6009 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
6010 {
6011 struct aarch64_address_info addr;
6012
6013 /* PRFM accepts the same addresses as DImode... */
6014 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
6015 if (!res)
6016 return false;
6017
6018 /* ... except writeback forms. */
6019 return addr.type != ADDRESS_REG_WB;
6020 }
6021
6022 bool
6023 aarch64_symbolic_address_p (rtx x)
6024 {
6025 rtx offset;
6026
6027 split_const (x, &x, &offset);
6028 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
6029 }
6030
6031 /* Classify the base of symbolic expression X. */
6032
6033 enum aarch64_symbol_type
6034 aarch64_classify_symbolic_expression (rtx x)
6035 {
6036 rtx offset;
6037
6038 split_const (x, &x, &offset);
6039 return aarch64_classify_symbol (x, INTVAL (offset));
6040 }
6041
6042
6043 /* Return TRUE if X is a legitimate address for accessing memory in
6044 mode MODE. */
6045 static bool
6046 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
6047 {
6048 struct aarch64_address_info addr;
6049
6050 return aarch64_classify_address (&addr, x, mode, strict_p);
6051 }
6052
6053 /* Return TRUE if X is a legitimate address of type TYPE for accessing
6054 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
6055 bool
6056 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
6057 aarch64_addr_query_type type)
6058 {
6059 struct aarch64_address_info addr;
6060
6061 return aarch64_classify_address (&addr, x, mode, strict_p, type);
6062 }
6063
6064 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
6065
6066 static bool
6067 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6068 poly_int64 orig_offset,
6069 machine_mode mode)
6070 {
6071 HOST_WIDE_INT size;
6072 if (GET_MODE_SIZE (mode).is_constant (&size))
6073 {
6074 HOST_WIDE_INT const_offset, second_offset;
6075
6076 /* A general SVE offset is A * VQ + B. Remove the A component from
6077 coefficient 0 in order to get the constant B. */
6078 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6079
6080 /* Split an out-of-range address displacement into a base and
6081 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
6082 range otherwise to increase opportunities for sharing the base
6083 address of different sizes. Unaligned accesses use the signed
6084 9-bit range, TImode/TFmode use the intersection of signed
6085 scaled 7-bit and signed 9-bit offset. */
6086 if (mode == TImode || mode == TFmode)
6087 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6088 else if ((const_offset & (size - 1)) != 0)
6089 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6090 else
6091 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6092
6093 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6094 return false;
6095
6096 /* Split the offset into second_offset and the rest. */
6097 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6098 *offset2 = gen_int_mode (second_offset, Pmode);
6099 return true;
6100 }
6101 else
6102 {
6103 /* Get the mode we should use as the basis of the range. For structure
6104 modes this is the mode of one vector. */
6105 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6106 machine_mode step_mode
6107 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6108
6109 /* Get the "mul vl" multiplier we'd like to use. */
6110 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6111 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6112 if (vec_flags & VEC_SVE_DATA)
6113 /* LDR supports a 9-bit range, but the move patterns for
6114 structure modes require all vectors to be in range of the
6115 same base. The simplest way of accommodating that while still
6116 promoting reuse of anchor points between different modes is
6117 to use an 8-bit range unconditionally. */
6118 vnum = ((vnum + 128) & 255) - 128;
6119 else
6120 /* Predicates are only handled singly, so we might as well use
6121 the full range. */
6122 vnum = ((vnum + 256) & 511) - 256;
6123 if (vnum == 0)
6124 return false;
6125
6126 /* Convert the "mul vl" multiplier into a byte offset. */
6127 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6128 if (known_eq (second_offset, orig_offset))
6129 return false;
6130
6131 /* Split the offset into second_offset and the rest. */
6132 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6133 *offset2 = gen_int_mode (second_offset, Pmode);
6134 return true;
6135 }
6136 }
6137
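/* Worked example: a DImode access at constant offset 0x10008 is aligned,
   so second_offset = 0x10008 & 0x3ffc = 0x8 and the address is split as
   (base + 0x10000) + 0x8, allowing the anchor base + 0x10000 to be
   shared with neighbouring accesses.  */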
6138 /* Return the binary representation of floating point constant VALUE in INTVAL.
6139 If the value cannot be converted, return false without setting INTVAL.
6140 The conversion is done in the mode of VALUE. */
6141 bool
6142 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6143 {
6144
6145 /* We make a general exception for 0. */
6146 if (aarch64_float_const_zero_rtx_p (value))
6147 {
6148 *intval = 0;
6149 return true;
6150 }
6151
6152 scalar_float_mode mode;
6153 if (GET_CODE (value) != CONST_DOUBLE
6154 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6155 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6156 /* Only support up to DF mode. */
6157 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6158 return false;
6159
6160 unsigned HOST_WIDE_INT ival = 0;
6161
6162 long res[2];
6163 real_to_target (res,
6164 CONST_DOUBLE_REAL_VALUE (value),
6165 REAL_MODE_FORMAT (mode));
6166
6167 if (mode == DFmode)
6168 {
6169 int order = BYTES_BIG_ENDIAN ? 1 : 0;
6170 ival = zext_hwi (res[order], 32);
6171 ival |= (zext_hwi (res[1 - order], 32) << 32);
6172 }
6173 else
6174 ival = zext_hwi (res[0], 32);
6175
6176 *intval = ival;
6177 return true;
6178 }
6179
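/* For example, (const_double:DF 1.0) yields 0x3ff0000000000000 and
   (const_double:SF 1.0) yields 0x3f800000 in *INTVAL.  */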
6180 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6181 single MOV(+MOVK) followed by an FMOV. */
6182 bool
6183 aarch64_float_const_rtx_p (rtx x)
6184 {
6185 machine_mode mode = GET_MODE (x);
6186 if (mode == VOIDmode)
6187 return false;
6188
6189 /* Determine whether it's cheaper to materialize float constants as
6190 mov/movk pairs rather than ldr/adrp pairs. */
6191 unsigned HOST_WIDE_INT ival;
6192
6193 if (GET_CODE (x) == CONST_DOUBLE
6194 && SCALAR_FLOAT_MODE_P (mode)
6195 && aarch64_reinterpret_float_as_int (x, &ival))
6196 {
6197 scalar_int_mode imode = (mode == HFmode
6198 ? SImode
6199 : int_mode_for_mode (mode).require ());
6200 int num_instr = aarch64_internal_mov_immediate
6201 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6202 return num_instr < 3;
6203 }
6204
6205 return false;
6206 }
6207
6208 /* Return TRUE if rtx X is the immediate constant 0.0. */
6209 bool
6210 aarch64_float_const_zero_rtx_p (rtx x)
6211 {
6212 if (GET_MODE (x) == VOIDmode)
6213 return false;
6214
6215 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
6216 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
6217 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6218 }
6219
6220 /* Return TRUE if rtx X is an immediate constant that fits in a single
6221 MOVI operation. */
6222 bool
6223 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6224 {
6225 if (!TARGET_SIMD)
6226 return false;
6227
6228 machine_mode vmode;
6229 scalar_int_mode imode;
6230 unsigned HOST_WIDE_INT ival;
6231
6232 if (GET_CODE (x) == CONST_DOUBLE
6233 && SCALAR_FLOAT_MODE_P (mode))
6234 {
6235 if (!aarch64_reinterpret_float_as_int (x, &ival))
6236 return false;
6237
6238 /* We make a general exception for 0. */
6239 if (aarch64_float_const_zero_rtx_p (x))
6240 return true;
6241
6242 imode = int_mode_for_mode (mode).require ();
6243 }
6244 else if (GET_CODE (x) == CONST_INT
6245 && is_a <scalar_int_mode> (mode, &imode))
6246 ival = INTVAL (x);
6247 else
6248 return false;
6249
6250 /* Use a 64-bit container mode for everything except DI/DF mode, where we
6251 use a 128-bit vector mode. */
6252 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6253
6254 vmode = aarch64_simd_container_mode (imode, width);
6255 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
6256
6257 return aarch64_simd_valid_immediate (v_op, NULL);
6258 }
6259
6260
6261 /* Return the fixed registers used for condition codes. */
6262
6263 static bool
6264 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
6265 {
6266 *p1 = CC_REGNUM;
6267 *p2 = INVALID_REGNUM;
6268 return true;
6269 }
6270
6271 /* This function is used by the call expanders of the machine description.
6272 RESULT is the register in which the result is returned. It's NULL for
6273 "call" and "sibcall".
6274 MEM is the location of the function call.
6275 SIBCALL indicates whether this function call is a normal call or a sibling
6276 call; a different pattern is generated accordingly. */
6277
6278 void
6279 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
6280 {
6281 rtx call, callee, tmp;
6282 rtvec vec;
6283 machine_mode mode;
6284
6285 gcc_assert (MEM_P (mem));
6286 callee = XEXP (mem, 0);
6287 mode = GET_MODE (callee);
6288 gcc_assert (mode == Pmode);
6289
6290 /* Decide if we should generate indirect calls by loading the
6291 address of the callee into a register before performing
6292 the branch-and-link. */
6293 if (SYMBOL_REF_P (callee)
6294 ? (aarch64_is_long_call_p (callee)
6295 || aarch64_is_noplt_call_p (callee))
6296 : !REG_P (callee))
6297 XEXP (mem, 0) = force_reg (mode, callee);
6298
6299 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
6300
6301 if (result != NULL_RTX)
6302 call = gen_rtx_SET (result, call);
6303
6304 if (sibcall)
6305 tmp = ret_rtx;
6306 else
6307 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
6308
6309 vec = gen_rtvec (2, call, tmp);
6310 call = gen_rtx_PARALLEL (VOIDmode, vec);
6311
6312 aarch64_emit_call_insn (call);
6313 }
6314
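/* Sketch of the RTL shape produced (not a literal dump): a plain call
   through a register becomes
     (parallel [(call (mem <callee-reg>) (const_int 0))
                (clobber (reg:DI LR))])
   a sibcall uses (return) in place of the LR clobber, and a call with a
   result wraps the CALL inside (set <result-reg> ...).  */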
6315 /* Emit call insn with PAT and do aarch64-specific handling. */
6316
6317 void
6318 aarch64_emit_call_insn (rtx pat)
6319 {
6320 rtx insn = emit_call_insn (pat);
6321
6322 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
6323 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
6324 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
6325 }
6326
6327 machine_mode
6328 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
6329 {
6330 /* All floating-point compares return CCFP if it is an equality
6331 comparison or an unordered-aware comparison, and CCFPE otherwise. */
6332 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
6333 {
6334 switch (code)
6335 {
6336 case EQ:
6337 case NE:
6338 case UNORDERED:
6339 case ORDERED:
6340 case UNLT:
6341 case UNLE:
6342 case UNGT:
6343 case UNGE:
6344 case UNEQ:
6345 return CCFPmode;
6346
6347 case LT:
6348 case LE:
6349 case GT:
6350 case GE:
6351 case LTGT:
6352 return CCFPEmode;
6353
6354 default:
6355 gcc_unreachable ();
6356 }
6357 }
6358
6359 /* Equality comparisons of short modes against zero can be performed
6360 using the TST instruction with the appropriate bitmask. */
6361 if (y == const0_rtx && REG_P (x)
6362 && (code == EQ || code == NE)
6363 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
6364 return CC_NZmode;
6365
6366 /* Similarly, comparisons of zero_extends from shorter modes can
6367 be performed using an ANDS with an immediate mask. */
6368 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
6369 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6370 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
6371 && (code == EQ || code == NE))
6372 return CC_NZmode;
6373
6374 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6375 && y == const0_rtx
6376 && (code == EQ || code == NE || code == LT || code == GE)
6377 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
6378 || GET_CODE (x) == NEG
6379 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
6380 && CONST_INT_P (XEXP (x, 2)))))
6381 return CC_NZmode;
6382
6383 /* A compare with a shifted operand. Because of canonicalization,
6384 the comparison will have to be swapped when we emit the assembly
6385 code. */
6386 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6387 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
6388 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
6389 || GET_CODE (x) == LSHIFTRT
6390 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
6391 return CC_SWPmode;
6392
6393 /* Similarly for a negated operand, but we can only do this for
6394 equalities. */
6395 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6396 && (REG_P (y) || GET_CODE (y) == SUBREG)
6397 && (code == EQ || code == NE)
6398 && GET_CODE (x) == NEG)
6399 return CC_Zmode;
6400
6401 /* A test for unsigned overflow. */
6402 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6403 && code == NE
6404 && GET_CODE (x) == PLUS
6405 && GET_CODE (y) == ZERO_EXTEND)
6406 return CC_Cmode;
6407
6408 /* For everything else, return CCmode. */
6409 return CCmode;
6410 }
6411
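/* For example, comparing (ashift:DI (reg:DI <x>) (const_int 2)) against
   a register yields CC_SWPmode, because the shifted operand must become
   the second operand of the emitted CMP, while comparing a HImode
   register against zero for EQ/NE yields CC_NZmode so that a TST with
   mask 0xffff can be used.  */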
6412 static int
6413 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
6414
6415 int
6416 aarch64_get_condition_code (rtx x)
6417 {
6418 machine_mode mode = GET_MODE (XEXP (x, 0));
6419 enum rtx_code comp_code = GET_CODE (x);
6420
6421 if (GET_MODE_CLASS (mode) != MODE_CC)
6422 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
6423 return aarch64_get_condition_code_1 (mode, comp_code);
6424 }
6425
6426 static int
6427 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
6428 {
6429 switch (mode)
6430 {
6431 case E_CCFPmode:
6432 case E_CCFPEmode:
6433 switch (comp_code)
6434 {
6435 case GE: return AARCH64_GE;
6436 case GT: return AARCH64_GT;
6437 case LE: return AARCH64_LS;
6438 case LT: return AARCH64_MI;
6439 case NE: return AARCH64_NE;
6440 case EQ: return AARCH64_EQ;
6441 case ORDERED: return AARCH64_VC;
6442 case UNORDERED: return AARCH64_VS;
6443 case UNLT: return AARCH64_LT;
6444 case UNLE: return AARCH64_LE;
6445 case UNGT: return AARCH64_HI;
6446 case UNGE: return AARCH64_PL;
6447 default: return -1;
6448 }
6449 break;
6450
6451 case E_CCmode:
6452 switch (comp_code)
6453 {
6454 case NE: return AARCH64_NE;
6455 case EQ: return AARCH64_EQ;
6456 case GE: return AARCH64_GE;
6457 case GT: return AARCH64_GT;
6458 case LE: return AARCH64_LE;
6459 case LT: return AARCH64_LT;
6460 case GEU: return AARCH64_CS;
6461 case GTU: return AARCH64_HI;
6462 case LEU: return AARCH64_LS;
6463 case LTU: return AARCH64_CC;
6464 default: return -1;
6465 }
6466 break;
6467
6468 case E_CC_SWPmode:
6469 switch (comp_code)
6470 {
6471 case NE: return AARCH64_NE;
6472 case EQ: return AARCH64_EQ;
6473 case GE: return AARCH64_LE;
6474 case GT: return AARCH64_LT;
6475 case LE: return AARCH64_GE;
6476 case LT: return AARCH64_GT;
6477 case GEU: return AARCH64_LS;
6478 case GTU: return AARCH64_CC;
6479 case LEU: return AARCH64_CS;
6480 case LTU: return AARCH64_HI;
6481 default: return -1;
6482 }
6483 break;
6484
6485 case E_CC_NZmode:
6486 switch (comp_code)
6487 {
6488 case NE: return AARCH64_NE;
6489 case EQ: return AARCH64_EQ;
6490 case GE: return AARCH64_PL;
6491 case LT: return AARCH64_MI;
6492 default: return -1;
6493 }
6494 break;
6495
6496 case E_CC_Zmode:
6497 switch (comp_code)
6498 {
6499 case NE: return AARCH64_NE;
6500 case EQ: return AARCH64_EQ;
6501 default: return -1;
6502 }
6503 break;
6504
6505 case E_CC_Cmode:
6506 switch (comp_code)
6507 {
6508 case NE: return AARCH64_CS;
6509 case EQ: return AARCH64_CC;
6510 default: return -1;
6511 }
6512 break;
6513
6514 default:
6515 return -1;
6516 }
6517
6518 return -1;
6519 }
6520
6521 bool
6522 aarch64_const_vec_all_same_in_range_p (rtx x,
6523 HOST_WIDE_INT minval,
6524 HOST_WIDE_INT maxval)
6525 {
6526 rtx elt;
6527 return (const_vec_duplicate_p (x, &elt)
6528 && CONST_INT_P (elt)
6529 && IN_RANGE (INTVAL (elt), minval, maxval));
6530 }
6531
6532 bool
6533 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
6534 {
6535 return aarch64_const_vec_all_same_in_range_p (x, val, val);
6536 }
6537
6538 /* Return true if VEC is a constant in which every element is in the range
6539 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
6540
6541 static bool
6542 aarch64_const_vec_all_in_range_p (rtx vec,
6543 HOST_WIDE_INT minval,
6544 HOST_WIDE_INT maxval)
6545 {
6546 if (GET_CODE (vec) != CONST_VECTOR
6547 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
6548 return false;
6549
6550 int nunits;
6551 if (!CONST_VECTOR_STEPPED_P (vec))
6552 nunits = const_vector_encoded_nelts (vec);
6553 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
6554 return false;
6555
6556 for (int i = 0; i < nunits; i++)
6557 {
6558 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
6559 if (!CONST_INT_P (vec_elem)
6560 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
6561 return false;
6562 }
6563 return true;
6564 }
6565
6566 /* N Z C V. */
6567 #define AARCH64_CC_V 1
6568 #define AARCH64_CC_C (1 << 1)
6569 #define AARCH64_CC_Z (1 << 2)
6570 #define AARCH64_CC_N (1 << 3)
6571
6572 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
6573 static const int aarch64_nzcv_codes[] =
6574 {
6575 0, /* EQ, Z == 1. */
6576 AARCH64_CC_Z, /* NE, Z == 0. */
6577 0, /* CS, C == 1. */
6578 AARCH64_CC_C, /* CC, C == 0. */
6579 0, /* MI, N == 1. */
6580 AARCH64_CC_N, /* PL, N == 0. */
6581 0, /* VS, V == 1. */
6582 AARCH64_CC_V, /* VC, V == 0. */
6583 0, /* HI, C == 1 && Z == 0. */
6584 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
6585 AARCH64_CC_V, /* GE, N == V. */
6586 0, /* LT, N != V. */
6587 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
6588 0, /* LE, !(Z == 0 && N == V). */
6589 0, /* AL, Any. */
6590 0 /* NV, Any. */
6591 };
6592
6593 /* Print floating-point vector immediate operand X to F, negating it
6594 first if NEGATE is true. Return true on success, false if it isn't
6595 a constant we can handle. */
6596
6597 static bool
6598 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
6599 {
6600 rtx elt;
6601
6602 if (!const_vec_duplicate_p (x, &elt))
6603 return false;
6604
6605 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
6606 if (negate)
6607 r = real_value_negate (&r);
6608
6609 /* We only handle the SVE single-bit immediates here. */
6610 if (real_equal (&r, &dconst0))
6611 asm_fprintf (f, "0.0");
6612 else if (real_equal (&r, &dconst1))
6613 asm_fprintf (f, "1.0");
6614 else if (real_equal (&r, &dconsthalf))
6615 asm_fprintf (f, "0.5");
6616 else
6617 return false;
6618
6619 return true;
6620 }
6621
6622 /* Return the equivalent letter for size. */
6623 static char
6624 sizetochar (int size)
6625 {
6626 switch (size)
6627 {
6628 case 64: return 'd';
6629 case 32: return 's';
6630 case 16: return 'h';
6631 case 8 : return 'b';
6632 default: gcc_unreachable ();
6633 }
6634 }
6635
6636 /* Print operand X to file F in a target specific manner according to CODE.
6637 The acceptable formatting commands given by CODE are:
6638 'c': An integer or symbol address without a preceding #
6639 sign.
6640 'C': Take the duplicated element in a vector constant
6641 and print it in hex.
6642 'D': Take the duplicated element in a vector constant
6643 and print it as an unsigned integer, in decimal.
6644 'e': Print the sign/zero-extend size as a character 8->b,
6645 16->h, 32->w.
6646 'p': Prints N such that 2^N == X (X must be power of 2 and
6647 const int).
6648 'P': Print the number of non-zero bits in X (a const_int).
6649 'H': Print the higher numbered register of a pair (TImode)
6650 of regs.
6651 'm': Print a condition (eq, ne, etc).
6652 'M': Same as 'm', but invert condition.
6653 'N': Take the duplicated element in a vector constant
6654 and print the negative of it in decimal.
6655 'b/h/s/d/q': Print a scalar FP/SIMD register name.
6656 'S/T/U/V': Print a FP/SIMD register name for a register list.
6657 The register printed is the FP/SIMD register name
6658 of X + 0/1/2/3 for S/T/U/V.
6659 'R': Print a scalar FP/SIMD register name + 1.
6660 'X': Print bottom 16 bits of integer constant in hex.
6661 'w/x': Print a general register name or the zero register
6662 (32-bit or 64-bit).
6663 '0': Print a normal operand, if it's a general register,
6664 then we assume DImode.
6665 'k': Print NZCV for conditional compare instructions.
6666 'A': Output address constant representing the first
6667 argument of X, specifying a relocation offset
6668 if appropriate.
6669 'L': Output constant address specified by X
6670 with a relocation offset if appropriate.
6671 'G': Prints address of X, specifying a PC relative
6672 relocation mode if appropriate.
6673 'y': Output address of LDP or STP - this is used for
6674 some LDP/STPs which don't use a PARALLEL in their
6675 pattern (so the mode needs to be adjusted).
6676 'z': Output address of a typical LDP or STP. */
6677
6678 static void
6679 aarch64_print_operand (FILE *f, rtx x, int code)
6680 {
6681 rtx elt;
6682 switch (code)
6683 {
6684 case 'c':
6685 switch (GET_CODE (x))
6686 {
6687 case CONST_INT:
6688 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
6689 break;
6690
6691 case SYMBOL_REF:
6692 output_addr_const (f, x);
6693 break;
6694
6695 case CONST:
6696 if (GET_CODE (XEXP (x, 0)) == PLUS
6697 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
6698 {
6699 output_addr_const (f, x);
6700 break;
6701 }
6702 /* Fall through. */
6703
6704 default:
6705 output_operand_lossage ("unsupported operand for code '%c'", code);
6706 }
6707 break;
6708
6709 case 'e':
6710 {
6711 int n;
6712
6713 if (!CONST_INT_P (x)
6714 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
6715 {
6716 output_operand_lossage ("invalid operand for '%%%c'", code);
6717 return;
6718 }
6719
6720 switch (n)
6721 {
6722 case 3:
6723 fputc ('b', f);
6724 break;
6725 case 4:
6726 fputc ('h', f);
6727 break;
6728 case 5:
6729 fputc ('w', f);
6730 break;
6731 default:
6732 output_operand_lossage ("invalid operand for '%%%c'", code);
6733 return;
6734 }
6735 }
6736 break;
6737
6738 case 'p':
6739 {
6740 int n;
6741
6742 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
6743 {
6744 output_operand_lossage ("invalid operand for '%%%c'", code);
6745 return;
6746 }
6747
6748 asm_fprintf (f, "%d", n);
6749 }
6750 break;
6751
6752 case 'P':
6753 if (!CONST_INT_P (x))
6754 {
6755 output_operand_lossage ("invalid operand for '%%%c'", code);
6756 return;
6757 }
6758
6759 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
6760 break;
6761
6762 case 'H':
6763 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
6764 {
6765 output_operand_lossage ("invalid operand for '%%%c'", code);
6766 return;
6767 }
6768
6769 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
6770 break;
6771
6772 case 'M':
6773 case 'm':
6774 {
6775 int cond_code;
6776 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
6777 if (x == const_true_rtx)
6778 {
6779 if (code == 'M')
6780 fputs ("nv", f);
6781 return;
6782 }
6783
6784 if (!COMPARISON_P (x))
6785 {
6786 output_operand_lossage ("invalid operand for '%%%c'", code);
6787 return;
6788 }
6789
6790 cond_code = aarch64_get_condition_code (x);
6791 gcc_assert (cond_code >= 0);
6792 if (code == 'M')
6793 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
6794 fputs (aarch64_condition_codes[cond_code], f);
6795 }
6796 break;
6797
6798 case 'N':
6799 if (!const_vec_duplicate_p (x, &elt))
6800 {
6801 output_operand_lossage ("invalid vector constant");
6802 return;
6803 }
6804
6805 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6806 asm_fprintf (f, "%wd", -INTVAL (elt));
6807 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6808 && aarch64_print_vector_float_operand (f, x, true))
6809 ;
6810 else
6811 {
6812 output_operand_lossage ("invalid vector constant");
6813 return;
6814 }
6815 break;
6816
6817 case 'b':
6818 case 'h':
6819 case 's':
6820 case 'd':
6821 case 'q':
6822 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6823 {
6824 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6825 return;
6826 }
6827 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
6828 break;
6829
6830 case 'S':
6831 case 'T':
6832 case 'U':
6833 case 'V':
6834 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6835 {
6836 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6837 return;
6838 }
6839 asm_fprintf (f, "%c%d",
6840 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
6841 REGNO (x) - V0_REGNUM + (code - 'S'));
6842 break;
6843
6844 case 'R':
6845 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6846 {
6847 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6848 return;
6849 }
6850 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
6851 break;
6852
6853 case 'X':
6854 if (!CONST_INT_P (x))
6855 {
6856 output_operand_lossage ("invalid operand for '%%%c'", code);
6857 return;
6858 }
6859 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
6860 break;
6861
6862 case 'C':
6863 {
6864 /* Print a replicated constant in hex. */
6865 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6866 {
6867 output_operand_lossage ("invalid operand for '%%%c'", code);
6868 return;
6869 }
6870 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6871 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6872 }
6873 break;
6874
6875 case 'D':
6876 {
6877 /* Print a replicated constant in decimal, treating it as
6878 unsigned. */
6879 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6880 {
6881 output_operand_lossage ("invalid operand for '%%%c'", code);
6882 return;
6883 }
6884 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6885 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6886 }
6887 break;
6888
6889 case 'w':
6890 case 'x':
6891 if (x == const0_rtx
6892 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
6893 {
6894 asm_fprintf (f, "%czr", code);
6895 break;
6896 }
6897
6898 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
6899 {
6900 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
6901 break;
6902 }
6903
6904 if (REG_P (x) && REGNO (x) == SP_REGNUM)
6905 {
6906 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
6907 break;
6908 }
6909
6910 /* Fall through */
6911
6912 case 0:
6913 if (x == NULL)
6914 {
6915 output_operand_lossage ("missing operand");
6916 return;
6917 }
6918
6919 switch (GET_CODE (x))
6920 {
6921 case REG:
6922 if (aarch64_sve_data_mode_p (GET_MODE (x)))
6923 {
6924 if (REG_NREGS (x) == 1)
6925 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
6926 else
6927 {
6928 char suffix
6929 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
6930 asm_fprintf (f, "{z%d.%c - z%d.%c}",
6931 REGNO (x) - V0_REGNUM, suffix,
6932 END_REGNO (x) - V0_REGNUM - 1, suffix);
6933 }
6934 }
6935 else
6936 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
6937 break;
6938
6939 case MEM:
6940 output_address (GET_MODE (x), XEXP (x, 0));
6941 break;
6942
6943 case LABEL_REF:
6944 case SYMBOL_REF:
6945 output_addr_const (asm_out_file, x);
6946 break;
6947
6948 case CONST_INT:
6949 asm_fprintf (f, "%wd", INTVAL (x));
6950 break;
6951
6952 case CONST:
6953 if (!VECTOR_MODE_P (GET_MODE (x)))
6954 {
6955 output_addr_const (asm_out_file, x);
6956 break;
6957 }
6958 /* fall through */
6959
6960 case CONST_VECTOR:
6961 if (!const_vec_duplicate_p (x, &elt))
6962 {
6963 output_operand_lossage ("invalid vector constant");
6964 return;
6965 }
6966
6967 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6968 asm_fprintf (f, "%wd", INTVAL (elt));
6969 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6970 && aarch64_print_vector_float_operand (f, x, false))
6971 ;
6972 else
6973 {
6974 output_operand_lossage ("invalid vector constant");
6975 return;
6976 }
6977 break;
6978
6979 case CONST_DOUBLE:
6980 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6981 be getting CONST_DOUBLEs holding integers. */
6982 gcc_assert (GET_MODE (x) != VOIDmode);
6983 if (aarch64_float_const_zero_rtx_p (x))
6984 {
6985 fputc ('0', f);
6986 break;
6987 }
6988 else if (aarch64_float_const_representable_p (x))
6989 {
6990 #define buf_size 20
6991 char float_buf[buf_size] = {'\0'};
6992 real_to_decimal_for_mode (float_buf,
6993 CONST_DOUBLE_REAL_VALUE (x),
6994 buf_size, buf_size,
6995 1, GET_MODE (x));
6996 asm_fprintf (asm_out_file, "%s", float_buf);
6997 break;
6998 #undef buf_size
6999 }
7000 output_operand_lossage ("invalid constant");
7001 return;
7002 default:
7003 output_operand_lossage ("invalid operand");
7004 return;
7005 }
7006 break;
7007
7008 case 'A':
7009 if (GET_CODE (x) == HIGH)
7010 x = XEXP (x, 0);
7011
7012 switch (aarch64_classify_symbolic_expression (x))
7013 {
7014 case SYMBOL_SMALL_GOT_4G:
7015 asm_fprintf (asm_out_file, ":got:");
7016 break;
7017
7018 case SYMBOL_SMALL_TLSGD:
7019 asm_fprintf (asm_out_file, ":tlsgd:");
7020 break;
7021
7022 case SYMBOL_SMALL_TLSDESC:
7023 asm_fprintf (asm_out_file, ":tlsdesc:");
7024 break;
7025
7026 case SYMBOL_SMALL_TLSIE:
7027 asm_fprintf (asm_out_file, ":gottprel:");
7028 break;
7029
7030 case SYMBOL_TLSLE24:
7031 asm_fprintf (asm_out_file, ":tprel:");
7032 break;
7033
7034 case SYMBOL_TINY_GOT:
7035 gcc_unreachable ();
7036 break;
7037
7038 default:
7039 break;
7040 }
7041 output_addr_const (asm_out_file, x);
7042 break;
7043
7044 case 'L':
7045 switch (aarch64_classify_symbolic_expression (x))
7046 {
7047 case SYMBOL_SMALL_GOT_4G:
7048 asm_fprintf (asm_out_file, ":lo12:");
7049 break;
7050
7051 case SYMBOL_SMALL_TLSGD:
7052 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
7053 break;
7054
7055 case SYMBOL_SMALL_TLSDESC:
7056 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
7057 break;
7058
7059 case SYMBOL_SMALL_TLSIE:
7060 asm_fprintf (asm_out_file, ":gottprel_lo12:");
7061 break;
7062
7063 case SYMBOL_TLSLE12:
7064 asm_fprintf (asm_out_file, ":tprel_lo12:");
7065 break;
7066
7067 case SYMBOL_TLSLE24:
7068 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
7069 break;
7070
7071 case SYMBOL_TINY_GOT:
7072 asm_fprintf (asm_out_file, ":got:");
7073 break;
7074
7075 case SYMBOL_TINY_TLSIE:
7076 asm_fprintf (asm_out_file, ":gottprel:");
7077 break;
7078
7079 default:
7080 break;
7081 }
7082 output_addr_const (asm_out_file, x);
7083 break;
7084
7085 case 'G':
7086 switch (aarch64_classify_symbolic_expression (x))
7087 {
7088 case SYMBOL_TLSLE24:
7089 asm_fprintf (asm_out_file, ":tprel_hi12:");
7090 break;
7091 default:
7092 break;
7093 }
7094 output_addr_const (asm_out_file, x);
7095 break;
7096
7097 case 'k':
7098 {
7099 HOST_WIDE_INT cond_code;
7100
7101 if (!CONST_INT_P (x))
7102 {
7103 output_operand_lossage ("invalid operand for '%%%c'", code);
7104 return;
7105 }
7106
7107 cond_code = INTVAL (x);
7108 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7109 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7110 }
7111 break;
7112
7113 case 'y':
7114 case 'z':
7115 {
7116 machine_mode mode = GET_MODE (x);
7117
7118 if (GET_CODE (x) != MEM
7119 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7120 {
7121 output_operand_lossage ("invalid operand for '%%%c'", code);
7122 return;
7123 }
7124
7125 if (code == 'y')
7126 /* LDP/STP which uses a single double-width memory operand.
7127 Adjust the mode to appear like a typical LDP/STP.
7128 Currently this is supported for 16-byte accesses only. */
7129 mode = DFmode;
7130
7131 if (!aarch64_print_ldpstp_address (f, mode, XEXP (x, 0)))
7132 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7133 }
7134 break;
7135
7136 default:
7137 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7138 return;
7139 }
7140 }
7141
7142 /* Print address 'x' of a memory access with mode 'mode'.
7143 'op' is the context required by aarch64_classify_address. It can either be
7144 MEM for a normal memory access or PARALLEL for LDP/STP. */
7145 static bool
7146 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7147 aarch64_addr_query_type type)
7148 {
7149 struct aarch64_address_info addr;
7150 unsigned int size;
7151
7152 /* Check all addresses are Pmode - including ILP32. */
7153 if (GET_MODE (x) != Pmode)
7154 output_operand_lossage ("invalid address mode");
7155
7156 if (aarch64_classify_address (&addr, x, mode, true, type))
7157 switch (addr.type)
7158 {
7159 case ADDRESS_REG_IMM:
7160 if (known_eq (addr.const_offset, 0))
7161 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
7162 else if (aarch64_sve_data_mode_p (mode))
7163 {
7164 HOST_WIDE_INT vnum
7165 = exact_div (addr.const_offset,
7166 BYTES_PER_SVE_VECTOR).to_constant ();
7167 asm_fprintf (f, "[%s, #%wd, mul vl]",
7168 reg_names[REGNO (addr.base)], vnum);
7169 }
7170 else if (aarch64_sve_pred_mode_p (mode))
7171 {
7172 HOST_WIDE_INT vnum
7173 = exact_div (addr.const_offset,
7174 BYTES_PER_SVE_PRED).to_constant ();
7175 asm_fprintf (f, "[%s, #%wd, mul vl]",
7176 reg_names[REGNO (addr.base)], vnum);
7177 }
7178 else
7179 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
7180 INTVAL (addr.offset));
7181 return true;
7182
7183 case ADDRESS_REG_REG:
7184 if (addr.shift == 0)
7185 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
7186 reg_names [REGNO (addr.offset)]);
7187 else
7188 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
7189 reg_names [REGNO (addr.offset)], addr.shift);
7190 return true;
7191
7192 case ADDRESS_REG_UXTW:
7193 if (addr.shift == 0)
7194 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
7195 REGNO (addr.offset) - R0_REGNUM);
7196 else
7197 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
7198 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7199 return true;
7200
7201 case ADDRESS_REG_SXTW:
7202 if (addr.shift == 0)
7203 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
7204 REGNO (addr.offset) - R0_REGNUM);
7205 else
7206 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
7207 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7208 return true;
7209
7210 case ADDRESS_REG_WB:
7211 /* Writeback is only supported for fixed-width modes. */
7212 size = GET_MODE_SIZE (mode).to_constant ();
7213 switch (GET_CODE (x))
7214 {
7215 case PRE_INC:
7216 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
7217 return true;
7218 case POST_INC:
7219 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
7220 return true;
7221 case PRE_DEC:
7222 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7223 return true;
7224 case POST_DEC:
7225 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7226 return true;
7227 case PRE_MODIFY:
7228 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7229 INTVAL (addr.offset));
7230 return true;
7231 case POST_MODIFY:
7232 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
7233 INTVAL (addr.offset));
7234 return true;
7235 default:
7236 break;
7237 }
7238 break;
7239
7240 case ADDRESS_LO_SUM:
7241 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
7242 output_addr_const (f, addr.offset);
7243 asm_fprintf (f, "]");
7244 return true;
7245
7246 case ADDRESS_SYMBOLIC:
7247 output_addr_const (f, x);
7248 return true;
7249 }
7250
7251 return false;
7252 }
7253
7254 /* Print address 'x' of a LDP/STP with mode 'mode'. */
7255 static bool
7256 aarch64_print_ldpstp_address (FILE *f, machine_mode mode, rtx x)
7257 {
7258 return aarch64_print_address_internal (f, mode, x, ADDR_QUERY_LDP_STP);
7259 }
7260
7261 /* Print address 'x' of a memory access with mode 'mode'. */
7262 static void
7263 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
7264 {
7265 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
7266 output_addr_const (f, x);
7267 }
7268
7269 bool
7270 aarch64_label_mentioned_p (rtx x)
7271 {
7272 const char *fmt;
7273 int i;
7274
7275 if (GET_CODE (x) == LABEL_REF)
7276 return true;
7277
7278 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7279 referencing instruction, but they are constant offsets, not
7280 symbols. */
7281 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7282 return false;
7283
7284 fmt = GET_RTX_FORMAT (GET_CODE (x));
7285 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
7286 {
7287 if (fmt[i] == 'E')
7288 {
7289 int j;
7290
7291 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
7292 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
7293 return 1;
7294 }
7295 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
7296 return 1;
7297 }
7298
7299 return 0;
7300 }
7301
7302 /* Implement REGNO_REG_CLASS. */
7303
7304 enum reg_class
7305 aarch64_regno_regclass (unsigned regno)
7306 {
7307 if (GP_REGNUM_P (regno))
7308 return GENERAL_REGS;
7309
7310 if (regno == SP_REGNUM)
7311 return STACK_REG;
7312
7313 if (regno == FRAME_POINTER_REGNUM
7314 || regno == ARG_POINTER_REGNUM)
7315 return POINTER_REGS;
7316
7317 if (FP_REGNUM_P (regno))
7318 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
7319
7320 if (PR_REGNUM_P (regno))
7321 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
7322
7323 return NO_REGS;
7324 }
7325
7326 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
7327 If OFFSET is out of range, return an offset of an anchor point
7328 that is in range. Return 0 otherwise. */
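/* For example (illustrative values): for an SImode access at offset
   0x1003 the offset is not a multiple of the access size, so the
   anchor returned is (0x1003 + 0x100) & ~0x1ff == 0x1000, leaving a
   small residual offset of 3 for the access itself.  */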
7329
7330 static HOST_WIDE_INT
7331 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
7332 machine_mode mode)
7333 {
7334 /* Does it look like we'll need a 16-byte load/store-pair operation? */
7335 if (size > 16)
7336 return (offset + 0x400) & ~0x7f0;
7337
7338 /* For offsets that aren't a multiple of the access size, the limit is
7339 -256...255. */
7340 if (offset & (size - 1))
7341 {
7342 /* BLKmode typically uses LDP of X-registers. */
7343 if (mode == BLKmode)
7344 return (offset + 512) & ~0x3ff;
7345 return (offset + 0x100) & ~0x1ff;
7346 }
7347
7348 /* Small negative offsets are supported. */
7349 if (IN_RANGE (offset, -256, 0))
7350 return 0;
7351
7352 if (mode == TImode || mode == TFmode)
7353 return (offset + 0x100) & ~0x1ff;
7354
7355 /* Use an unsigned 12-bit offset scaled by the access size. */
7356 return offset & (~0xfff * size);
7357 }
7358
7359 static rtx
7360 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
7361 {
7362 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7363 where mask is selected by alignment and size of the offset.
7364 We try to pick as large a range for the offset as possible to
7365 maximize the chance of a CSE. However, for aligned addresses
7366 we limit the range to 4k so that structures with different sized
7367 elements are likely to use the same base. We need to be careful
7368 not to split a CONST for some forms of address expression, otherwise
7369 it will generate sub-optimal code. */
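/* For example (illustrative constants): for an SImode access to
   (plus (reg X) (const_int 0x12004)) the anchor offset is
   0x12004 & (~0xfff * 4) == 0x10000, so we emit Y = X + 0x10000 and
   rewrite the address as Y + 0x2004, which fits the scaled unsigned
   12-bit immediate range for 4-byte accesses.  */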
7370
7371 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
7372 {
7373 rtx base = XEXP (x, 0);
7374 rtx offset_rtx = XEXP (x, 1);
7375 HOST_WIDE_INT offset = INTVAL (offset_rtx);
7376
7377 if (GET_CODE (base) == PLUS)
7378 {
7379 rtx op0 = XEXP (base, 0);
7380 rtx op1 = XEXP (base, 1);
7381
7382 /* Force any scaling into a temp for CSE. */
7383 op0 = force_reg (Pmode, op0);
7384 op1 = force_reg (Pmode, op1);
7385
7386 /* Let the pointer register be in op0. */
7387 if (REG_POINTER (op1))
7388 std::swap (op0, op1);
7389
7390 /* If the pointer is virtual or frame related, then we know that
7391 virtual register instantiation or register elimination is going
7392 to apply a second constant. We want the two constants folded
7393 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
7394 if (virt_or_elim_regno_p (REGNO (op0)))
7395 {
7396 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
7397 NULL_RTX, true, OPTAB_DIRECT);
7398 return gen_rtx_PLUS (Pmode, base, op1);
7399 }
7400
7401 /* Otherwise, in order to encourage CSE (and thence loop strength
7402 reduction) of scaled addresses, emit as (OP0 + OP1) + CONST. */
7403 base = expand_binop (Pmode, add_optab, op0, op1,
7404 NULL_RTX, true, OPTAB_DIRECT);
7405 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
7406 }
7407
7408 HOST_WIDE_INT size;
7409 if (GET_MODE_SIZE (mode).is_constant (&size))
7410 {
7411 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
7412 mode);
7413 if (base_offset != 0)
7414 {
7415 base = plus_constant (Pmode, base, base_offset);
7416 base = force_operand (base, NULL_RTX);
7417 return plus_constant (Pmode, base, offset - base_offset);
7418 }
7419 }
7420 }
7421
7422 return x;
7423 }
7424
7425 /* Return the reload icode required for a constant pool in mode. */
7426 static enum insn_code
7427 aarch64_constant_pool_reload_icode (machine_mode mode)
7428 {
7429 switch (mode)
7430 {
7431 case E_SFmode:
7432 return CODE_FOR_aarch64_reload_movcpsfdi;
7433
7434 case E_DFmode:
7435 return CODE_FOR_aarch64_reload_movcpdfdi;
7436
7437 case E_TFmode:
7438 return CODE_FOR_aarch64_reload_movcptfdi;
7439
7440 case E_V8QImode:
7441 return CODE_FOR_aarch64_reload_movcpv8qidi;
7442
7443 case E_V16QImode:
7444 return CODE_FOR_aarch64_reload_movcpv16qidi;
7445
7446 case E_V4HImode:
7447 return CODE_FOR_aarch64_reload_movcpv4hidi;
7448
7449 case E_V8HImode:
7450 return CODE_FOR_aarch64_reload_movcpv8hidi;
7451
7452 case E_V2SImode:
7453 return CODE_FOR_aarch64_reload_movcpv2sidi;
7454
7455 case E_V4SImode:
7456 return CODE_FOR_aarch64_reload_movcpv4sidi;
7457
7458 case E_V2DImode:
7459 return CODE_FOR_aarch64_reload_movcpv2didi;
7460
7461 case E_V2DFmode:
7462 return CODE_FOR_aarch64_reload_movcpv2dfdi;
7463
7464 default:
7465 gcc_unreachable ();
7466 }
7467
7468 gcc_unreachable ();
7469 }
7470 static reg_class_t
7471 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
7472 reg_class_t rclass,
7473 machine_mode mode,
7474 secondary_reload_info *sri)
7475 {
7476 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7477 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
7478 comment at the head of aarch64-sve.md for more details about the
7479 big-endian handling. */
7480 if (BYTES_BIG_ENDIAN
7481 && reg_class_subset_p (rclass, FP_REGS)
7482 && !((REG_P (x) && HARD_REGISTER_P (x))
7483 || aarch64_simd_valid_immediate (x, NULL))
7484 && aarch64_sve_data_mode_p (mode))
7485 {
7486 sri->icode = CODE_FOR_aarch64_sve_reload_be;
7487 return NO_REGS;
7488 }
7489
7490 /* If we have to disable direct literal pool loads and stores because the
7491 function is too big, then we need a scratch register. */
7492 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
7493 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
7494 || targetm.vector_mode_supported_p (GET_MODE (x)))
7495 && !aarch64_pcrelative_literal_loads)
7496 {
7497 sri->icode = aarch64_constant_pool_reload_icode (mode);
7498 return NO_REGS;
7499 }
7500
7501 /* Without the TARGET_SIMD instructions we cannot move a Q register
7502 to a Q register directly. We need a scratch. */
7503 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
7504 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
7505 && reg_class_subset_p (rclass, FP_REGS))
7506 {
7507 if (mode == TFmode)
7508 sri->icode = CODE_FOR_aarch64_reload_movtf;
7509 else if (mode == TImode)
7510 sri->icode = CODE_FOR_aarch64_reload_movti;
7511 return NO_REGS;
7512 }
7513
7514 /* A TFmode or TImode memory access should be handled via an FP register,
7515 because AArch64 has richer addressing modes for LDR/STR instructions
7516 than for LDP/STP instructions. */
7517 if (TARGET_FLOAT && rclass == GENERAL_REGS
7518 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
7519 return FP_REGS;
7520
7521 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
7522 return GENERAL_REGS;
7523
7524 return NO_REGS;
7525 }
7526
7527 static bool
7528 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
7529 {
7530 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
7531
7532 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7533 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
7534 if (frame_pointer_needed)
7535 return to == HARD_FRAME_POINTER_REGNUM;
7536 return true;
7537 }
7538
7539 poly_int64
7540 aarch64_initial_elimination_offset (unsigned from, unsigned to)
7541 {
7542 aarch64_layout_frame ();
7543
7544 if (to == HARD_FRAME_POINTER_REGNUM)
7545 {
7546 if (from == ARG_POINTER_REGNUM)
7547 return cfun->machine->frame.hard_fp_offset;
7548
7549 if (from == FRAME_POINTER_REGNUM)
7550 return cfun->machine->frame.hard_fp_offset
7551 - cfun->machine->frame.locals_offset;
7552 }
7553
7554 if (to == STACK_POINTER_REGNUM)
7555 {
7556 if (from == FRAME_POINTER_REGNUM)
7557 return cfun->machine->frame.frame_size
7558 - cfun->machine->frame.locals_offset;
7559 }
7560
7561 return cfun->machine->frame.frame_size;
7562 }
7563
7564 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
7565 previous frame. */
7566
7567 rtx
7568 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
7569 {
7570 if (count != 0)
7571 return const0_rtx;
7572 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
7573 }
7574
7575
7576 static void
7577 aarch64_asm_trampoline_template (FILE *f)
7578 {
7579 if (TARGET_ILP32)
7580 {
7581 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
7582 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
7583 }
7584 else
7585 {
7586 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
7587 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
7588 }
7589 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
7590 assemble_aligned_integer (4, const0_rtx);
7591 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7592 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7593 }
7594
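/* Initialise a trampoline.  The template emitted by
   aarch64_asm_trampoline_template above is 16 bytes of code followed
   by two pointer-sized data slots; store the target function's
   address in the first slot and the static chain value in the
   second.  */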
7595 static void
7596 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
7597 {
7598 rtx fnaddr, mem, a_tramp;
7599 const int tramp_code_sz = 16;
7600
7601 /* We don't need to copy the trailing D-words; we fill those in below. */
7602 emit_block_move (m_tramp, assemble_trampoline_template (),
7603 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
7604 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
7605 fnaddr = XEXP (DECL_RTL (fndecl), 0);
7606 if (GET_MODE (fnaddr) != ptr_mode)
7607 fnaddr = convert_memory_address (ptr_mode, fnaddr);
7608 emit_move_insn (mem, fnaddr);
7609
7610 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
7611 emit_move_insn (mem, chain_value);
7612
7613 /* XXX We should really define a "clear_cache" pattern and use
7614 gen_clear_cache(). */
7615 a_tramp = XEXP (m_tramp, 0);
7616 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
7617 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
7618 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
7619 ptr_mode);
7620 }
7621
7622 static unsigned char
7623 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
7624 {
7625 /* ??? Logically we should only need to provide a value when
7626 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7627 can hold MODE, but at the moment we need to handle all modes.
7628 Just ignore any runtime parts for registers that can't store them. */
7629 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
7630 unsigned int nregs;
7631 switch (regclass)
7632 {
7633 case TAILCALL_ADDR_REGS:
7634 case POINTER_REGS:
7635 case GENERAL_REGS:
7636 case ALL_REGS:
7637 case POINTER_AND_FP_REGS:
7638 case FP_REGS:
7639 case FP_LO_REGS:
7640 if (aarch64_sve_data_mode_p (mode)
7641 && constant_multiple_p (GET_MODE_SIZE (mode),
7642 BYTES_PER_SVE_VECTOR, &nregs))
7643 return nregs;
7644 return (aarch64_vector_data_mode_p (mode)
7645 ? CEIL (lowest_size, UNITS_PER_VREG)
7646 : CEIL (lowest_size, UNITS_PER_WORD));
7647 case STACK_REG:
7648 case PR_REGS:
7649 case PR_LO_REGS:
7650 case PR_HI_REGS:
7651 return 1;
7652
7653 case NO_REGS:
7654 return 0;
7655
7656 default:
7657 break;
7658 }
7659 gcc_unreachable ();
7660 }
7661
7662 static reg_class_t
7663 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
7664 {
7665 if (regclass == POINTER_REGS)
7666 return GENERAL_REGS;
7667
7668 if (regclass == STACK_REG)
7669 {
7670 if (REG_P(x)
7671 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
7672 return regclass;
7673
7674 return NO_REGS;
7675 }
7676
7677 /* Register elimination can result in a request for
7678 SP+constant->FP_REGS. We cannot support such operations, which
7679 use SP as the source and an FP_REG as the destination, so reject
7680 them right away. */
7681 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
7682 {
7683 rtx lhs = XEXP (x, 0);
7684
7685 /* Look through a possible SUBREG introduced by ILP32. */
7686 if (GET_CODE (lhs) == SUBREG)
7687 lhs = SUBREG_REG (lhs);
7688
7689 gcc_assert (REG_P (lhs));
7690 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
7691 POINTER_REGS));
7692 return NO_REGS;
7693 }
7694
7695 return regclass;
7696 }
7697
7698 void
7699 aarch64_asm_output_labelref (FILE* f, const char *name)
7700 {
7701 asm_fprintf (f, "%U%s", name);
7702 }
7703
7704 static void
7705 aarch64_elf_asm_constructor (rtx symbol, int priority)
7706 {
7707 if (priority == DEFAULT_INIT_PRIORITY)
7708 default_ctor_section_asm_out_constructor (symbol, priority);
7709 else
7710 {
7711 section *s;
7712 /* Although priority is known to be in the range [0, 65535], so 18 bytes
7713 would be enough, the compiler might not know that. To avoid a
7714 -Wformat-truncation false positive, use a larger size. */
7715 char buf[23];
7716 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
7717 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7718 switch_to_section (s);
7719 assemble_align (POINTER_SIZE);
7720 assemble_aligned_integer (POINTER_BYTES, symbol);
7721 }
7722 }
7723
7724 static void
7725 aarch64_elf_asm_destructor (rtx symbol, int priority)
7726 {
7727 if (priority == DEFAULT_INIT_PRIORITY)
7728 default_dtor_section_asm_out_destructor (symbol, priority);
7729 else
7730 {
7731 section *s;
7732 /* Although priority is known to be in the range [0, 65535], so 18 bytes
7733 would be enough, the compiler might not know that. To avoid a
7734 -Wformat-truncation false positive, use a larger size. */
7735 char buf[23];
7736 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
7737 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7738 switch_to_section (s);
7739 assemble_align (POINTER_SIZE);
7740 assemble_aligned_integer (POINTER_BYTES, symbol);
7741 }
7742 }
7743
7744 const char*
7745 aarch64_output_casesi (rtx *operands)
7746 {
7747 char buf[100];
7748 char label[100];
7749 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
7750 int index;
7751 static const char *const patterns[4][2] =
7752 {
7753 {
7754 "ldrb\t%w3, [%0,%w1,uxtw]",
7755 "add\t%3, %4, %w3, sxtb #2"
7756 },
7757 {
7758 "ldrh\t%w3, [%0,%w1,uxtw #1]",
7759 "add\t%3, %4, %w3, sxth #2"
7760 },
7761 {
7762 "ldr\t%w3, [%0,%w1,uxtw #2]",
7763 "add\t%3, %4, %w3, sxtw #2"
7764 },
7765 /* We assume that DImode is only generated when not optimizing and
7766 that we don't really need 64-bit address offsets. That would
7767 imply an object file with 8GB of code in a single function! */
7768 {
7769 "ldr\t%w3, [%0,%w1,uxtw #2]",
7770 "add\t%3, %4, %w3, sxtw #2"
7771 }
7772 };
7773
7774 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
7775
7776 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
7777 index = exact_log2 (GET_MODE_SIZE (mode));
7778
7779 gcc_assert (index >= 0 && index <= 3);
7780
7781 /* Need to implement table size reduction, by changing the code below. */
7782 output_asm_insn (patterns[index][0], operands);
7783 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
7784 snprintf (buf, sizeof (buf),
7785 "adr\t%%4, %s", targetm.strip_name_encoding (label));
7786 output_asm_insn (buf, operands);
7787 output_asm_insn (patterns[index][1], operands);
7788 output_asm_insn ("br\t%3", operands);
7789 assemble_label (asm_out_file, label);
7790 return "";
7791 }
7792
7793
7794 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7795 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
7796 operator. */
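/* For example, aarch64_uxt_size (1, 0x1fe) returns 8, since
   0x1fe == 0xff << 1, i.e. a byte mask shifted left by one, which is
   suitable for a UXTB-based extend.  */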
7797
7798 int
7799 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
7800 {
7801 if (shift >= 0 && shift <= 3)
7802 {
7803 int size;
7804 for (size = 8; size <= 32; size *= 2)
7805 {
7806 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
7807 if (mask == bits << shift)
7808 return size;
7809 }
7810 }
7811 return 0;
7812 }
7813
7814 /* Constant pools are per-function only when PC-relative
7815 literal loads are enabled or we are using the large memory
7816 model. */
7817
7818 static inline bool
7819 aarch64_can_use_per_function_literal_pools_p (void)
7820 {
7821 return (aarch64_pcrelative_literal_loads
7822 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
7823 }
7824
7825 static bool
7826 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
7827 {
7828 /* We can't use blocks for constants when we're using a per-function
7829 constant pool. */
7830 return !aarch64_can_use_per_function_literal_pools_p ();
7831 }
7832
7833 /* Select appropriate section for constants depending
7834 on where we place literal pools. */
7835
7836 static section *
7837 aarch64_select_rtx_section (machine_mode mode,
7838 rtx x,
7839 unsigned HOST_WIDE_INT align)
7840 {
7841 if (aarch64_can_use_per_function_literal_pools_p ())
7842 return function_section (current_function_decl);
7843
7844 return default_elf_select_rtx_section (mode, x, align);
7845 }
7846
7847 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
7848 void
7849 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
7850 HOST_WIDE_INT offset)
7851 {
7852 /* When using per-function literal pools, we must ensure that any code
7853 section is aligned to the minimal instruction length, lest we get
7854 errors from the assembler re "unaligned instructions". */
7855 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
7856 ASM_OUTPUT_ALIGN (f, 2);
7857 }
7858
7859 /* Costs. */
7860
7861 /* Helper function for rtx cost calculation. Strip a shift expression
7862 from X. Returns the inner operand if successful, or the original
7863 expression on failure. */
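/* For example, both (ashift (reg X) (const_int 3)) and
   (mult (reg X) (const_int 8)) strip down to (reg X), since a
   multiplication by a power of two is just a left shift.  */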
7864 static rtx
7865 aarch64_strip_shift (rtx x)
7866 {
7867 rtx op = x;
7868
7869 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7870 we can convert both to ROR during final output. */
7871 if ((GET_CODE (op) == ASHIFT
7872 || GET_CODE (op) == ASHIFTRT
7873 || GET_CODE (op) == LSHIFTRT
7874 || GET_CODE (op) == ROTATERT
7875 || GET_CODE (op) == ROTATE)
7876 && CONST_INT_P (XEXP (op, 1)))
7877 return XEXP (op, 0);
7878
7879 if (GET_CODE (op) == MULT
7880 && CONST_INT_P (XEXP (op, 1))
7881 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
7882 return XEXP (op, 0);
7883
7884 return x;
7885 }
7886
7887 /* Helper function for rtx cost calculation. Strip an extend
7888 expression from X. Returns the inner operand if successful, or the
7889 original expression on failure. We deal with a number of possible
7890 canonicalization variations here. If STRIP_SHIFT is true, then
7891 we can strip off a shift also. */
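/* For example, (zero_extend:DI (reg:SI X)) strips to (reg:SI X), and
   with STRIP_SHIFT true (ashift:DI (sign_extend:DI (reg:SI X))
   (const_int 2)) also strips down to (reg:SI X).  */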
7892 static rtx
7893 aarch64_strip_extend (rtx x, bool strip_shift)
7894 {
7895 scalar_int_mode mode;
7896 rtx op = x;
7897
7898 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
7899 return op;
7900
7901 /* Zero and sign extraction of a widened value. */
7902 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
7903 && XEXP (op, 2) == const0_rtx
7904 && GET_CODE (XEXP (op, 0)) == MULT
7905 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
7906 XEXP (op, 1)))
7907 return XEXP (XEXP (op, 0), 0);
7908
7909 /* It can also be represented (for zero-extend) as an AND with an
7910 immediate. */
7911 if (GET_CODE (op) == AND
7912 && GET_CODE (XEXP (op, 0)) == MULT
7913 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
7914 && CONST_INT_P (XEXP (op, 1))
7915 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
7916 INTVAL (XEXP (op, 1))) != 0)
7917 return XEXP (XEXP (op, 0), 0);
7918
7919 /* Now handle extended register, as this may also have an optional
7920 left shift by 1..4. */
7921 if (strip_shift
7922 && GET_CODE (op) == ASHIFT
7923 && CONST_INT_P (XEXP (op, 1))
7924 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
7925 op = XEXP (op, 0);
7926
7927 if (GET_CODE (op) == ZERO_EXTEND
7928 || GET_CODE (op) == SIGN_EXTEND)
7929 op = XEXP (op, 0);
7930
7931 if (op != x)
7932 return op;
7933
7934 return x;
7935 }
7936
7937 /* Return true iff CODE is a shift supported in combination
7938 with arithmetic instructions. */
7939
7940 static bool
7941 aarch64_shift_p (enum rtx_code code)
7942 {
7943 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
7944 }
7945
7946
7947 /* Return true iff X is a cheap shift without a sign extend. */
7948
7949 static bool
7950 aarch64_cheap_mult_shift_p (rtx x)
7951 {
7952 rtx op0, op1;
7953
7954 op0 = XEXP (x, 0);
7955 op1 = XEXP (x, 1);
7956
7957 if (!(aarch64_tune_params.extra_tuning_flags
7958 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
7959 return false;
7960
7961 if (GET_CODE (op0) == SIGN_EXTEND)
7962 return false;
7963
7964 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
7965 && UINTVAL (op1) <= 4)
7966 return true;
7967
7968 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
7969 return false;
7970
7971 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
7972
7973 if (l2 > 0 && l2 <= 4)
7974 return true;
7975
7976 return false;
7977 }
7978
7979 /* Helper function for rtx cost calculation. Calculate the cost of
7980 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7981 Return the calculated cost of the expression, recursing manually in to
7982 operands where needed. */
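/* For example, when costing the inner multiply of
   (plus:DI (mult:DI (reg X) (reg Y)) (reg Z)) with OUTER == PLUS,
   COMPOUND_P is true and the multiply is costed as an MADD: the two
   operand costs plus extra_cost->mult[mode == DImode].add when
   costing for speed.  */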
7983
7984 static int
7985 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
7986 {
7987 rtx op0, op1;
7988 const struct cpu_cost_table *extra_cost
7989 = aarch64_tune_params.insn_extra_cost;
7990 int cost = 0;
7991 bool compound_p = (outer == PLUS || outer == MINUS);
7992 machine_mode mode = GET_MODE (x);
7993
7994 gcc_checking_assert (code == MULT);
7995
7996 op0 = XEXP (x, 0);
7997 op1 = XEXP (x, 1);
7998
7999 if (VECTOR_MODE_P (mode))
8000 mode = GET_MODE_INNER (mode);
8001
8002 /* Integer multiply/fma. */
8003 if (GET_MODE_CLASS (mode) == MODE_INT)
8004 {
8005 /* The multiply will be canonicalized as a shift, cost it as such. */
8006 if (aarch64_shift_p (GET_CODE (x))
8007 || (CONST_INT_P (op1)
8008 && exact_log2 (INTVAL (op1)) > 0))
8009 {
8010 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
8011 || GET_CODE (op0) == SIGN_EXTEND;
8012 if (speed)
8013 {
8014 if (compound_p)
8015 {
8016 /* If the shift is considered cheap,
8017 then don't add any cost. */
8018 if (aarch64_cheap_mult_shift_p (x))
8019 ;
8020 else if (REG_P (op1))
8021 /* ARITH + shift-by-register. */
8022 cost += extra_cost->alu.arith_shift_reg;
8023 else if (is_extend)
8024 /* ARITH + extended register. We don't have a cost field
8025 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
8026 cost += extra_cost->alu.extend_arith;
8027 else
8028 /* ARITH + shift-by-immediate. */
8029 cost += extra_cost->alu.arith_shift;
8030 }
8031 else
8032 /* LSL (immediate). */
8033 cost += extra_cost->alu.shift;
8034
8035 }
8036 /* Strip extends as we will have costed them in the case above. */
8037 if (is_extend)
8038 op0 = aarch64_strip_extend (op0, true);
8039
8040 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
8041
8042 return cost;
8043 }
8044
8045 /* MNEG or [US]MNEGL. Extract the NEG operand, mark the operation as a
8046 compound, and let the cases below handle it. After all, MNEG is a
8047 special-case alias of MSUB. */
8048 if (GET_CODE (op0) == NEG)
8049 {
8050 op0 = XEXP (op0, 0);
8051 compound_p = true;
8052 }
8053
8054 /* Integer multiplies or FMAs have zero/sign extending variants. */
8055 if ((GET_CODE (op0) == ZERO_EXTEND
8056 && GET_CODE (op1) == ZERO_EXTEND)
8057 || (GET_CODE (op0) == SIGN_EXTEND
8058 && GET_CODE (op1) == SIGN_EXTEND))
8059 {
8060 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
8061 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
8062
8063 if (speed)
8064 {
8065 if (compound_p)
8066 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
8067 cost += extra_cost->mult[0].extend_add;
8068 else
8069 /* MUL/SMULL/UMULL. */
8070 cost += extra_cost->mult[0].extend;
8071 }
8072
8073 return cost;
8074 }
8075
8076 /* This is either an integer multiply or a MADD. In both cases
8077 we want to recurse and cost the operands. */
8078 cost += rtx_cost (op0, mode, MULT, 0, speed);
8079 cost += rtx_cost (op1, mode, MULT, 1, speed);
8080
8081 if (speed)
8082 {
8083 if (compound_p)
8084 /* MADD/MSUB. */
8085 cost += extra_cost->mult[mode == DImode].add;
8086 else
8087 /* MUL. */
8088 cost += extra_cost->mult[mode == DImode].simple;
8089 }
8090
8091 return cost;
8092 }
8093 else
8094 {
8095 if (speed)
8096 {
8097 /* Floating-point FMA/FMUL can also support negations of the
8098 operands, unless the rounding mode is upward or downward, in
8099 which case FNMUL is different from FMUL with operand negation. */
8100 bool neg0 = GET_CODE (op0) == NEG;
8101 bool neg1 = GET_CODE (op1) == NEG;
8102 if (compound_p || !flag_rounding_math || (neg0 && neg1))
8103 {
8104 if (neg0)
8105 op0 = XEXP (op0, 0);
8106 if (neg1)
8107 op1 = XEXP (op1, 0);
8108 }
8109
8110 if (compound_p)
8111 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8112 cost += extra_cost->fp[mode == DFmode].fma;
8113 else
8114 /* FMUL/FNMUL. */
8115 cost += extra_cost->fp[mode == DFmode].mult;
8116 }
8117
8118 cost += rtx_cost (op0, mode, MULT, 0, speed);
8119 cost += rtx_cost (op1, mode, MULT, 1, speed);
8120 return cost;
8121 }
8122 }
8123
8124 static int
8125 aarch64_address_cost (rtx x,
8126 machine_mode mode,
8127 addr_space_t as ATTRIBUTE_UNUSED,
8128 bool speed)
8129 {
8130 enum rtx_code c = GET_CODE (x);
8131 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8132 struct aarch64_address_info info;
8133 int cost = 0;
8134 info.shift = 0;
8135
8136 if (!aarch64_classify_address (&info, x, mode, false))
8137 {
8138 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8139 {
8140 /* This is a CONST or SYMBOL ref which will be split
8141 in a different way depending on the code model in use.
8142 Cost it through the generic infrastructure. */
8143 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8144 /* Divide through by the cost of one instruction to
8145 bring it to the same units as the address costs. */
8146 cost_symbol_ref /= COSTS_N_INSNS (1);
8147 /* The cost is then the cost of preparing the address,
8148 followed by an immediate (possibly 0) offset. */
8149 return cost_symbol_ref + addr_cost->imm_offset;
8150 }
8151 else
8152 {
8153 /* This is most likely a jump table from a case
8154 statement. */
8155 return addr_cost->register_offset;
8156 }
8157 }
8158
8159 switch (info.type)
8160 {
8161 case ADDRESS_LO_SUM:
8162 case ADDRESS_SYMBOLIC:
8163 case ADDRESS_REG_IMM:
8164 cost += addr_cost->imm_offset;
8165 break;
8166
8167 case ADDRESS_REG_WB:
8168 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8169 cost += addr_cost->pre_modify;
8170 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8171 cost += addr_cost->post_modify;
8172 else
8173 gcc_unreachable ();
8174
8175 break;
8176
8177 case ADDRESS_REG_REG:
8178 cost += addr_cost->register_offset;
8179 break;
8180
8181 case ADDRESS_REG_SXTW:
8182 cost += addr_cost->register_sextend;
8183 break;
8184
8185 case ADDRESS_REG_UXTW:
8186 cost += addr_cost->register_zextend;
8187 break;
8188
8189 default:
8190 gcc_unreachable ();
8191 }
8192
8193
8194 if (info.shift > 0)
8195 {
8196 /* For the sake of calculating the cost of the shifted register
8197 component, we can treat same sized modes in the same way. */
8198 if (known_eq (GET_MODE_BITSIZE (mode), 16))
8199 cost += addr_cost->addr_scale_costs.hi;
8200 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
8201 cost += addr_cost->addr_scale_costs.si;
8202 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
8203 cost += addr_cost->addr_scale_costs.di;
8204 else
8205 /* We can't tell, or this is a 128-bit vector. */
8206 cost += addr_cost->addr_scale_costs.ti;
8207 }
8208
8209 return cost;
8210 }
8211
8212 /* Return the cost of a branch. If SPEED_P is true then the compiler is
8213 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
8214 to be taken. */
8215
8216 int
8217 aarch64_branch_cost (bool speed_p, bool predictable_p)
8218 {
8219 /* When optimizing for speed, use the cost of unpredictable branches. */
8220 const struct cpu_branch_cost *branch_costs =
8221 aarch64_tune_params.branch_costs;
8222
8223 if (!speed_p || predictable_p)
8224 return branch_costs->predictable;
8225 else
8226 return branch_costs->unpredictable;
8227 }
8228
8229 /* Return true if the RTX X in mode MODE is a zero or sign extract
8230 usable in an ADD or SUB (extended register) instruction. */
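/* For example, (sign_extend:DI (reg:SI X)) matches the shift-free
   case below and so can be folded into an ADD/SUB (extended
   register) such as "add x0, x1, w2, sxtw".  */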
8231 static bool
8232 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8233 {
8234 /* Catch add with a sign extract.
8235 This is add_<optab><mode>_multp2. */
8236 if (GET_CODE (x) == SIGN_EXTRACT
8237 || GET_CODE (x) == ZERO_EXTRACT)
8238 {
8239 rtx op0 = XEXP (x, 0);
8240 rtx op1 = XEXP (x, 1);
8241 rtx op2 = XEXP (x, 2);
8242
8243 if (GET_CODE (op0) == MULT
8244 && CONST_INT_P (op1)
8245 && op2 == const0_rtx
8246 && CONST_INT_P (XEXP (op0, 1))
8247 && aarch64_is_extend_from_extract (mode,
8248 XEXP (op0, 1),
8249 op1))
8250 {
8251 return true;
8252 }
8253 }
8254 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8255 No shift. */
8256 else if (GET_CODE (x) == SIGN_EXTEND
8257 || GET_CODE (x) == ZERO_EXTEND)
8258 return REG_P (XEXP (x, 0));
8259
8260 return false;
8261 }
8262
8263 static bool
8264 aarch64_frint_unspec_p (unsigned int u)
8265 {
8266 switch (u)
8267 {
8268 case UNSPEC_FRINTZ:
8269 case UNSPEC_FRINTP:
8270 case UNSPEC_FRINTM:
8271 case UNSPEC_FRINTA:
8272 case UNSPEC_FRINTN:
8273 case UNSPEC_FRINTX:
8274 case UNSPEC_FRINTI:
8275 return true;
8276
8277 default:
8278 return false;
8279 }
8280 }
8281
8282 /* Return true iff X is an rtx that will match an extr instruction
8283 i.e. as described in the *extr<mode>5_insn family of patterns.
8284 OP0 and OP1 will be set to the operands of the shifts involved
8285 on success and will be NULL_RTX otherwise. */
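/* For example, (ior:DI (ashift:DI (reg X) (const_int 48))
   (lshiftrt:DI (reg Y) (const_int 16))) matches, since the two shift
   amounts sum to the 64-bit mode width, and corresponds to an EXTR
   with a shift amount of 16.  */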
8286
8287 static bool
8288 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
8289 {
8290 rtx op0, op1;
8291 scalar_int_mode mode;
8292 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
8293 return false;
8294
8295 *res_op0 = NULL_RTX;
8296 *res_op1 = NULL_RTX;
8297
8298 if (GET_CODE (x) != IOR)
8299 return false;
8300
8301 op0 = XEXP (x, 0);
8302 op1 = XEXP (x, 1);
8303
8304 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
8305 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
8306 {
8307 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
8308 if (GET_CODE (op1) == ASHIFT)
8309 std::swap (op0, op1);
8310
8311 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
8312 return false;
8313
8314 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
8315 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
8316
8317 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
8318 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
8319 {
8320 *res_op0 = XEXP (op0, 0);
8321 *res_op1 = XEXP (op1, 0);
8322 return true;
8323 }
8324 }
8325
8326 return false;
8327 }
8328
8329 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8330 storing it in *COST. Result is true if the total cost of the operation
8331 has now been calculated. */
8332 static bool
8333 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
8334 {
8335 rtx inner;
8336 rtx comparator;
8337 enum rtx_code cmpcode;
8338
8339 if (COMPARISON_P (op0))
8340 {
8341 inner = XEXP (op0, 0);
8342 comparator = XEXP (op0, 1);
8343 cmpcode = GET_CODE (op0);
8344 }
8345 else
8346 {
8347 inner = op0;
8348 comparator = const0_rtx;
8349 cmpcode = NE;
8350 }
8351
8352 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
8353 {
8354 /* Conditional branch. */
8355 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8356 return true;
8357 else
8358 {
8359 if (cmpcode == NE || cmpcode == EQ)
8360 {
8361 if (comparator == const0_rtx)
8362 {
8363 /* TBZ/TBNZ/CBZ/CBNZ. */
8364 if (GET_CODE (inner) == ZERO_EXTRACT)
8365 /* TBZ/TBNZ. */
8366 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
8367 ZERO_EXTRACT, 0, speed);
8368 else
8369 /* CBZ/CBNZ. */
8370 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
8371
8372 return true;
8373 }
8374 }
8375 else if (cmpcode == LT || cmpcode == GE)
8376 {
8377 /* TBZ/TBNZ. */
8378 if (comparator == const0_rtx)
8379 return true;
8380 }
8381 }
8382 }
8383 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8384 {
8385 /* CCMP. */
8386 if (GET_CODE (op1) == COMPARE)
8387 {
8388 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
8389 if (XEXP (op1, 1) == const0_rtx)
8390 *cost += 1;
8391 if (speed)
8392 {
8393 machine_mode mode = GET_MODE (XEXP (op1, 0));
8394 const struct cpu_cost_table *extra_cost
8395 = aarch64_tune_params.insn_extra_cost;
8396
8397 if (GET_MODE_CLASS (mode) == MODE_INT)
8398 *cost += extra_cost->alu.arith;
8399 else
8400 *cost += extra_cost->fp[mode == DFmode].compare;
8401 }
8402 return true;
8403 }
8404
8405 /* It's a conditional operation based on the status flags,
8406 so it must be some flavor of CSEL. */
8407
8408 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
8409 if (GET_CODE (op1) == NEG
8410 || GET_CODE (op1) == NOT
8411 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
8412 op1 = XEXP (op1, 0);
8413 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
8414 {
8415 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
8416 op1 = XEXP (op1, 0);
8417 op2 = XEXP (op2, 0);
8418 }
8419
8420 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
8421 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
8422 return true;
8423 }
8424
8425 /* We don't know what this is, cost all operands. */
8426 return false;
8427 }
8428
8429 /* Check whether X is a bitfield operation of the form shift + extend that
8430 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
8431 operand to which the bitfield operation is applied. Otherwise return
8432 NULL_RTX. */
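/* For example, (zero_extend:SI (lshiftrt:HI (reg:HI X) (const_int 3)))
   returns (reg:HI X), since the shift + zero-extend combination maps
   to a single UBFX instruction.  */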
8433
8434 static rtx
8435 aarch64_extend_bitfield_pattern_p (rtx x)
8436 {
8437 rtx_code outer_code = GET_CODE (x);
8438 machine_mode outer_mode = GET_MODE (x);
8439
8440 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
8441 && outer_mode != SImode && outer_mode != DImode)
8442 return NULL_RTX;
8443
8444 rtx inner = XEXP (x, 0);
8445 rtx_code inner_code = GET_CODE (inner);
8446 machine_mode inner_mode = GET_MODE (inner);
8447 rtx op = NULL_RTX;
8448
8449 switch (inner_code)
8450 {
8451 case ASHIFT:
8452 if (CONST_INT_P (XEXP (inner, 1))
8453 && (inner_mode == QImode || inner_mode == HImode))
8454 op = XEXP (inner, 0);
8455 break;
8456 case LSHIFTRT:
8457 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
8458 && (inner_mode == QImode || inner_mode == HImode))
8459 op = XEXP (inner, 0);
8460 break;
8461 case ASHIFTRT:
8462 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
8463 && (inner_mode == QImode || inner_mode == HImode))
8464 op = XEXP (inner, 0);
8465 break;
8466 default:
8467 break;
8468 }
8469
8470 return op;
8471 }
8472
8473 /* Return true if the mask and a shift amount from an RTX of the form
8474 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8475 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
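/* For example, in SImode a mask of 0x00ffff00 with a shift amount of 8
   is accepted: (0x00ffff00 >> 8) + 1 == 0x10000 is a power of two and
   the low 8 bits of the mask are clear, so the AND + shift pair maps
   to "ubfiz w0, w1, 8, 16".  */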
8476
8477 bool
8478 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
8479 rtx shft_amnt)
8480 {
8481 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
8482 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
8483 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
8484 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
8485 }
8486
8487 /* Calculate the cost of calculating X, storing it in *COST. Result
8488 is true if the total cost of the operation has now been calculated. */
8489 static bool
8490 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
8491 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
8492 {
8493 rtx op0, op1, op2;
8494 const struct cpu_cost_table *extra_cost
8495 = aarch64_tune_params.insn_extra_cost;
8496 int code = GET_CODE (x);
8497 scalar_int_mode int_mode;
8498
8499 /* By default, assume that everything has equivalent cost to the
8500 cheapest instruction. Any additional costs are applied as a delta
8501 above this default. */
8502 *cost = COSTS_N_INSNS (1);
8503
8504 switch (code)
8505 {
8506 case SET:
8507 /* The cost depends entirely on the operands to SET. */
8508 *cost = 0;
8509 op0 = SET_DEST (x);
8510 op1 = SET_SRC (x);
8511
8512 switch (GET_CODE (op0))
8513 {
8514 case MEM:
8515 if (speed)
8516 {
8517 rtx address = XEXP (op0, 0);
8518 if (VECTOR_MODE_P (mode))
8519 *cost += extra_cost->ldst.storev;
8520 else if (GET_MODE_CLASS (mode) == MODE_INT)
8521 *cost += extra_cost->ldst.store;
8522 else if (mode == SFmode)
8523 *cost += extra_cost->ldst.storef;
8524 else if (mode == DFmode)
8525 *cost += extra_cost->ldst.stored;
8526
8527 *cost +=
8528 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8529 0, speed));
8530 }
8531
8532 *cost += rtx_cost (op1, mode, SET, 1, speed);
8533 return true;
8534
8535 case SUBREG:
8536 if (! REG_P (SUBREG_REG (op0)))
8537 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
8538
8539 /* Fall through. */
8540 case REG:
8541 /* The cost is one per vector-register copied. */
8542 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
8543 {
8544 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
8545 *cost = COSTS_N_INSNS (nregs);
8546 }
8547 /* const0_rtx is in general free, but we will use an
8548 instruction to set a register to 0. */
8549 else if (REG_P (op1) || op1 == const0_rtx)
8550 {
8551 /* The cost is 1 per register copied. */
8552 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
8553 *cost = COSTS_N_INSNS (nregs);
8554 }
8555 else
8556 /* Cost is just the cost of the RHS of the set. */
8557 *cost += rtx_cost (op1, mode, SET, 1, speed);
8558 return true;
8559
8560 case ZERO_EXTRACT:
8561 case SIGN_EXTRACT:
8562 /* Bit-field insertion. Strip any redundant widening of
8563 the RHS to meet the width of the target. */
8564 if (GET_CODE (op1) == SUBREG)
8565 op1 = SUBREG_REG (op1);
8566 if ((GET_CODE (op1) == ZERO_EXTEND
8567 || GET_CODE (op1) == SIGN_EXTEND)
8568 && CONST_INT_P (XEXP (op0, 1))
8569 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
8570 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
8571 op1 = XEXP (op1, 0);
8572
8573 if (CONST_INT_P (op1))
8574 {
8575 /* MOV immediate is assumed to always be cheap. */
8576 *cost = COSTS_N_INSNS (1);
8577 }
8578 else
8579 {
8580 /* BFM. */
8581 if (speed)
8582 *cost += extra_cost->alu.bfi;
8583 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
8584 }
8585
8586 return true;
8587
8588 default:
8589 /* We can't make sense of this, assume default cost. */
8590 *cost = COSTS_N_INSNS (1);
8591 return false;
8592 }
8593 return false;
8594
8595 case CONST_INT:
8596 /* If an instruction can incorporate a constant within the
8597 instruction, the instruction's expression avoids calling
8598 rtx_cost() on the constant. If rtx_cost() is called on a
8599 constant, then it is usually because the constant must be
8600 moved into a register by one or more instructions.
8601
8602 The exception is constant 0, which can be expressed
8603 as XZR/WZR and is therefore free. The exception to this is
8604 if we have (set (reg) (const0_rtx)) in which case we must cost
8605 the move. However, we can catch that when we cost the SET, so
8606 we don't need to consider that here. */
8607 if (x == const0_rtx)
8608 *cost = 0;
8609 else
8610 {
8611 /* To an approximation, building any other constant is
8612 proportionally expensive to the number of instructions
8613 required to build that constant. This is true whether we
8614 are compiling for SPEED or otherwise. */
8615 if (!is_a <scalar_int_mode> (mode, &int_mode))
8616 int_mode = word_mode;
8617 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
8618 (NULL_RTX, x, false, int_mode));
8619 }
8620 return true;
8621
8622 case CONST_DOUBLE:
8623
8624 /* First determine number of instructions to do the move
8625 as an integer constant. */
8626 if (!aarch64_float_const_representable_p (x)
8627 && !aarch64_can_const_movi_rtx_p (x, mode)
8628 && aarch64_float_const_rtx_p (x))
8629 {
8630 unsigned HOST_WIDE_INT ival;
8631 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
8632 gcc_assert (succeed);
8633
8634 scalar_int_mode imode = (mode == HFmode
8635 ? SImode
8636 : int_mode_for_mode (mode).require ());
8637 int ncost = aarch64_internal_mov_immediate
8638 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8639 *cost += COSTS_N_INSNS (ncost);
8640 return true;
8641 }
8642
8643 if (speed)
8644 {
8645 /* mov[df,sf]_aarch64. */
8646 if (aarch64_float_const_representable_p (x))
8647 /* FMOV (scalar immediate). */
8648 *cost += extra_cost->fp[mode == DFmode].fpconst;
8649 else if (!aarch64_float_const_zero_rtx_p (x))
8650 {
8651 /* This will be a load from memory. */
8652 if (mode == DFmode)
8653 *cost += extra_cost->ldst.loadd;
8654 else
8655 *cost += extra_cost->ldst.loadf;
8656 }
8657 else
8658 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8659 or MOV v0.s[0], wzr - neither of which is modeled by the
8660 cost tables. Just use the default cost. */
8661 {
8662 }
8663 }
8664
8665 return true;
8666
8667 case MEM:
8668 if (speed)
8669 {
8670 /* For loads we want the base cost of a load, plus an
8671 approximation for the additional cost of the addressing
8672 mode. */
8673 rtx address = XEXP (x, 0);
8674 if (VECTOR_MODE_P (mode))
8675 *cost += extra_cost->ldst.loadv;
8676 else if (GET_MODE_CLASS (mode) == MODE_INT)
8677 *cost += extra_cost->ldst.load;
8678 else if (mode == SFmode)
8679 *cost += extra_cost->ldst.loadf;
8680 else if (mode == DFmode)
8681 *cost += extra_cost->ldst.loadd;
8682
8683 *cost +=
8684 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8685 0, speed));
8686 }
8687
8688 return true;
8689
8690 case NEG:
8691 op0 = XEXP (x, 0);
8692
8693 if (VECTOR_MODE_P (mode))
8694 {
8695 if (speed)
8696 {
8697 /* FNEG. */
8698 *cost += extra_cost->vect.alu;
8699 }
8700 return false;
8701 }
8702
8703 if (GET_MODE_CLASS (mode) == MODE_INT)
8704 {
8705 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8706 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8707 {
8708 /* CSETM. */
8709 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
8710 return true;
8711 }
8712
8713 /* Cost this as SUB wzr, X. */
8714 op0 = CONST0_RTX (mode);
8715 op1 = XEXP (x, 0);
8716 goto cost_minus;
8717 }
8718
8719 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8720 {
8721 /* Support (neg(fma...)) as a single instruction only if
8722 sign of zeros is unimportant. This matches the decision
8723 making in aarch64.md. */
8724 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
8725 {
8726 /* FNMADD. */
8727 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8728 return true;
8729 }
8730 if (GET_CODE (op0) == MULT)
8731 {
8732 /* FNMUL. */
8733 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8734 return true;
8735 }
8736 if (speed)
8737 /* FNEG. */
8738 *cost += extra_cost->fp[mode == DFmode].neg;
8739 return false;
8740 }
8741
8742 return false;
8743
8744 case CLRSB:
8745 case CLZ:
8746 if (speed)
8747 {
8748 if (VECTOR_MODE_P (mode))
8749 *cost += extra_cost->vect.alu;
8750 else
8751 *cost += extra_cost->alu.clz;
8752 }
8753
8754 return false;
8755
8756 case COMPARE:
8757 op0 = XEXP (x, 0);
8758 op1 = XEXP (x, 1);
8759
8760 if (op1 == const0_rtx
8761 && GET_CODE (op0) == AND)
8762 {
8763 x = op0;
8764 mode = GET_MODE (op0);
8765 goto cost_logic;
8766 }
8767
8768 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
8769 {
8770 /* TODO: A write to the CC flags possibly costs extra, this
8771 needs encoding in the cost tables. */
8772
8773 mode = GET_MODE (op0);
8774 /* ANDS. */
8775 if (GET_CODE (op0) == AND)
8776 {
8777 x = op0;
8778 goto cost_logic;
8779 }
8780
8781 if (GET_CODE (op0) == PLUS)
8782 {
8783 /* ADDS (and CMN alias). */
8784 x = op0;
8785 goto cost_plus;
8786 }
8787
8788 if (GET_CODE (op0) == MINUS)
8789 {
8790 /* SUBS. */
8791 x = op0;
8792 goto cost_minus;
8793 }
8794
8795 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
8796 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
8797 && CONST_INT_P (XEXP (op0, 2)))
8798 {
8799 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8800 Handle it here directly rather than going to cost_logic
8801 since we know the immediate generated for the TST is valid
8802 so we can avoid creating an intermediate rtx for it only
8803 for costing purposes. */
8804 if (speed)
8805 *cost += extra_cost->alu.logical;
8806
8807 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
8808 ZERO_EXTRACT, 0, speed);
8809 return true;
8810 }
8811
8812 if (GET_CODE (op1) == NEG)
8813 {
8814 /* CMN. */
8815 if (speed)
8816 *cost += extra_cost->alu.arith;
8817
8818 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
8819 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
8820 return true;
8821 }
8822
8823 /* CMP.
8824
8825 Compare can freely swap the order of operands, and
8826 canonicalization puts the more complex operation first.
8827 But the integer MINUS logic expects the shift/extend
8828 operation in op1. */
8829 if (! (REG_P (op0)
8830 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
8831 {
8832 op0 = XEXP (x, 1);
8833 op1 = XEXP (x, 0);
8834 }
8835 goto cost_minus;
8836 }
8837
8838 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
8839 {
8840 /* FCMP. */
8841 if (speed)
8842 *cost += extra_cost->fp[mode == DFmode].compare;
8843
8844 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
8845 {
8846 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
8847 /* FCMP supports constant 0.0 for no extra cost. */
8848 return true;
8849 }
8850 return false;
8851 }
8852
8853 if (VECTOR_MODE_P (mode))
8854 {
8855 /* Vector compare. */
8856 if (speed)
8857 *cost += extra_cost->vect.alu;
8858
8859 if (aarch64_float_const_zero_rtx_p (op1))
8860 {
8861 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8862 cost. */
8863 return true;
8864 }
8865 return false;
8866 }
8867 return false;
8868
8869 case MINUS:
8870 {
8871 op0 = XEXP (x, 0);
8872 op1 = XEXP (x, 1);
8873
8874 cost_minus:
8875 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
8876
8877 /* Detect valid immediates. */
8878 if ((GET_MODE_CLASS (mode) == MODE_INT
8879 || (GET_MODE_CLASS (mode) == MODE_CC
8880 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
8881 && CONST_INT_P (op1)
8882 && aarch64_uimm12_shift (INTVAL (op1)))
8883 {
8884 if (speed)
8885 /* SUB(S) (immediate). */
8886 *cost += extra_cost->alu.arith;
8887 return true;
8888 }
8889
8890 /* Look for SUB (extended register). */
8891 if (is_a <scalar_int_mode> (mode, &int_mode)
8892 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
8893 {
8894 if (speed)
8895 *cost += extra_cost->alu.extend_arith;
8896
8897 op1 = aarch64_strip_extend (op1, true);
8898 *cost += rtx_cost (op1, VOIDmode,
8899 (enum rtx_code) GET_CODE (op1), 0, speed);
8900 return true;
8901 }
8902
8903 rtx new_op1 = aarch64_strip_extend (op1, false);
8904
8905 /* Cost this as an FMA-alike operation. */
8906 if ((GET_CODE (new_op1) == MULT
8907 || aarch64_shift_p (GET_CODE (new_op1)))
8908 && code != COMPARE)
8909 {
8910 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
8911 (enum rtx_code) code,
8912 speed);
8913 return true;
8914 }
8915
8916 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
8917
8918 if (speed)
8919 {
8920 if (VECTOR_MODE_P (mode))
8921 {
8922 /* Vector SUB. */
8923 *cost += extra_cost->vect.alu;
8924 }
8925 else if (GET_MODE_CLASS (mode) == MODE_INT)
8926 {
8927 /* SUB(S). */
8928 *cost += extra_cost->alu.arith;
8929 }
8930 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8931 {
8932 /* FSUB. */
8933 *cost += extra_cost->fp[mode == DFmode].addsub;
8934 }
8935 }
8936 return true;
8937 }
8938
8939 case PLUS:
8940 {
8941 rtx new_op0;
8942
8943 op0 = XEXP (x, 0);
8944 op1 = XEXP (x, 1);
8945
8946 cost_plus:
8947 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8948 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8949 {
8950 /* CSINC. */
8951 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
8952 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8953 return true;
8954 }
8955
8956 if (GET_MODE_CLASS (mode) == MODE_INT
8957 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
8958 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
8959 {
8960 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
8961
8962 if (speed)
8963 /* ADD (immediate). */
8964 *cost += extra_cost->alu.arith;
8965 return true;
8966 }
8967
8968 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8969
8970 /* Look for ADD (extended register). */
8971 if (is_a <scalar_int_mode> (mode, &int_mode)
8972 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
8973 {
8974 if (speed)
8975 *cost += extra_cost->alu.extend_arith;
8976
8977 op0 = aarch64_strip_extend (op0, true);
8978 *cost += rtx_cost (op0, VOIDmode,
8979 (enum rtx_code) GET_CODE (op0), 0, speed);
8980 return true;
8981 }
8982
8983 /* Strip any extend, leave shifts behind as we will
8984 cost them through mult_cost. */
8985 new_op0 = aarch64_strip_extend (op0, false);
8986
8987 if (GET_CODE (new_op0) == MULT
8988 || aarch64_shift_p (GET_CODE (new_op0)))
8989 {
8990 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
8991 speed);
8992 return true;
8993 }
8994
8995 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
8996
8997 if (speed)
8998 {
8999 if (VECTOR_MODE_P (mode))
9000 {
9001 /* Vector ADD. */
9002 *cost += extra_cost->vect.alu;
9003 }
9004 else if (GET_MODE_CLASS (mode) == MODE_INT)
9005 {
9006 /* ADD. */
9007 *cost += extra_cost->alu.arith;
9008 }
9009 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9010 {
9011 /* FADD. */
9012 *cost += extra_cost->fp[mode == DFmode].addsub;
9013 }
9014 }
9015 return true;
9016 }
9017
9018 case BSWAP:
9019 *cost = COSTS_N_INSNS (1);
9020
9021 if (speed)
9022 {
9023 if (VECTOR_MODE_P (mode))
9024 *cost += extra_cost->vect.alu;
9025 else
9026 *cost += extra_cost->alu.rev;
9027 }
9028 return false;
9029
9030 case IOR:
9031 if (aarch_rev16_p (x))
9032 {
9033 *cost = COSTS_N_INSNS (1);
9034
9035 if (speed)
9036 {
9037 if (VECTOR_MODE_P (mode))
9038 *cost += extra_cost->vect.alu;
9039 else
9040 *cost += extra_cost->alu.rev;
9041 }
9042 return true;
9043 }
9044
9045 if (aarch64_extr_rtx_p (x, &op0, &op1))
9046 {
9047 *cost += rtx_cost (op0, mode, IOR, 0, speed);
9048 *cost += rtx_cost (op1, mode, IOR, 1, speed);
9049 if (speed)
9050 *cost += extra_cost->alu.shift;
9051
9052 return true;
9053 }
9054 /* Fall through. */
9055 case XOR:
9056 case AND:
9057 cost_logic:
9058 op0 = XEXP (x, 0);
9059 op1 = XEXP (x, 1);
9060
9061 if (VECTOR_MODE_P (mode))
9062 {
9063 if (speed)
9064 *cost += extra_cost->vect.alu;
9065 return true;
9066 }
9067
9068 if (code == AND
9069 && GET_CODE (op0) == MULT
9070 && CONST_INT_P (XEXP (op0, 1))
9071 && CONST_INT_P (op1)
9072 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9073 INTVAL (op1)) != 0)
9074 {
9075 /* This is a UBFM/SBFM. */
9076 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9077 if (speed)
9078 *cost += extra_cost->alu.bfx;
9079 return true;
9080 }
9081
9082 if (is_int_mode (mode, &int_mode))
9083 {
9084 if (CONST_INT_P (op1))
9085 {
9086 /* We have a mask + shift version of a UBFIZ
9087 i.e. the *andim_ashift<mode>_bfiz pattern. */
9088 if (GET_CODE (op0) == ASHIFT
9089 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9090 XEXP (op0, 1)))
9091 {
9092 *cost += rtx_cost (XEXP (op0, 0), int_mode,
9093 (enum rtx_code) code, 0, speed);
9094 if (speed)
9095 *cost += extra_cost->alu.bfx;
9096
9097 return true;
9098 }
9099 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9100 {
9101 /* We possibly get the immediate for free; this is not
9102 modelled. */
9103 *cost += rtx_cost (op0, int_mode,
9104 (enum rtx_code) code, 0, speed);
9105 if (speed)
9106 *cost += extra_cost->alu.logical;
9107
9108 return true;
9109 }
9110 }
9111 else
9112 {
9113 rtx new_op0 = op0;
9114
9115 /* Handle ORN, EON, or BIC. */
9116 if (GET_CODE (op0) == NOT)
9117 op0 = XEXP (op0, 0);
9118
9119 new_op0 = aarch64_strip_shift (op0);
9120
9121 /* If we had a shift on op0 then this is a logical-shift-
9122 by-register/immediate operation. Otherwise, this is just
9123 a logical operation. */
9124 if (speed)
9125 {
9126 if (new_op0 != op0)
9127 {
9128 /* Shift by immediate. */
9129 if (CONST_INT_P (XEXP (op0, 1)))
9130 *cost += extra_cost->alu.log_shift;
9131 else
9132 *cost += extra_cost->alu.log_shift_reg;
9133 }
9134 else
9135 *cost += extra_cost->alu.logical;
9136 }
9137
9138 /* In both cases we want to cost both operands. */
9139 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9140 0, speed);
9141 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9142 1, speed);
9143
9144 return true;
9145 }
9146 }
9147 return false;
9148
9149 case NOT:
9150 x = XEXP (x, 0);
9151 op0 = aarch64_strip_shift (x);
9152
9153 if (VECTOR_MODE_P (mode))
9154 {
9155 /* Vector NOT. */
9156 *cost += extra_cost->vect.alu;
9157 return false;
9158 }
9159
9160 /* MVN-shifted-reg. */
9161 if (op0 != x)
9162 {
9163 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9164
9165 if (speed)
9166 *cost += extra_cost->alu.log_shift;
9167
9168 return true;
9169 }
9170 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9171 Handle the second form here taking care that 'a' in the above can
9172 be a shift. */
9173 else if (GET_CODE (op0) == XOR)
9174 {
9175 rtx newop0 = XEXP (op0, 0);
9176 rtx newop1 = XEXP (op0, 1);
9177 rtx op0_stripped = aarch64_strip_shift (newop0);
9178
9179 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9180 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9181
9182 if (speed)
9183 {
9184 if (op0_stripped != newop0)
9185 *cost += extra_cost->alu.log_shift;
9186 else
9187 *cost += extra_cost->alu.logical;
9188 }
9189
9190 return true;
9191 }
9192 /* MVN. */
9193 if (speed)
9194 *cost += extra_cost->alu.logical;
9195
9196 return false;
9197
9198 case ZERO_EXTEND:
9199
9200 op0 = XEXP (x, 0);
9201 /* If a value is written in SI mode, then zero extended to DI
9202 mode, the operation will in general be free as a write to
9203 a 'w' register implicitly zeroes the upper bits of an 'x'
9204 register. However, if this is
9205
9206 (set (reg) (zero_extend (reg)))
9207
9208 we must cost the explicit register move. */
9209 if (mode == DImode
9210 && GET_MODE (op0) == SImode
9211 && outer == SET)
9212 {
9213 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
9214
9215 /* If OP_COST is non-zero, then the cost of the zero extend
9216 is effectively the cost of the inner operation. Otherwise
9217 we have a MOV instruction and we take the cost from the MOV
9218 itself. This is true independently of whether we are
9219 optimizing for space or time. */
9220 if (op_cost)
9221 *cost = op_cost;
9222
9223 return true;
9224 }
9225 else if (MEM_P (op0))
9226 {
9227 /* All loads can zero extend to any size for free. */
9228 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9229 return true;
9230 }
9231
9232 op0 = aarch64_extend_bitfield_pattern_p (x);
9233 if (op0)
9234 {
9235 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9236 if (speed)
9237 *cost += extra_cost->alu.bfx;
9238 return true;
9239 }
9240
9241 if (speed)
9242 {
9243 if (VECTOR_MODE_P (mode))
9244 {
9245 /* UMOV. */
9246 *cost += extra_cost->vect.alu;
9247 }
9248 else
9249 {
9250 /* We generate an AND instead of UXTB/UXTH. */
9251 *cost += extra_cost->alu.logical;
9252 }
9253 }
9254 return false;
9255
9256 case SIGN_EXTEND:
9257 if (MEM_P (XEXP (x, 0)))
9258 {
9259 /* LDRSH. */
9260 if (speed)
9261 {
9262 rtx address = XEXP (XEXP (x, 0), 0);
9263 *cost += extra_cost->ldst.load_sign_extend;
9264
9265 *cost +=
9266 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9267 0, speed));
9268 }
9269 return true;
9270 }
9271
9272 op0 = aarch64_extend_bitfield_pattern_p (x);
9273 if (op0)
9274 {
9275 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
9276 if (speed)
9277 *cost += extra_cost->alu.bfx;
9278 return true;
9279 }
9280
9281 if (speed)
9282 {
9283 if (VECTOR_MODE_P (mode))
9284 *cost += extra_cost->vect.alu;
9285 else
9286 *cost += extra_cost->alu.extend;
9287 }
9288 return false;
9289
9290 case ASHIFT:
9291 op0 = XEXP (x, 0);
9292 op1 = XEXP (x, 1);
9293
9294 if (CONST_INT_P (op1))
9295 {
9296 if (speed)
9297 {
9298 if (VECTOR_MODE_P (mode))
9299 {
9300 /* Vector shift (immediate). */
9301 *cost += extra_cost->vect.alu;
9302 }
9303 else
9304 {
9305 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
9306 aliases. */
9307 *cost += extra_cost->alu.shift;
9308 }
9309 }
9310
9311 /* We can incorporate zero/sign extend for free. */
9312 if (GET_CODE (op0) == ZERO_EXTEND
9313 || GET_CODE (op0) == SIGN_EXTEND)
9314 op0 = XEXP (op0, 0);
9315
9316 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
9317 return true;
9318 }
9319 else
9320 {
9321 if (VECTOR_MODE_P (mode))
9322 {
9323 if (speed)
9324 /* Vector shift (register). */
9325 *cost += extra_cost->vect.alu;
9326 }
9327 else
9328 {
9329 if (speed)
9330 /* LSLV. */
9331 *cost += extra_cost->alu.shift_reg;
9332
9333 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9334 && CONST_INT_P (XEXP (op1, 1))
9335 && known_eq (INTVAL (XEXP (op1, 1)),
9336 GET_MODE_BITSIZE (mode) - 1))
9337 {
9338 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9339 /* We already demanded XEXP (op1, 0) to be REG_P, so
9340 don't recurse into it. */
9341 return true;
9342 }
9343 }
9344 return false; /* All arguments need to be in registers. */
9345 }
9346
9347 case ROTATE:
9348 case ROTATERT:
9349 case LSHIFTRT:
9350 case ASHIFTRT:
9351 op0 = XEXP (x, 0);
9352 op1 = XEXP (x, 1);
9353
9354 if (CONST_INT_P (op1))
9355 {
9356 /* ASR (immediate) and friends. */
9357 if (speed)
9358 {
9359 if (VECTOR_MODE_P (mode))
9360 *cost += extra_cost->vect.alu;
9361 else
9362 *cost += extra_cost->alu.shift;
9363 }
9364
9365 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9366 return true;
9367 }
9368 else
9369 {
9370 if (VECTOR_MODE_P (mode))
9371 {
9372 if (speed)
9373 /* Vector shift (register). */
9374 *cost += extra_cost->vect.alu;
9375 }
9376 else
9377 {
9378 if (speed)
9379 /* ASR (register) and friends. */
9380 *cost += extra_cost->alu.shift_reg;
9381
9382 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9383 && CONST_INT_P (XEXP (op1, 1))
9384 && known_eq (INTVAL (XEXP (op1, 1)),
9385 GET_MODE_BITSIZE (mode) - 1))
9386 {
9387 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9388 /* We already demanded XEXP (op1, 0) to be REG_P, so
9389 don't recurse into it. */
9390 return true;
9391 }
9392 }
9393 return false; /* All arguments need to be in registers. */
9394 }
9395
9396 case SYMBOL_REF:
9397
9398 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
9399 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
9400 {
9401 /* LDR. */
9402 if (speed)
9403 *cost += extra_cost->ldst.load;
9404 }
9405 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
9406 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
9407 {
9408 /* ADRP, followed by ADD. */
9409 *cost += COSTS_N_INSNS (1);
9410 if (speed)
9411 *cost += 2 * extra_cost->alu.arith;
9412 }
9413 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
9414 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9415 {
9416 /* ADR. */
9417 if (speed)
9418 *cost += extra_cost->alu.arith;
9419 }
9420
9421 if (flag_pic)
9422 {
9423 /* One extra load instruction, after accessing the GOT. */
9424 *cost += COSTS_N_INSNS (1);
9425 if (speed)
9426 *cost += extra_cost->ldst.load;
9427 }
9428 return true;
9429
9430 case HIGH:
9431 case LO_SUM:
9432 /* ADRP/ADD (immediate). */
9433 if (speed)
9434 *cost += extra_cost->alu.arith;
9435 return true;
9436
9437 case ZERO_EXTRACT:
9438 case SIGN_EXTRACT:
9439 /* UBFX/SBFX. */
9440 if (speed)
9441 {
9442 if (VECTOR_MODE_P (mode))
9443 *cost += extra_cost->vect.alu;
9444 else
9445 *cost += extra_cost->alu.bfx;
9446 }
9447
9448 /* We can trust that the immediates used will be correct (there
9449 are no by-register forms), so we need only cost op0. */
9450 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
9451 return true;
9452
9453 case MULT:
9454 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
9455 /* aarch64_rtx_mult_cost always handles recursion to its
9456 operands. */
9457 return true;
9458
9459 case MOD:
9460 /* We can expand signed mod by power of 2 using a NEGS, two parallel
9461 ANDs and a CSNEG. Assume here that CSNEG costs the same as
9462 an unconditional negate. This case should only ever be reached through
9463 the set_smod_pow2_cheap check in expmed.c. */
9464 if (CONST_INT_P (XEXP (x, 1))
9465 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
9466 && (mode == SImode || mode == DImode))
9467 {
9468 /* We expand to 4 instructions. Reset the baseline. */
9469 *cost = COSTS_N_INSNS (4);
9470
9471 if (speed)
9472 *cost += 2 * extra_cost->alu.logical
9473 + 2 * extra_cost->alu.arith;
9474
9475 return true;
9476 }
9477
9478 /* Fall-through. */
9479 case UMOD:
9480 if (speed)
9481 {
9482 /* Slightly prefer UMOD over SMOD. */
9483 if (VECTOR_MODE_P (mode))
9484 *cost += extra_cost->vect.alu;
9485 else if (GET_MODE_CLASS (mode) == MODE_INT)
9486 *cost += (extra_cost->mult[mode == DImode].add
9487 + extra_cost->mult[mode == DImode].idiv
9488 + (code == MOD ? 1 : 0));
9489 }
9490 return false; /* All arguments need to be in registers. */
9491
9492 case DIV:
9493 case UDIV:
9494 case SQRT:
9495 if (speed)
9496 {
9497 if (VECTOR_MODE_P (mode))
9498 *cost += extra_cost->vect.alu;
9499 else if (GET_MODE_CLASS (mode) == MODE_INT)
9500 /* There is no integer SQRT, so only DIV and UDIV can get
9501 here. */
9502 *cost += (extra_cost->mult[mode == DImode].idiv
9503 /* Slightly prefer UDIV over SDIV. */
9504 + (code == DIV ? 1 : 0));
9505 else
9506 *cost += extra_cost->fp[mode == DFmode].div;
9507 }
9508 return false; /* All arguments need to be in registers. */
9509
9510 case IF_THEN_ELSE:
9511 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
9512 XEXP (x, 2), cost, speed);
9513
9514 case EQ:
9515 case NE:
9516 case GT:
9517 case GTU:
9518 case LT:
9519 case LTU:
9520 case GE:
9521 case GEU:
9522 case LE:
9523 case LEU:
9524
9525 return false; /* All arguments must be in registers. */
9526
9527 case FMA:
9528 op0 = XEXP (x, 0);
9529 op1 = XEXP (x, 1);
9530 op2 = XEXP (x, 2);
9531
9532 if (speed)
9533 {
9534 if (VECTOR_MODE_P (mode))
9535 *cost += extra_cost->vect.alu;
9536 else
9537 *cost += extra_cost->fp[mode == DFmode].fma;
9538 }
9539
9540 /* FMSUB, FNMADD, and FNMSUB are free. */
9541 if (GET_CODE (op0) == NEG)
9542 op0 = XEXP (op0, 0);
9543
9544 if (GET_CODE (op2) == NEG)
9545 op2 = XEXP (op2, 0);
9546
9547 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9548 and the by-element operand as operand 0. */
9549 if (GET_CODE (op1) == NEG)
9550 op1 = XEXP (op1, 0);
9551
9552 /* Catch vector-by-element operations. The by-element operand can
9553 either be (vec_duplicate (vec_select (x))) or just
9554 (vec_select (x)), depending on whether we are multiplying by
9555 a vector or a scalar.
9556
9557 Canonicalization is not very good in these cases: FMA4 will put the
9558 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
9559 if (GET_CODE (op0) == VEC_DUPLICATE)
9560 op0 = XEXP (op0, 0);
9561 else if (GET_CODE (op1) == VEC_DUPLICATE)
9562 op1 = XEXP (op1, 0);
9563
9564 if (GET_CODE (op0) == VEC_SELECT)
9565 op0 = XEXP (op0, 0);
9566 else if (GET_CODE (op1) == VEC_SELECT)
9567 op1 = XEXP (op1, 0);
9568
9569 /* If the remaining parameters are not registers,
9570 get the cost to put them into registers. */
9571 *cost += rtx_cost (op0, mode, FMA, 0, speed);
9572 *cost += rtx_cost (op1, mode, FMA, 1, speed);
9573 *cost += rtx_cost (op2, mode, FMA, 2, speed);
9574 return true;
9575
9576 case FLOAT:
9577 case UNSIGNED_FLOAT:
9578 if (speed)
9579 *cost += extra_cost->fp[mode == DFmode].fromint;
9580 return false;
9581
9582 case FLOAT_EXTEND:
9583 if (speed)
9584 {
9585 if (VECTOR_MODE_P (mode))
9586 {
9587 /* Vector widen. */
9588 *cost += extra_cost->vect.alu;
9589 }
9590 else
9591 *cost += extra_cost->fp[mode == DFmode].widen;
9592 }
9593 return false;
9594
9595 case FLOAT_TRUNCATE:
9596 if (speed)
9597 {
9598 if (VECTOR_MODE_P (mode))
9599 {
9600 /* Vector conversion. */
9601 *cost += extra_cost->vect.alu;
9602 }
9603 else
9604 *cost += extra_cost->fp[mode == DFmode].narrow;
9605 }
9606 return false;
9607
9608 case FIX:
9609 case UNSIGNED_FIX:
9610 x = XEXP (x, 0);
9611 /* Strip the rounding part. They will all be implemented
9612 by the fcvt* family of instructions anyway. */
9613 if (GET_CODE (x) == UNSPEC)
9614 {
9615 unsigned int uns_code = XINT (x, 1);
9616
9617 if (uns_code == UNSPEC_FRINTA
9618 || uns_code == UNSPEC_FRINTM
9619 || uns_code == UNSPEC_FRINTN
9620 || uns_code == UNSPEC_FRINTP
9621 || uns_code == UNSPEC_FRINTZ)
9622 x = XVECEXP (x, 0, 0);
9623 }
9624
9625 if (speed)
9626 {
9627 if (VECTOR_MODE_P (mode))
9628 *cost += extra_cost->vect.alu;
9629 else
9630 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
9631 }
9632
9633 /* We can combine fmul by a power of 2 followed by a fcvt into a single
9634 fixed-point fcvt. */
9635 if (GET_CODE (x) == MULT
9636 && ((VECTOR_MODE_P (mode)
9637 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
9638 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
9639 {
9640 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
9641 0, speed);
9642 return true;
9643 }
9644
9645 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
9646 return true;
9647
9648 case ABS:
9649 if (VECTOR_MODE_P (mode))
9650 {
9651 /* ABS (vector). */
9652 if (speed)
9653 *cost += extra_cost->vect.alu;
9654 }
9655 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9656 {
9657 op0 = XEXP (x, 0);
9658
9659 /* FABD, which is analogous to FADD. */
9660 if (GET_CODE (op0) == MINUS)
9661 {
9662 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
9663 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
9664 if (speed)
9665 *cost += extra_cost->fp[mode == DFmode].addsub;
9666
9667 return true;
9668 }
9669 /* Simple FABS is analogous to FNEG. */
9670 if (speed)
9671 *cost += extra_cost->fp[mode == DFmode].neg;
9672 }
9673 else
9674 {
9675 /* Integer ABS will either be split into
9676 two arithmetic instructions, or will be an ABS
9677 (scalar), which we don't model. */
9678 *cost = COSTS_N_INSNS (2);
9679 if (speed)
9680 *cost += 2 * extra_cost->alu.arith;
9681 }
9682 return false;
9683
9684 case SMAX:
9685 case SMIN:
9686 if (speed)
9687 {
9688 if (VECTOR_MODE_P (mode))
9689 *cost += extra_cost->vect.alu;
9690 else
9691 {
9692 /* FMAXNM/FMINNM/FMAX/FMIN.
9693 TODO: This may not be accurate for all implementations, but
9694 we do not model this in the cost tables. */
9695 *cost += extra_cost->fp[mode == DFmode].addsub;
9696 }
9697 }
9698 return false;
9699
9700 case UNSPEC:
9701 /* The floating point round to integer frint* instructions. */
9702 if (aarch64_frint_unspec_p (XINT (x, 1)))
9703 {
9704 if (speed)
9705 *cost += extra_cost->fp[mode == DFmode].roundint;
9706
9707 return false;
9708 }
9709
9710 if (XINT (x, 1) == UNSPEC_RBIT)
9711 {
9712 if (speed)
9713 *cost += extra_cost->alu.rev;
9714
9715 return false;
9716 }
9717 break;
9718
9719 case TRUNCATE:
9720
9721 /* Decompose <su>muldi3_highpart. */
9722 if (/* (truncate:DI */
9723 mode == DImode
9724 /* (lshiftrt:TI */
9725 && GET_MODE (XEXP (x, 0)) == TImode
9726 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
9727 /* (mult:TI */
9728 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
9729 /* (ANY_EXTEND:TI (reg:DI))
9730 (ANY_EXTEND:TI (reg:DI))) */
9731 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
9732 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
9733 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
9734 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
9735 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
9736 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
9737 /* (const_int 64) */
9738 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9739 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
9740 {
9741 /* UMULH/SMULH. */
9742 if (speed)
9743 *cost += extra_cost->mult[mode == DImode].extend;
9744 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
9745 mode, MULT, 0, speed);
9746 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
9747 mode, MULT, 1, speed);
9748 return true;
9749 }
9750
9751 /* Fall through. */
9752 default:
9753 break;
9754 }
9755
9756 if (dump_file
9757 && flag_aarch64_verbose_cost)
9758 fprintf (dump_file,
9759 "\nFailed to cost RTX. Assuming default cost.\n");
9760
9761 return true;
9762 }
9763
9764 /* Wrapper around aarch64_rtx_costs, dumps the partial or total cost
9765 calculated for X. This cost is stored in *COST. Returns true
9766 if the total cost of X was calculated. */
9767 static bool
9768 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
9769 int param, int *cost, bool speed)
9770 {
9771 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
9772
9773 if (dump_file
9774 && flag_aarch64_verbose_cost)
9775 {
9776 print_rtl_single (dump_file, x);
9777 fprintf (dump_file, "\n%s cost: %d (%s)\n",
9778 speed ? "Hot" : "Cold",
9779 *cost, result ? "final" : "partial");
9780 }
9781
9782 return result;
9783 }
9784
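/* As a rough illustration of the verbose cost dump produced above (the
   numeric value is made up): with a dump file active and
   flag_aarch64_verbose_cost set, each costed RTX is printed and followed
   by a summary line such as
   Hot cost: 8 (final)
   where "Hot"/"Cold" reflects SPEED, the number is the accumulated *COST
   and "final"/"partial" mirrors the boolean returned to the middle end.  */
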
9785 static int
9786 aarch64_register_move_cost (machine_mode mode,
9787 reg_class_t from_i, reg_class_t to_i)
9788 {
9789 enum reg_class from = (enum reg_class) from_i;
9790 enum reg_class to = (enum reg_class) to_i;
9791 const struct cpu_regmove_cost *regmove_cost
9792 = aarch64_tune_params.regmove_cost;
9793
9794 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
9795 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
9796 to = GENERAL_REGS;
9797
9798 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
9799 from = GENERAL_REGS;
9800
9801 /* Moving between GPR and stack cost is the same as GP2GP. */
9802 if ((from == GENERAL_REGS && to == STACK_REG)
9803 || (to == GENERAL_REGS && from == STACK_REG))
9804 return regmove_cost->GP2GP;
9805
9806 /* To/From the stack register, we move via the gprs. */
9807 if (to == STACK_REG || from == STACK_REG)
9808 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
9809 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
9810
9811 if (known_eq (GET_MODE_SIZE (mode), 16))
9812 {
9813 /* 128-bit operations on general registers require 2 instructions. */
9814 if (from == GENERAL_REGS && to == GENERAL_REGS)
9815 return regmove_cost->GP2GP * 2;
9816 else if (from == GENERAL_REGS)
9817 return regmove_cost->GP2FP * 2;
9818 else if (to == GENERAL_REGS)
9819 return regmove_cost->FP2GP * 2;
9820
9821 /* When AdvSIMD instructions are disabled it is not possible to move
9822 a 128-bit value directly between Q registers. This is handled in
9823 secondary reload. A general register is used as a scratch to move
9824 the upper DI value and the lower DI value is moved directly,
9825 hence the cost is the sum of three moves. */
9826 if (! TARGET_SIMD)
9827 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
9828
9829 return regmove_cost->FP2FP;
9830 }
9831
9832 if (from == GENERAL_REGS && to == GENERAL_REGS)
9833 return regmove_cost->GP2GP;
9834 else if (from == GENERAL_REGS)
9835 return regmove_cost->GP2FP;
9836 else if (to == GENERAL_REGS)
9837 return regmove_cost->FP2GP;
9838
9839 return regmove_cost->FP2FP;
9840 }
9841
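/* Worked reading of the cost composition above (the actual numbers come
   from the regmove_cost table of the active tuning): a 128-bit move
   between two FP registers with !TARGET_SIMD is costed as
   GP2FP + FP2GP + FP2FP, since the upper DImode half goes through a
   general-register scratch while the lower half is moved directly, whereas
   a 128-bit move between GENERAL_REGS and FP_REGS costs twice the
   corresponding 64-bit GP2FP or FP2GP figure.  */
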
9842 static int
9843 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
9844 reg_class_t rclass ATTRIBUTE_UNUSED,
9845 bool in ATTRIBUTE_UNUSED)
9846 {
9847 return aarch64_tune_params.memmov_cost;
9848 }
9849
9850 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9851 to optimize 1.0/sqrt. */
9852
9853 static bool
9854 use_rsqrt_p (machine_mode mode)
9855 {
9856 return (!flag_trapping_math
9857 && flag_unsafe_math_optimizations
9858 && ((aarch64_tune_params.approx_modes->recip_sqrt
9859 & AARCH64_APPROX_MODE (mode))
9860 || flag_mrecip_low_precision_sqrt));
9861 }
9862
9863 /* Function to decide when to use the approximate reciprocal square root
9864 builtin. */
9865
9866 static tree
9867 aarch64_builtin_reciprocal (tree fndecl)
9868 {
9869 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
9870
9871 if (!use_rsqrt_p (mode))
9872 return NULL_TREE;
9873 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
9874 }
9875
9876 typedef rtx (*rsqrte_type) (rtx, rtx);
9877
9878 /* Select reciprocal square root initial estimate insn depending on machine
9879 mode. */
9880
9881 static rsqrte_type
9882 get_rsqrte_type (machine_mode mode)
9883 {
9884 switch (mode)
9885 {
9886 case E_DFmode: return gen_aarch64_rsqrtedf;
9887 case E_SFmode: return gen_aarch64_rsqrtesf;
9888 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
9889 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
9890 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
9891 default: gcc_unreachable ();
9892 }
9893 }
9894
9895 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
9896
9897 /* Select reciprocal square root series step insn depending on machine mode. */
9898
9899 static rsqrts_type
9900 get_rsqrts_type (machine_mode mode)
9901 {
9902 switch (mode)
9903 {
9904 case E_DFmode: return gen_aarch64_rsqrtsdf;
9905 case E_SFmode: return gen_aarch64_rsqrtssf;
9906 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
9907 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
9908 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
9909 default: gcc_unreachable ();
9910 }
9911 }
9912
9913 /* Emit instruction sequence to compute either the approximate square root
9914 or its approximate reciprocal, depending on the flag RECP, and return
9915 whether the sequence was emitted or not. */
9916
9917 bool
9918 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
9919 {
9920 machine_mode mode = GET_MODE (dst);
9921
9922 if (GET_MODE_INNER (mode) == HFmode)
9923 {
9924 gcc_assert (!recp);
9925 return false;
9926 }
9927
9928 if (!recp)
9929 {
9930 if (!(flag_mlow_precision_sqrt
9931 || (aarch64_tune_params.approx_modes->sqrt
9932 & AARCH64_APPROX_MODE (mode))))
9933 return false;
9934
9935 if (flag_finite_math_only
9936 || flag_trapping_math
9937 || !flag_unsafe_math_optimizations
9938 || optimize_function_for_size_p (cfun))
9939 return false;
9940 }
9941 else
9942 /* Caller assumes we cannot fail. */
9943 gcc_assert (use_rsqrt_p (mode));
9944
9945 machine_mode mmsk = mode_for_int_vector (mode).require ();
9946 rtx xmsk = gen_reg_rtx (mmsk);
9947 if (!recp)
9948 /* When calculating the approximate square root, compare the
9949 argument with 0.0 and create a mask. */
9950 emit_insn (gen_rtx_SET (xmsk,
9951 gen_rtx_NEG (mmsk,
9952 gen_rtx_EQ (mmsk, src,
9953 CONST0_RTX (mode)))));
9954
9955 /* Estimate the approximate reciprocal square root. */
9956 rtx xdst = gen_reg_rtx (mode);
9957 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
9958
9959 /* Iterate over the series twice for SF and thrice for DF. */
9960 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9961
9962 /* Optionally iterate over the series once less for faster performance
9963 at the cost of some accuracy. */
9964 if ((recp && flag_mrecip_low_precision_sqrt)
9965 || (!recp && flag_mlow_precision_sqrt))
9966 iterations--;
9967
9968 /* Iterate over the series to calculate the approximate reciprocal square
9969 root. */
9970 rtx x1 = gen_reg_rtx (mode);
9971 while (iterations--)
9972 {
9973 rtx x2 = gen_reg_rtx (mode);
9974 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
9975
9976 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
9977
9978 if (iterations > 0)
9979 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
9980 }
9981
9982 if (!recp)
9983 {
9984 /* Qualify the approximate reciprocal square root when the argument is
9985 0.0 by squashing the intermediary result to 0.0. */
9986 rtx xtmp = gen_reg_rtx (mmsk);
9987 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
9988 gen_rtx_SUBREG (mmsk, xdst, 0)));
9989 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
9990
9991 /* Calculate the approximate square root. */
9992 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
9993 }
9994
9995 /* Finalize the approximation. */
9996 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
9997
9998 return true;
9999 }
10000
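/* A minimal scalar sketch of the Newton-Raphson refinement emitted above,
   kept out of the build; the helper name and the caller-supplied ESTIMATE
   are inventions for illustration and stand in for the hardware FRSQRTE
   result.  FRSQRTS computes (3 - d * x * x) / 2 and each iteration
   multiplies the current estimate by that factor, e.g. for d = 0.25 and a
   made-up estimate x = 1.75 a single step gives
   1.75 * (3 - 0.25 * 1.75 * 1.75) / 2 = 1.955078125, approaching the
   exact value 2.0.  */
#if 0
static double
aarch64_rsqrt_reference_model (double d, double estimate, int iterations)
{
  /* Stand-in for the FRSQRTE initial estimate.  */
  double x = estimate;
  while (iterations--)
    /* One FRSQRTS step, (3 - d * x * x) / 2, followed by the multiply that
       the expander emits via gen_rtx_MULT.  */
    x = x * (3.0 - d * x * x) / 2.0;
  /* Approximates 1 / sqrt (d); the non-reciprocal path above additionally
     multiplies by SRC to recover sqrt (d).  */
  return x;
}
#endif
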
10001 typedef rtx (*recpe_type) (rtx, rtx);
10002
10003 /* Select reciprocal initial estimate insn depending on machine mode. */
10004
10005 static recpe_type
10006 get_recpe_type (machine_mode mode)
10007 {
10008 switch (mode)
10009 {
10010 case E_SFmode: return (gen_aarch64_frecpesf);
10011 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
10012 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
10013 case E_DFmode: return (gen_aarch64_frecpedf);
10014 case E_V2DFmode: return (gen_aarch64_frecpev2df);
10015 default: gcc_unreachable ();
10016 }
10017 }
10018
10019 typedef rtx (*recps_type) (rtx, rtx, rtx);
10020
10021 /* Select reciprocal series step insn depending on machine mode. */
10022
10023 static recps_type
10024 get_recps_type (machine_mode mode)
10025 {
10026 switch (mode)
10027 {
10028 case E_SFmode: return (gen_aarch64_frecpssf);
10029 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
10030 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
10031 case E_DFmode: return (gen_aarch64_frecpsdf);
10032 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
10033 default: gcc_unreachable ();
10034 }
10035 }
10036
10037 /* Emit the instruction sequence to compute the approximation for the division
10038 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
10039
10040 bool
10041 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
10042 {
10043 machine_mode mode = GET_MODE (quo);
10044
10045 if (GET_MODE_INNER (mode) == HFmode)
10046 return false;
10047
10048 bool use_approx_division_p = (flag_mlow_precision_div
10049 || (aarch64_tune_params.approx_modes->division
10050 & AARCH64_APPROX_MODE (mode)));
10051
10052 if (!flag_finite_math_only
10053 || flag_trapping_math
10054 || !flag_unsafe_math_optimizations
10055 || optimize_function_for_size_p (cfun)
10056 || !use_approx_division_p)
10057 return false;
10058
10059 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
10060 return false;
10061
10062 /* Estimate the approximate reciprocal. */
10063 rtx xrcp = gen_reg_rtx (mode);
10064 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
10065
10066 /* Iterate over the series twice for SF and thrice for DF. */
10067 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10068
10069 /* Optionally iterate over the series once less for faster performance,
10070 at the cost of some accuracy. */
10071 if (flag_mlow_precision_div)
10072 iterations--;
10073
10074 /* Iterate over the series to calculate the approximate reciprocal. */
10075 rtx xtmp = gen_reg_rtx (mode);
10076 while (iterations--)
10077 {
10078 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
10079
10080 if (iterations > 0)
10081 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10082 }
10083
10084 if (num != CONST1_RTX (mode))
10085 {
10086 /* As the approximate reciprocal of DEN is already calculated, only
10087 calculate the approximate division when NUM is not 1.0. */
10088 rtx xnum = force_reg (mode, num);
10089 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10090 }
10091
10092 /* Finalize the approximation. */
10093 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10094 return true;
10095 }
10096
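/* Worked example of the FRECPS refinement used above (the starting value
   is made up; the real FRECPE estimate differs).  FRECPS computes
   2 - d * x, so each step maps x to x * (2 - d * x).  For d = 3.0 and
   x = 0.3:
   step 1: 0.3  * (2 - 3.0 * 0.3)  = 0.33
   step 2: 0.33 * (2 - 3.0 * 0.33) = 0.3333
   converging towards 1/3; the final multiply by NUM (when NUM is not 1.0)
   then yields the approximation of NUM / DEN.  */
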
10097 /* Return the number of instructions that can be issued per cycle. */
10098 static int
10099 aarch64_sched_issue_rate (void)
10100 {
10101 return aarch64_tune_params.issue_rate;
10102 }
10103
10104 static int
10105 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10106 {
10107 int issue_rate = aarch64_sched_issue_rate ();
10108
10109 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10110 }
10111
10112
10113 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10114 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
10115 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
10116
10117 static int
10118 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10119 int ready_index)
10120 {
10121 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10122 }
10123
10124
10125 /* Vectorizer cost model target hooks. */
10126
10127 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10128 static int
10129 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10130 tree vectype,
10131 int misalign ATTRIBUTE_UNUSED)
10132 {
10133 unsigned elements;
10134 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10135 bool fp = false;
10136
10137 if (vectype != NULL)
10138 fp = FLOAT_TYPE_P (vectype);
10139
10140 switch (type_of_cost)
10141 {
10142 case scalar_stmt:
10143 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10144
10145 case scalar_load:
10146 return costs->scalar_load_cost;
10147
10148 case scalar_store:
10149 return costs->scalar_store_cost;
10150
10151 case vector_stmt:
10152 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10153
10154 case vector_load:
10155 return costs->vec_align_load_cost;
10156
10157 case vector_store:
10158 return costs->vec_store_cost;
10159
10160 case vec_to_scalar:
10161 return costs->vec_to_scalar_cost;
10162
10163 case scalar_to_vec:
10164 return costs->scalar_to_vec_cost;
10165
10166 case unaligned_load:
10167 case vector_gather_load:
10168 return costs->vec_unalign_load_cost;
10169
10170 case unaligned_store:
10171 case vector_scatter_store:
10172 return costs->vec_unalign_store_cost;
10173
10174 case cond_branch_taken:
10175 return costs->cond_taken_branch_cost;
10176
10177 case cond_branch_not_taken:
10178 return costs->cond_not_taken_branch_cost;
10179
10180 case vec_perm:
10181 return costs->vec_permute_cost;
10182
10183 case vec_promote_demote:
10184 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10185
10186 case vec_construct:
10187 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10188 return elements / 2 + 1;
10189
10190 default:
10191 gcc_unreachable ();
10192 }
10193 }
10194
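/* Illustrative arithmetic for the vec_construct case above (the element
   counts are examples, not statements about any particular tuning): a
   vector type with four subparts, e.g. V4SF, is costed as 4 / 2 + 1 = 3,
   and an eight-element vector as 8 / 2 + 1 = 5.  */
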
10195 /* Implement targetm.vectorize.add_stmt_cost. */
10196 static unsigned
10197 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10198 struct _stmt_vec_info *stmt_info, int misalign,
10199 enum vect_cost_model_location where)
10200 {
10201 unsigned *cost = (unsigned *) data;
10202 unsigned retval = 0;
10203
10204 if (flag_vect_cost_model)
10205 {
10206 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10207 int stmt_cost =
10208 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10209
10210 /* Statements in an inner loop relative to the loop being
10211 vectorized are weighted more heavily. The value here is
10212 arbitrary and could potentially be improved with analysis. */
10213 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10214 count *= 50; /* FIXME */
10215
10216 retval = (unsigned) (count * stmt_cost);
10217 cost[where] += retval;
10218 }
10219
10220 return retval;
10221 }
10222
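/* Worked example of the weighting above (the statement cost of 4 is made
   up): a vector statement costed at 4 that sits in an inner loop relative
   to the loop being vectorized contributes 1 * 50 * 4 = 200 to the
   vect_body bucket, whereas the same statement outside such an inner loop
   contributes just 4.  */
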
10223 static void initialize_aarch64_code_model (struct gcc_options *);
10224
10225 /* Parse the TO_PARSE string and put the architecture struct that it
10226 selects into RES and the architectural features into ISA_FLAGS.
10227 Return an aarch64_parse_opt_result describing the parse result.
10228 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
10229
10230 static enum aarch64_parse_opt_result
10231 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10232 unsigned long *isa_flags)
10233 {
10234 char *ext;
10235 const struct processor *arch;
10236 char *str = (char *) alloca (strlen (to_parse) + 1);
10237 size_t len;
10238
10239 strcpy (str, to_parse);
10240
10241 ext = strchr (str, '+');
10242
10243 if (ext != NULL)
10244 len = ext - str;
10245 else
10246 len = strlen (str);
10247
10248 if (len == 0)
10249 return AARCH64_PARSE_MISSING_ARG;
10250
10251
10252 /* Loop through the list of supported ARCHes to find a match. */
10253 for (arch = all_architectures; arch->name != NULL; arch++)
10254 {
10255 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
10256 {
10257 unsigned long isa_temp = arch->flags;
10258
10259 if (ext != NULL)
10260 {
10261 /* TO_PARSE string contains at least one extension. */
10262 enum aarch64_parse_opt_result ext_res
10263 = aarch64_parse_extension (ext, &isa_temp);
10264
10265 if (ext_res != AARCH64_PARSE_OK)
10266 return ext_res;
10267 }
10268 /* Extension parsing was successful. Confirm the result
10269 arch and ISA flags. */
10270 *res = arch;
10271 *isa_flags = isa_temp;
10272 return AARCH64_PARSE_OK;
10273 }
10274 }
10275
10276 /* ARCH name not found in list. */
10277 return AARCH64_PARSE_INVALID_ARG;
10278 }
10279
10280 /* Parse the TO_PARSE string and put the result tuning in RES and the
10281 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
10282 describing the parse result. If there is an error parsing, RES and
10283 ISA_FLAGS are left unchanged. */
10284
10285 static enum aarch64_parse_opt_result
10286 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10287 unsigned long *isa_flags)
10288 {
10289 char *ext;
10290 const struct processor *cpu;
10291 char *str = (char *) alloca (strlen (to_parse) + 1);
10292 size_t len;
10293
10294 strcpy (str, to_parse);
10295
10296 ext = strchr (str, '+');
10297
10298 if (ext != NULL)
10299 len = ext - str;
10300 else
10301 len = strlen (str);
10302
10303 if (len == 0)
10304 return AARCH64_PARSE_MISSING_ARG;
10305
10306
10307 /* Loop through the list of supported CPUs to find a match. */
10308 for (cpu = all_cores; cpu->name != NULL; cpu++)
10309 {
10310 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
10311 {
10312 unsigned long isa_temp = cpu->flags;
10313
10314
10315 if (ext != NULL)
10316 {
10317 /* TO_PARSE string contains at least one extension. */
10318 enum aarch64_parse_opt_result ext_res
10319 = aarch64_parse_extension (ext, &isa_temp);
10320
10321 if (ext_res != AARCH64_PARSE_OK)
10322 return ext_res;
10323 }
10324 /* Extension parsing was successful. Confirm the result
10325 cpu and ISA flags. */
10326 *res = cpu;
10327 *isa_flags = isa_temp;
10328 return AARCH64_PARSE_OK;
10329 }
10330 }
10331
10332 /* CPU name not found in list. */
10333 return AARCH64_PARSE_INVALID_ARG;
10334 }
10335
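/* Example of the splitting performed by the two parsers above (the names
   are only illustrative; validity is decided by the all_cores /
   all_architectures tables and by aarch64_parse_extension): for
   "-mcpu=cortex-a57+crypto" the string is "cortex-a57+crypto", the '+'
   bounds the cpu name (LEN = 10) and the remainder "+crypto" is handed to
   aarch64_parse_extension to adjust the ISA flags.  */
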
10336 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10337 Return an aarch64_parse_opt_result describing the parse result.
10338 If the parsing fails, RES does not change. */
10339
10340 static enum aarch64_parse_opt_result
10341 aarch64_parse_tune (const char *to_parse, const struct processor **res)
10342 {
10343 const struct processor *cpu;
10344 char *str = (char *) alloca (strlen (to_parse) + 1);
10345
10346 strcpy (str, to_parse);
10347
10348 /* Loop through the list of supported CPUs to find a match. */
10349 for (cpu = all_cores; cpu->name != NULL; cpu++)
10350 {
10351 if (strcmp (cpu->name, str) == 0)
10352 {
10353 *res = cpu;
10354 return AARCH64_PARSE_OK;
10355 }
10356 }
10357
10358 /* CPU name not found in list. */
10359 return AARCH64_PARSE_INVALID_ARG;
10360 }
10361
10362 /* Parse TOKEN, which has length LENGTH to see if it is an option
10363 described in FLAG. If it is, return the index bit for that fusion type.
10364 If not, error (printing OPTION_NAME) and return zero. */
10365
10366 static unsigned int
10367 aarch64_parse_one_option_token (const char *token,
10368 size_t length,
10369 const struct aarch64_flag_desc *flag,
10370 const char *option_name)
10371 {
10372 for (; flag->name != NULL; flag++)
10373 {
10374 if (length == strlen (flag->name)
10375 && !strncmp (flag->name, token, length))
10376 return flag->flag;
10377 }
10378
10379 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10380 return 0;
10381 }
10382
10383 /* Parse OPTION which is a comma-separated list of flags to enable.
10384 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10385 default state we inherit from the CPU tuning structures. OPTION_NAME
10386 gives the top-level option we are parsing in the -moverride string,
10387 for use in error messages. */
10388
10389 static unsigned int
10390 aarch64_parse_boolean_options (const char *option,
10391 const struct aarch64_flag_desc *flags,
10392 unsigned int initial_state,
10393 const char *option_name)
10394 {
10395 const char separator = '.';
10396 const char* specs = option;
10397 const char* ntoken = option;
10398 unsigned int found_flags = initial_state;
10399
10400 while ((ntoken = strchr (specs, separator)))
10401 {
10402 size_t token_length = ntoken - specs;
10403 unsigned token_ops = aarch64_parse_one_option_token (specs,
10404 token_length,
10405 flags,
10406 option_name);
10407 /* If we find "none" (or, for simplicity's sake, an error) anywhere
10408 in the token stream, reset the supported operations. So:
10409
10410 adrp+add.cmp+branch.none.adrp+add
10411
10412 would have the result of turning on only adrp+add fusion. */
10413 if (!token_ops)
10414 found_flags = 0;
10415
10416 found_flags |= token_ops;
10417 specs = ++ntoken;
10418 }
10419
10420 /* If we ended with the separator, the string is ill-formed. */
10421 if (!(*specs))
10422 {
10423 error ("%s string ill-formed\n", option_name);
10424 return 0;
10425 }
10426
10427 /* We still have one more token to parse. */
10428 size_t token_length = strlen (specs);
10429 unsigned token_ops = aarch64_parse_one_option_token (specs,
10430 token_length,
10431 flags,
10432 option_name);
10433 if (!token_ops)
10434 found_flags = 0;
10435
10436 found_flags |= token_ops;
10437 return found_flags;
10438 }
10439
10440 /* Support for overriding instruction fusion. */
10441
10442 static void
10443 aarch64_parse_fuse_string (const char *fuse_string,
10444 struct tune_params *tune)
10445 {
10446 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
10447 aarch64_fusible_pairs,
10448 tune->fusible_ops,
10449 "fuse=");
10450 }
10451
10452 /* Support for overriding other tuning flags. */
10453
10454 static void
10455 aarch64_parse_tune_string (const char *tune_string,
10456 struct tune_params *tune)
10457 {
10458 tune->extra_tuning_flags
10459 = aarch64_parse_boolean_options (tune_string,
10460 aarch64_tuning_flags,
10461 tune->extra_tuning_flags,
10462 "tune=");
10463 }
10464
10465 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
10466 we understand. If it is, extract the option string and hand off to
10467 the appropriate function. */
10468
10469 void
10470 aarch64_parse_one_override_token (const char* token,
10471 size_t length,
10472 struct tune_params *tune)
10473 {
10474 const struct aarch64_tuning_override_function *fn
10475 = aarch64_tuning_override_functions;
10476
10477 const char *option_part = strchr (token, '=');
10478 if (!option_part)
10479 {
10480 error ("tuning string missing in option (%s)", token);
10481 return;
10482 }
10483
10484 /* Get the length of the option name. */
10485 length = option_part - token;
10486 /* Skip the '=' to get to the option string. */
10487 option_part++;
10488
10489 for (; fn->name != NULL; fn++)
10490 {
10491 if (!strncmp (fn->name, token, length))
10492 {
10493 fn->parse_override (option_part, tune);
10494 return;
10495 }
10496 }
10497
10498 error ("unknown tuning option (%s)", token);
10499 return;
10500 }
10501
10502 /* Set a default TLS size and clamp it to what the code model allows. */
10503
10504 static void
10505 initialize_aarch64_tls_size (struct gcc_options *opts)
10506 {
10507 if (aarch64_tls_size == 0)
10508 aarch64_tls_size = 24;
10509
10510 switch (opts->x_aarch64_cmodel_var)
10511 {
10512 case AARCH64_CMODEL_TINY:
10513 /* Both the default and maximum TLS size allowed under tiny are 1M, which
10514 needs two instructions to address, so we clamp the size to 24. */
10515 if (aarch64_tls_size > 24)
10516 aarch64_tls_size = 24;
10517 break;
10518 case AARCH64_CMODEL_SMALL:
10519 /* The maximum TLS size allowed under small is 4G. */
10520 if (aarch64_tls_size > 32)
10521 aarch64_tls_size = 32;
10522 break;
10523 case AARCH64_CMODEL_LARGE:
10524 /* The maximum TLS size allowed under large is 16E.
10525 FIXME: 16E should be 64bit, we only support 48bit offset now. */
10526 if (aarch64_tls_size > 48)
10527 aarch64_tls_size = 48;
10528 break;
10529 default:
10530 gcc_unreachable ();
10531 }
10532
10533 return;
10534 }
10535
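/* Example of the clamping above: with no explicit size, aarch64_tls_size
   defaults to 24 bits; a request for 32 bits is kept under the small code
   model but clamped down to 24 under tiny, and 48 is the effective ceiling
   under the large model.  */
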
10536 /* Parse STRING looking for options in the format:
10537 string :: option:string
10538 option :: name=substring
10539 name :: {a-z}
10540 substring :: defined by option. */
10541
10542 static void
10543 aarch64_parse_override_string (const char* input_string,
10544 struct tune_params* tune)
10545 {
10546 const char separator = ':';
10547 size_t string_length = strlen (input_string) + 1;
10548 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
10549 char *string = string_root;
10550 strncpy (string, input_string, string_length);
10551 string[string_length - 1] = '\0';
10552
10553 char* ntoken = string;
10554
10555 while ((ntoken = strchr (string, separator)))
10556 {
10557 size_t token_length = ntoken - string;
10558 /* Make this substring look like a string. */
10559 *ntoken = '\0';
10560 aarch64_parse_one_override_token (string, token_length, tune);
10561 string = ++ntoken;
10562 }
10563
10564 /* One last option to parse. */
10565 aarch64_parse_one_override_token (string, strlen (string), tune);
10566 free (string_root);
10567 }
10568
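/* Putting the two parsing layers together (the fusion pair names are the
   ones from the example further up; the tuning flag name is deliberately
   left as a placeholder): an override string such as
   fuse=adrp+add.cmp+branch:tune=<flag-name>
   is split on ':' into "fuse=..." and "tune=..." tokens here, and each
   token's value is then split on '.' by aarch64_parse_boolean_options.  */
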
10569
10570 static void
10571 aarch64_override_options_after_change_1 (struct gcc_options *opts)
10572 {
10573 /* PR 70044: We have to be careful about being called multiple times for the
10574 same function. This means all changes should be repeatable. */
10575
10576 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
10577 Disable the frame pointer flag so the mid-end will not use a frame
10578 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
10579 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
10580 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
10581 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
10582 if (opts->x_flag_omit_frame_pointer == 0)
10583 opts->x_flag_omit_frame_pointer = 2;
10584
10585 /* If not optimizing for size, set the default
10586 alignment to what the target wants. */
10587 if (!opts->x_optimize_size)
10588 {
10589 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
10590 opts->x_str_align_loops = aarch64_tune_params.loop_align;
10591 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
10592 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
10593 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
10594 opts->x_str_align_functions = aarch64_tune_params.function_align;
10595 }
10596
10597 /* We default to no pc-relative literal loads. */
10598
10599 aarch64_pcrelative_literal_loads = false;
10600
10601 /* If -mpc-relative-literal-loads is set on the command line, this
10602 implies that the user asked for PC relative literal loads. */
10603 if (opts->x_pcrelative_literal_loads == 1)
10604 aarch64_pcrelative_literal_loads = true;
10605
10606 /* In the tiny memory model it makes no sense to disallow PC relative
10607 literal pool loads. */
10608 if (aarch64_cmodel == AARCH64_CMODEL_TINY
10609 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10610 aarch64_pcrelative_literal_loads = true;
10611
10612 /* When enabling the lower precision Newton series for the square root, also
10613 enable it for the reciprocal square root, since the latter is an
10614 intermediary step for the former. */
10615 if (flag_mlow_precision_sqrt)
10616 flag_mrecip_low_precision_sqrt = true;
10617 }
10618
10619 /* 'Unpack' the internal tuning structs and update the options
10620 in OPTS. The caller must have set up selected_tune and selected_arch
10621 as all the other target-specific codegen decisions are
10622 derived from them. */
10623
10624 void
10625 aarch64_override_options_internal (struct gcc_options *opts)
10626 {
10627 aarch64_tune_flags = selected_tune->flags;
10628 aarch64_tune = selected_tune->sched_core;
10629 /* Make a copy of the tuning parameters attached to the core, which
10630 we may later overwrite. */
10631 aarch64_tune_params = *(selected_tune->tune);
10632 aarch64_architecture_version = selected_arch->architecture_version;
10633
10634 if (opts->x_aarch64_override_tune_string)
10635 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
10636 &aarch64_tune_params);
10637
10638 /* This target defaults to strict volatile bitfields. */
10639 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
10640 opts->x_flag_strict_volatile_bitfields = 1;
10641
10642 initialize_aarch64_code_model (opts);
10643 initialize_aarch64_tls_size (opts);
10644
10645 int queue_depth = 0;
10646 switch (aarch64_tune_params.autoprefetcher_model)
10647 {
10648 case tune_params::AUTOPREFETCHER_OFF:
10649 queue_depth = -1;
10650 break;
10651 case tune_params::AUTOPREFETCHER_WEAK:
10652 queue_depth = 0;
10653 break;
10654 case tune_params::AUTOPREFETCHER_STRONG:
10655 queue_depth = max_insn_queue_index + 1;
10656 break;
10657 default:
10658 gcc_unreachable ();
10659 }
10660
10661 /* We don't mind passing in global_options_set here as we don't use
10662 the *options_set structs anyway. */
10663 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
10664 queue_depth,
10665 opts->x_param_values,
10666 global_options_set.x_param_values);
10667
10668 /* Set up parameters to be used in prefetching algorithm. Do not
10669 override the defaults unless we are tuning for a core we have
10670 researched values for. */
10671 if (aarch64_tune_params.prefetch->num_slots > 0)
10672 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
10673 aarch64_tune_params.prefetch->num_slots,
10674 opts->x_param_values,
10675 global_options_set.x_param_values);
10676 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
10677 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
10678 aarch64_tune_params.prefetch->l1_cache_size,
10679 opts->x_param_values,
10680 global_options_set.x_param_values);
10681 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
10682 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
10683 aarch64_tune_params.prefetch->l1_cache_line_size,
10684 opts->x_param_values,
10685 global_options_set.x_param_values);
10686 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
10687 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
10688 aarch64_tune_params.prefetch->l2_cache_size,
10689 opts->x_param_values,
10690 global_options_set.x_param_values);
10691 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
10692 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
10693 0,
10694 opts->x_param_values,
10695 global_options_set.x_param_values);
10696 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
10697 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
10698 aarch64_tune_params.prefetch->minimum_stride,
10699 opts->x_param_values,
10700 global_options_set.x_param_values);
10701
10702 /* Use the alternative scheduling-pressure algorithm by default. */
10703 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
10704 opts->x_param_values,
10705 global_options_set.x_param_values);
10706
10707 /* Enable software prefetching at the specified optimization level for
10708 CPUs that have prefetch support. Lower the optimization level threshold by 1
10709 when profiling is enabled. */
10710 if (opts->x_flag_prefetch_loop_arrays < 0
10711 && !opts->x_optimize_size
10712 && aarch64_tune_params.prefetch->default_opt_level >= 0
10713 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
10714 opts->x_flag_prefetch_loop_arrays = 1;
10715
10716 aarch64_override_options_after_change_1 (opts);
10717 }
10718
10719 /* Print a hint with a suggestion for a core or architecture name that
10720 most closely resembles what the user passed in STR. ARCH is true if
10721 the user is asking for an architecture name. ARCH is false if the user
10722 is asking for a core name. */
10723
10724 static void
10725 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
10726 {
10727 auto_vec<const char *> candidates;
10728 const struct processor *entry = arch ? all_architectures : all_cores;
10729 for (; entry->name != NULL; entry++)
10730 candidates.safe_push (entry->name);
10731
10732 #ifdef HAVE_LOCAL_CPU_DETECT
10733 /* Also add "native" as a possible value. */
10734 if (arch)
10735 candidates.safe_push ("native");
10736 #endif
10737
10738 char *s;
10739 const char *hint = candidates_list_and_hint (str, s, candidates);
10740 if (hint)
10741 inform (input_location, "valid arguments are: %s;"
10742 " did you mean %qs?", s, hint);
10743 else
10744 inform (input_location, "valid arguments are: %s", s);
10745
10746 XDELETEVEC (s);
10747 }
10748
10749 /* Print a hint with a suggestion for a core name that most closely resembles
10750 what the user passed in STR. */
10751
10752 inline static void
10753 aarch64_print_hint_for_core (const char *str)
10754 {
10755 aarch64_print_hint_for_core_or_arch (str, false);
10756 }
10757
10758 /* Print a hint with a suggestion for an architecture name that most closely
10759 resembles what the user passed in STR. */
10760
10761 inline static void
10762 aarch64_print_hint_for_arch (const char *str)
10763 {
10764 aarch64_print_hint_for_core_or_arch (str, true);
10765 }
10766
10767 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
10768 specified in STR and throw errors if appropriate. Put the results, if
10769 they are valid, in RES and ISA_FLAGS. Return whether the option is
10770 valid. */
10771
10772 static bool
10773 aarch64_validate_mcpu (const char *str, const struct processor **res,
10774 unsigned long *isa_flags)
10775 {
10776 enum aarch64_parse_opt_result parse_res
10777 = aarch64_parse_cpu (str, res, isa_flags);
10778
10779 if (parse_res == AARCH64_PARSE_OK)
10780 return true;
10781
10782 switch (parse_res)
10783 {
10784 case AARCH64_PARSE_MISSING_ARG:
10785 error ("missing cpu name in %<-mcpu=%s%>", str);
10786 break;
10787 case AARCH64_PARSE_INVALID_ARG:
10788 error ("unknown value %qs for -mcpu", str);
10789 aarch64_print_hint_for_core (str);
10790 break;
10791 case AARCH64_PARSE_INVALID_FEATURE:
10792 error ("invalid feature modifier in %<-mcpu=%s%>", str);
10793 break;
10794 default:
10795 gcc_unreachable ();
10796 }
10797
10798 return false;
10799 }
10800
10801 /* Validate a command-line -march option. Parse the arch and extensions
10802 (if any) specified in STR and throw errors if appropriate. Put the
10803 results, if they are valid, in RES and ISA_FLAGS. Return whether the
10804 option is valid. */
10805
10806 static bool
10807 aarch64_validate_march (const char *str, const struct processor **res,
10808 unsigned long *isa_flags)
10809 {
10810 enum aarch64_parse_opt_result parse_res
10811 = aarch64_parse_arch (str, res, isa_flags);
10812
10813 if (parse_res == AARCH64_PARSE_OK)
10814 return true;
10815
10816 switch (parse_res)
10817 {
10818 case AARCH64_PARSE_MISSING_ARG:
10819 error ("missing arch name in %<-march=%s%>", str);
10820 break;
10821 case AARCH64_PARSE_INVALID_ARG:
10822 error ("unknown value %qs for -march", str);
10823 aarch64_print_hint_for_arch (str);
10824 break;
10825 case AARCH64_PARSE_INVALID_FEATURE:
10826 error ("invalid feature modifier in %<-march=%s%>", str);
10827 break;
10828 default:
10829 gcc_unreachable ();
10830 }
10831
10832 return false;
10833 }
10834
10835 /* Validate a command-line -mtune option. Parse the cpu
10836 specified in STR and throw errors if appropriate. Put the
10837 result, if it is valid, in RES. Return whether the option is
10838 valid. */
10839
10840 static bool
10841 aarch64_validate_mtune (const char *str, const struct processor **res)
10842 {
10843 enum aarch64_parse_opt_result parse_res
10844 = aarch64_parse_tune (str, res);
10845
10846 if (parse_res == AARCH64_PARSE_OK)
10847 return true;
10848
10849 switch (parse_res)
10850 {
10851 case AARCH64_PARSE_MISSING_ARG:
10852 error ("missing cpu name in %<-mtune=%s%>", str);
10853 break;
10854 case AARCH64_PARSE_INVALID_ARG:
10855 error ("unknown value %qs for -mtune", str);
10856 aarch64_print_hint_for_core (str);
10857 break;
10858 default:
10859 gcc_unreachable ();
10860 }
10861 return false;
10862 }
10863
10864 /* Return the CPU corresponding to the enum CPU.
10865 If it doesn't specify a cpu, return the default. */
10866
10867 static const struct processor *
10868 aarch64_get_tune_cpu (enum aarch64_processor cpu)
10869 {
10870 if (cpu != aarch64_none)
10871 return &all_cores[cpu];
10872
10873 /* The & 0x3f is to extract the bottom 6 bits that encode the
10874 default cpu as selected by the --with-cpu GCC configure option
10875 in config.gcc.
10876 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10877 flags mechanism should be reworked to make it more sane. */
10878 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10879 }
10880
10881 /* Return the architecture corresponding to the enum ARCH.
10882 If it doesn't specify a valid architecture, return the default. */
10883
10884 static const struct processor *
10885 aarch64_get_arch (enum aarch64_arch arch)
10886 {
10887 if (arch != aarch64_no_arch)
10888 return &all_architectures[arch];
10889
10890 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10891
10892 return &all_architectures[cpu->arch];
10893 }
10894
10895 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
10896
10897 static poly_uint16
10898 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
10899 {
10900 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10901 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10902 deciding which .md file patterns to use and when deciding whether
10903 something is a legitimate address or constant. */
10904 if (value == SVE_SCALABLE || value == SVE_128)
10905 return poly_uint16 (2, 2);
10906 else
10907 return (int) value / 64;
10908 }
10909
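/* Worked examples of the mapping above: SVE_256 gives 256 / 64 = 4 granules
   and SVE_2048 gives 32, while SVE_SCALABLE and SVE_128 both yield the
   runtime-variable poly_uint16 (2, 2) so that vector-length agnostic code
   is generated.  */
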
10910 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
10911 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
10912 tuning structs. In particular it must set selected_tune and
10913 aarch64_isa_flags that define the available ISA features and tuning
10914 decisions. It must also set selected_arch as this will be used to
10915 output the .arch asm tags for each function. */
10916
10917 static void
10918 aarch64_override_options (void)
10919 {
10920 unsigned long cpu_isa = 0;
10921 unsigned long arch_isa = 0;
10922 aarch64_isa_flags = 0;
10923
10924 bool valid_cpu = true;
10925 bool valid_tune = true;
10926 bool valid_arch = true;
10927
10928 selected_cpu = NULL;
10929 selected_arch = NULL;
10930 selected_tune = NULL;
10931
10932 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10933 If either of -march or -mtune is given, they override their
10934 respective component of -mcpu. */
10935 if (aarch64_cpu_string)
10936 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
10937 &cpu_isa);
10938
10939 if (aarch64_arch_string)
10940 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
10941 &arch_isa);
10942
10943 if (aarch64_tune_string)
10944 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
10945
10946 /* If the user did not specify a processor, choose the default
10947 one for them. This will be the CPU set during configuration using
10948 --with-cpu, otherwise it is "generic". */
10949 if (!selected_cpu)
10950 {
10951 if (selected_arch)
10952 {
10953 selected_cpu = &all_cores[selected_arch->ident];
10954 aarch64_isa_flags = arch_isa;
10955 explicit_arch = selected_arch->arch;
10956 }
10957 else
10958 {
10959 /* Get default configure-time CPU. */
10960 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
10961 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
10962 }
10963
10964 if (selected_tune)
10965 explicit_tune_core = selected_tune->ident;
10966 }
10967 /* If both -mcpu and -march are specified check that they are architecturally
10968 compatible, warn if they're not and prefer the -march ISA flags. */
10969 else if (selected_arch)
10970 {
10971 if (selected_arch->arch != selected_cpu->arch)
10972 {
10973 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10974 all_architectures[selected_cpu->arch].name,
10975 selected_arch->name);
10976 }
10977 aarch64_isa_flags = arch_isa;
10978 explicit_arch = selected_arch->arch;
10979 explicit_tune_core = selected_tune ? selected_tune->ident
10980 : selected_cpu->ident;
10981 }
10982 else
10983 {
10984 /* -mcpu but no -march. */
10985 aarch64_isa_flags = cpu_isa;
10986 explicit_tune_core = selected_tune ? selected_tune->ident
10987 : selected_cpu->ident;
10988 gcc_assert (selected_cpu);
10989 selected_arch = &all_architectures[selected_cpu->arch];
10990 explicit_arch = selected_arch->arch;
10991 }
10992
10993 /* Set the arch as well, as we will need it when outputting
10994 the .arch directive in assembly. */
10995 if (!selected_arch)
10996 {
10997 gcc_assert (selected_cpu);
10998 selected_arch = &all_architectures[selected_cpu->arch];
10999 }
11000
11001 if (!selected_tune)
11002 selected_tune = selected_cpu;
11003
11004 #ifndef HAVE_AS_MABI_OPTION
11005 /* The compiler may have been configured with 2.23.* binutils, which does
11006 not have support for ILP32. */
11007 if (TARGET_ILP32)
11008 error ("assembler does not support -mabi=ilp32");
11009 #endif
11010
11011 /* Convert -msve-vector-bits to a VG count. */
11012 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
11013
11014 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
11015 sorry ("return address signing is only supported for -mabi=lp64");
11016
11017 /* Make sure we properly set up the explicit options. */
11018 if ((aarch64_cpu_string && valid_cpu)
11019 || (aarch64_tune_string && valid_tune))
11020 gcc_assert (explicit_tune_core != aarch64_none);
11021
11022 if ((aarch64_cpu_string && valid_cpu)
11023 || (aarch64_arch_string && valid_arch))
11024 gcc_assert (explicit_arch != aarch64_no_arch);
11025
11026 aarch64_override_options_internal (&global_options);
11027
11028 /* Save these options as the default ones in case we push and pop them later
11029 while processing functions with potential target attributes. */
11030 target_option_default_node = target_option_current_node
11031 = build_target_option_node (&global_options);
11032 }
11033
11034 /* Implement targetm.override_options_after_change. */
11035
11036 static void
11037 aarch64_override_options_after_change (void)
11038 {
11039 aarch64_override_options_after_change_1 (&global_options);
11040 }
11041
11042 static struct machine_function *
11043 aarch64_init_machine_status (void)
11044 {
11045 struct machine_function *machine;
11046 machine = ggc_cleared_alloc<machine_function> ();
11047 return machine;
11048 }
11049
11050 void
11051 aarch64_init_expanders (void)
11052 {
11053 init_machine_status = aarch64_init_machine_status;
11054 }
11055
11056 /* A checking mechanism for the implementation of the various code models. */
11057 static void
11058 initialize_aarch64_code_model (struct gcc_options *opts)
11059 {
11060 if (opts->x_flag_pic)
11061 {
11062 switch (opts->x_aarch64_cmodel_var)
11063 {
11064 case AARCH64_CMODEL_TINY:
11065 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
11066 break;
11067 case AARCH64_CMODEL_SMALL:
11068 #ifdef HAVE_AS_SMALL_PIC_RELOCS
11069 aarch64_cmodel = (flag_pic == 2
11070 ? AARCH64_CMODEL_SMALL_PIC
11071 : AARCH64_CMODEL_SMALL_SPIC);
11072 #else
11073 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
11074 #endif
11075 break;
11076 case AARCH64_CMODEL_LARGE:
11077 sorry ("code model %qs with -f%s", "large",
11078 opts->x_flag_pic > 1 ? "PIC" : "pic");
11079 break;
11080 default:
11081 gcc_unreachable ();
11082 }
11083 }
11084 else
11085 aarch64_cmodel = opts->x_aarch64_cmodel_var;
11086 }
11087
11088 /* Implement TARGET_OPTION_SAVE. */
11089
11090 static void
11091 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
11092 {
11093 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
11094 }
11095
11096 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
11097 using the information saved in PTR. */
11098
11099 static void
11100 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
11101 {
11102 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
11103 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11104 opts->x_explicit_arch = ptr->x_explicit_arch;
11105 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
11106 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
11107
11108 aarch64_override_options_internal (opts);
11109 }
11110
11111 /* Implement TARGET_OPTION_PRINT. */
11112
11113 static void
11114 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
11115 {
11116 const struct processor *cpu
11117 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11118 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
11119 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
11120 std::string extension
11121 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
11122
11123 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
11124 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
11125 arch->name, extension.c_str ());
11126 }
11127
11128 static GTY(()) tree aarch64_previous_fndecl;
11129
11130 void
11131 aarch64_reset_previous_fndecl (void)
11132 {
11133 aarch64_previous_fndecl = NULL;
11134 }
11135
11136 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
11137 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
11138 make sure optab availability predicates are recomputed when necessary. */
11139
11140 void
11141 aarch64_save_restore_target_globals (tree new_tree)
11142 {
11143 if (TREE_TARGET_GLOBALS (new_tree))
11144 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
11145 else if (new_tree == target_option_default_node)
11146 restore_target_globals (&default_target_globals);
11147 else
11148 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
11149 }
11150
11151 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
11152 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
11153 of the function, if such exists. This function may be called multiple
11154 times on a single function so use aarch64_previous_fndecl to avoid
11155 setting up identical state. */
11156
11157 static void
11158 aarch64_set_current_function (tree fndecl)
11159 {
11160 if (!fndecl || fndecl == aarch64_previous_fndecl)
11161 return;
11162
11163 tree old_tree = (aarch64_previous_fndecl
11164 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
11165 : NULL_TREE);
11166
11167 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11168
11169 /* If current function has no attributes but the previous one did,
11170 use the default node. */
11171 if (!new_tree && old_tree)
11172 new_tree = target_option_default_node;
11173
11174 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
11175 the default have been handled by aarch64_save_restore_target_globals from
11176 aarch64_pragma_target_parse. */
11177 if (old_tree == new_tree)
11178 return;
11179
11180 aarch64_previous_fndecl = fndecl;
11181
11182 /* First set the target options. */
11183 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
11184
11185 aarch64_save_restore_target_globals (new_tree);
11186 }
11187
11188 /* Enum describing the various ways we can handle attributes.
11189 In many cases we can reuse the generic option handling machinery. */
11190
11191 enum aarch64_attr_opt_type
11192 {
11193 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
11194 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
11195 aarch64_attr_enum, /* Attribute sets an enum variable. */
11196 aarch64_attr_custom /* Attribute requires a custom handling function. */
11197 };
11198
11199 /* All the information needed to handle a target attribute.
11200 NAME is the name of the attribute.
11201 ATTR_TYPE specifies the type of behavior of the attribute as described
11202 in the definition of enum aarch64_attr_opt_type.
11203 ALLOW_NEG is true if the attribute supports a "no-" form.
11204 	   HANDLER is the function that takes the attribute string as an argument.
11205 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
11206 OPT_NUM is the enum specifying the option that the attribute modifies.
11207 This is needed for attributes that mirror the behavior of a command-line
11208 	   option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
11209 aarch64_attr_enum. */
11210
11211 struct aarch64_attribute_info
11212 {
11213 const char *name;
11214 enum aarch64_attr_opt_type attr_type;
11215 bool allow_neg;
11216 bool (*handler) (const char *);
11217 enum opt_code opt_num;
11218 };
11219
11220 /* Handle the ARCH_STR argument to the arch= target attribute. */
11221
11222 static bool
11223 aarch64_handle_attr_arch (const char *str)
11224 {
11225 const struct processor *tmp_arch = NULL;
11226 enum aarch64_parse_opt_result parse_res
11227 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
11228
11229 if (parse_res == AARCH64_PARSE_OK)
11230 {
11231 gcc_assert (tmp_arch);
11232 selected_arch = tmp_arch;
11233 explicit_arch = selected_arch->arch;
11234 return true;
11235 }
11236
11237 switch (parse_res)
11238 {
11239 case AARCH64_PARSE_MISSING_ARG:
11240 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
11241 break;
11242 case AARCH64_PARSE_INVALID_ARG:
11243 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
11244 aarch64_print_hint_for_arch (str);
11245 break;
11246 case AARCH64_PARSE_INVALID_FEATURE:
11247 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11248 break;
11249 default:
11250 gcc_unreachable ();
11251 }
11252
11253 return false;
11254 }
11255
11256 /* Handle the argument CPU_STR to the cpu= target attribute. */
11257
11258 static bool
11259 aarch64_handle_attr_cpu (const char *str)
11260 {
11261 const struct processor *tmp_cpu = NULL;
11262 enum aarch64_parse_opt_result parse_res
11263 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
11264
11265 if (parse_res == AARCH64_PARSE_OK)
11266 {
11267 gcc_assert (tmp_cpu);
11268 selected_tune = tmp_cpu;
11269 explicit_tune_core = selected_tune->ident;
11270
11271 selected_arch = &all_architectures[tmp_cpu->arch];
11272 explicit_arch = selected_arch->arch;
11273 return true;
11274 }
11275
11276 switch (parse_res)
11277 {
11278 case AARCH64_PARSE_MISSING_ARG:
11279 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
11280 break;
11281 case AARCH64_PARSE_INVALID_ARG:
11282 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
11283 aarch64_print_hint_for_core (str);
11284 break;
11285 case AARCH64_PARSE_INVALID_FEATURE:
11286 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11287 break;
11288 default:
11289 gcc_unreachable ();
11290 }
11291
11292 return false;
11293 }
11294
11295 /* Handle the argument STR to the tune= target attribute. */
11296
11297 static bool
11298 aarch64_handle_attr_tune (const char *str)
11299 {
11300 const struct processor *tmp_tune = NULL;
11301 enum aarch64_parse_opt_result parse_res
11302 = aarch64_parse_tune (str, &tmp_tune);
11303
11304 if (parse_res == AARCH64_PARSE_OK)
11305 {
11306 gcc_assert (tmp_tune);
11307 selected_tune = tmp_tune;
11308 explicit_tune_core = selected_tune->ident;
11309 return true;
11310 }
11311
11312 switch (parse_res)
11313 {
11314 case AARCH64_PARSE_INVALID_ARG:
11315 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
11316 aarch64_print_hint_for_core (str);
11317 break;
11318 default:
11319 gcc_unreachable ();
11320 }
11321
11322 return false;
11323 }
11324
11325 /* Parse an architecture extensions target attribute string specified in STR.
11326 For example "+fp+nosimd". Show any errors if needed. Return TRUE
11327 if successful. Update aarch64_isa_flags to reflect the ISA features
11328 modified. */
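/* As an illustration (extension names here are only examples), a string
   such as "+nothing+fp" first clears all extension bits and then enables
   just the FP extension, while "+simd+nocrypto" adds SIMD and removes
   crypto relative to the current aarch64_isa_flags.  */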
11329
11330 static bool
11331 aarch64_handle_attr_isa_flags (char *str)
11332 {
11333 enum aarch64_parse_opt_result parse_res;
11334 unsigned long isa_flags = aarch64_isa_flags;
11335
11336 /* We allow "+nothing" in the beginning to clear out all architectural
11337 features if the user wants to handpick specific features. */
11338 if (strncmp ("+nothing", str, 8) == 0)
11339 {
11340 isa_flags = 0;
11341 str += 8;
11342 }
11343
11344 parse_res = aarch64_parse_extension (str, &isa_flags);
11345
11346 if (parse_res == AARCH64_PARSE_OK)
11347 {
11348 aarch64_isa_flags = isa_flags;
11349 return true;
11350 }
11351
11352 switch (parse_res)
11353 {
11354 case AARCH64_PARSE_MISSING_ARG:
11355 error ("missing value in %<target()%> pragma or attribute");
11356 break;
11357
11358 case AARCH64_PARSE_INVALID_FEATURE:
11359 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11360 break;
11361
11362 default:
11363 gcc_unreachable ();
11364 }
11365
11366 return false;
11367 }
11368
11369 /* The target attributes that we support. On top of these we also support just
11370 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
11371 handled explicitly in aarch64_process_one_target_attr. */
11372
11373 static const struct aarch64_attribute_info aarch64_attributes[] =
11374 {
11375 { "general-regs-only", aarch64_attr_mask, false, NULL,
11376 OPT_mgeneral_regs_only },
11377 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
11378 OPT_mfix_cortex_a53_835769 },
11379 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
11380 OPT_mfix_cortex_a53_843419 },
11381 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
11382 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
11383 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
11384 OPT_momit_leaf_frame_pointer },
11385 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
11386 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
11387 OPT_march_ },
11388 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
11389 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
11390 OPT_mtune_ },
11391 { "sign-return-address", aarch64_attr_enum, false, NULL,
11392 OPT_msign_return_address_ },
11393 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
11394 };
11395
11396 /* Parse ARG_STR which contains the definition of one target attribute.
11397 Show appropriate errors if any or return true if the attribute is valid. */
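/* Typical forms of ARG_STR handled below include, for illustration:
   "arch=armv8.1-a" and "cpu=cortex-a57" (custom handlers),
   "no-strict-align" (a negated mask), "cmodel=small" (an enum) and
   "+crc" (raw ISA flags).  The specific names are example values only.  */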
11398
11399 static bool
11400 aarch64_process_one_target_attr (char *arg_str)
11401 {
11402 bool invert = false;
11403
11404 size_t len = strlen (arg_str);
11405
11406 if (len == 0)
11407 {
11408 error ("malformed %<target()%> pragma or attribute");
11409 return false;
11410 }
11411
11412 char *str_to_check = (char *) alloca (len + 1);
11413 strcpy (str_to_check, arg_str);
11414
11415 /* Skip leading whitespace. */
11416 while (*str_to_check == ' ' || *str_to_check == '\t')
11417 str_to_check++;
11418
11419 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11420 It is easier to detect and handle it explicitly here rather than going
11421 through the machinery for the rest of the target attributes in this
11422 function. */
11423 if (*str_to_check == '+')
11424 return aarch64_handle_attr_isa_flags (str_to_check);
11425
11426 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
11427 {
11428 invert = true;
11429 str_to_check += 3;
11430 }
11431 char *arg = strchr (str_to_check, '=');
11432
11433 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11434 and point ARG to "foo". */
11435 if (arg)
11436 {
11437 *arg = '\0';
11438 arg++;
11439 }
11440 const struct aarch64_attribute_info *p_attr;
11441 bool found = false;
11442 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
11443 {
11444 /* If the names don't match up, or the user has given an argument
11445 to an attribute that doesn't accept one, or didn't give an argument
11446 to an attribute that expects one, fail to match. */
11447 if (strcmp (str_to_check, p_attr->name) != 0)
11448 continue;
11449
11450 found = true;
11451 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
11452 || p_attr->attr_type == aarch64_attr_enum;
11453
11454 if (attr_need_arg_p ^ (arg != NULL))
11455 {
11456 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
11457 return false;
11458 }
11459
11460 /* If the name matches but the attribute does not allow "no-" versions
11461 then we can't match. */
11462 if (invert && !p_attr->allow_neg)
11463 {
11464 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
11465 return false;
11466 }
11467
11468 switch (p_attr->attr_type)
11469 {
11470 /* Has a custom handler registered.
11471 For example, cpu=, arch=, tune=. */
11472 case aarch64_attr_custom:
11473 gcc_assert (p_attr->handler);
11474 if (!p_attr->handler (arg))
11475 return false;
11476 break;
11477
11478 /* Either set or unset a boolean option. */
11479 case aarch64_attr_bool:
11480 {
11481 struct cl_decoded_option decoded;
11482
11483 generate_option (p_attr->opt_num, NULL, !invert,
11484 CL_TARGET, &decoded);
11485 aarch64_handle_option (&global_options, &global_options_set,
11486 &decoded, input_location);
11487 break;
11488 }
11489 /* Set or unset a bit in the target_flags. aarch64_handle_option
11490 should know what mask to apply given the option number. */
11491 case aarch64_attr_mask:
11492 {
11493 struct cl_decoded_option decoded;
11494 /* We only need to specify the option number.
11495 aarch64_handle_option will know which mask to apply. */
11496 decoded.opt_index = p_attr->opt_num;
11497 decoded.value = !invert;
11498 aarch64_handle_option (&global_options, &global_options_set,
11499 &decoded, input_location);
11500 break;
11501 }
11502 /* Use the option setting machinery to set an option to an enum. */
11503 case aarch64_attr_enum:
11504 {
11505 gcc_assert (arg);
11506 bool valid;
11507 int value;
11508 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
11509 &value, CL_TARGET);
11510 if (valid)
11511 {
11512 set_option (&global_options, NULL, p_attr->opt_num, value,
11513 NULL, DK_UNSPECIFIED, input_location,
11514 global_dc);
11515 }
11516 else
11517 {
11518 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
11519 }
11520 break;
11521 }
11522 default:
11523 gcc_unreachable ();
11524 }
11525 }
11526
11527 /* If we reached here we either have found an attribute and validated
11528 it or didn't match any. If we matched an attribute but its arguments
11529 were malformed we will have returned false already. */
11530 return found;
11531 }
11532
11533 /* Count how many times the character C appears in
11534 NULL-terminated string STR. */
11535
11536 static unsigned int
11537 num_occurences_in_str (char c, char *str)
11538 {
11539 unsigned int res = 0;
11540 while (*str != '\0')
11541 {
11542 if (*str == c)
11543 res++;
11544
11545 str++;
11546 }
11547
11548 return res;
11549 }
11550
11551 /* Parse the tree in ARGS that contains the target attribute information
11552 and update the global target options space. */
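/* For example, __attribute__ ((target ("arch=armv8-a,no-strict-align")))
   reaches this function as the STRING_CST "arch=armv8-a,no-strict-align";
   it is split on ',' and each piece is handed to
   aarch64_process_one_target_attr.  */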
11553
11554 bool
11555 aarch64_process_target_attr (tree args)
11556 {
11557 if (TREE_CODE (args) == TREE_LIST)
11558 {
11559 do
11560 {
11561 tree head = TREE_VALUE (args);
11562 if (head)
11563 {
11564 if (!aarch64_process_target_attr (head))
11565 return false;
11566 }
11567 args = TREE_CHAIN (args);
11568 } while (args);
11569
11570 return true;
11571 }
11572
11573 if (TREE_CODE (args) != STRING_CST)
11574 {
11575 error ("attribute %<target%> argument not a string");
11576 return false;
11577 }
11578
11579 size_t len = strlen (TREE_STRING_POINTER (args));
11580 char *str_to_check = (char *) alloca (len + 1);
11581 strcpy (str_to_check, TREE_STRING_POINTER (args));
11582
11583 if (len == 0)
11584 {
11585 error ("malformed %<target()%> pragma or attribute");
11586 return false;
11587 }
11588
11589 /* Used to catch empty spaces between commas i.e.
11590 attribute ((target ("attr1,,attr2"))). */
11591 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
11592
11593 /* Handle multiple target attributes separated by ','. */
11594 char *token = strtok (str_to_check, ",");
11595
11596 unsigned int num_attrs = 0;
11597 while (token)
11598 {
11599 num_attrs++;
11600 if (!aarch64_process_one_target_attr (token))
11601 {
11602 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
11603 return false;
11604 }
11605
11606 token = strtok (NULL, ",");
11607 }
11608
11609 if (num_attrs != num_commas + 1)
11610 {
11611 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
11612 return false;
11613 }
11614
11615 return true;
11616 }
11617
11618 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
11619 process attribute ((target ("..."))). */
11620
11621 static bool
11622 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
11623 {
11624 struct cl_target_option cur_target;
11625 bool ret;
11626 tree old_optimize;
11627 tree new_target, new_optimize;
11628 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11629
11630 /* If what we're processing is the current pragma string then the
11631 target option node is already stored in target_option_current_node
11632 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
11633 having to re-parse the string. This is especially useful to keep
11634 arm_neon.h compile times down since that header contains a lot
11635 of intrinsics enclosed in pragmas. */
11636 if (!existing_target && args == current_target_pragma)
11637 {
11638 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
11639 return true;
11640 }
11641 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11642
11643 old_optimize = build_optimization_node (&global_options);
11644 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11645
11646 /* If the function changed the optimization levels as well as setting
11647 target options, start with the optimizations specified. */
11648 if (func_optimize && func_optimize != old_optimize)
11649 cl_optimization_restore (&global_options,
11650 TREE_OPTIMIZATION (func_optimize));
11651
11652 /* Save the current target options to restore at the end. */
11653 cl_target_option_save (&cur_target, &global_options);
11654
11655 /* If fndecl already has some target attributes applied to it, unpack
11656 them so that we add this attribute on top of them, rather than
11657 overwriting them. */
11658 if (existing_target)
11659 {
11660 struct cl_target_option *existing_options
11661 = TREE_TARGET_OPTION (existing_target);
11662
11663 if (existing_options)
11664 cl_target_option_restore (&global_options, existing_options);
11665 }
11666 else
11667 cl_target_option_restore (&global_options,
11668 TREE_TARGET_OPTION (target_option_current_node));
11669
11670 ret = aarch64_process_target_attr (args);
11671
11672 /* Set up any additional state. */
11673 if (ret)
11674 {
11675 aarch64_override_options_internal (&global_options);
11676 /* Initialize SIMD builtins if we haven't already.
11677 Set current_target_pragma to NULL for the duration so that
11678 the builtin initialization code doesn't try to tag the functions
11679 being built with the attributes specified by any current pragma, thus
11680 going into an infinite recursion. */
11681 if (TARGET_SIMD)
11682 {
11683 tree saved_current_target_pragma = current_target_pragma;
11684 current_target_pragma = NULL;
11685 aarch64_init_simd_builtins ();
11686 current_target_pragma = saved_current_target_pragma;
11687 }
11688 new_target = build_target_option_node (&global_options);
11689 }
11690 else
11691 new_target = NULL;
11692
11693 new_optimize = build_optimization_node (&global_options);
11694
11695 if (fndecl && ret)
11696 {
11697 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
11698
11699 if (old_optimize != new_optimize)
11700 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
11701 }
11702
11703 cl_target_option_restore (&global_options, &cur_target);
11704
11705 if (old_optimize != new_optimize)
11706 cl_optimization_restore (&global_options,
11707 TREE_OPTIMIZATION (old_optimize));
11708 return ret;
11709 }
11710
11711 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
11712 tri-bool options (yes, no, don't care) and the default value is
11713 DEF, determine whether to reject inlining. */
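/* For example, with DONT_CARE == 2 and DEF == 1:
   (caller 1, callee 2) and (caller 0, callee 1) both permit inlining,
   whereas (caller 1, callee 0) rejects it, because the callee explicitly
   chose a non-default value that differs from the caller's.  */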
11714
11715 static bool
11716 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
11717 int dont_care, int def)
11718 {
11719 /* If the callee doesn't care, always allow inlining. */
11720 if (callee == dont_care)
11721 return true;
11722
11723 /* If the caller doesn't care, always allow inlining. */
11724 if (caller == dont_care)
11725 return true;
11726
11727 /* Otherwise, allow inlining if either the callee and caller values
11728 agree, or if the callee is using the default value. */
11729 return (callee == caller || callee == def);
11730 }
11731
11732 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
11733 to inline CALLEE into CALLER based on target-specific info.
11734 Make sure that the caller and callee have compatible architectural
11735 features. Then go through the other possible target attributes
11736 and see if they can block inlining. Try not to reject always_inline
11737 callees unless they are incompatible architecturally. */
11738
11739 static bool
11740 aarch64_can_inline_p (tree caller, tree callee)
11741 {
11742 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
11743 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
11744
11745 struct cl_target_option *caller_opts
11746 = TREE_TARGET_OPTION (caller_tree ? caller_tree
11747 : target_option_default_node);
11748
11749 struct cl_target_option *callee_opts
11750 = TREE_TARGET_OPTION (callee_tree ? callee_tree
11751 : target_option_default_node);
11752
11753 /* Callee's ISA flags should be a subset of the caller's. */
11754 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
11755 != callee_opts->x_aarch64_isa_flags)
11756 return false;
11757
11758 /* Allow non-strict aligned functions inlining into strict
11759 aligned ones. */
11760 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
11761 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
11762 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
11763 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
11764 return false;
11765
11766 bool always_inline = lookup_attribute ("always_inline",
11767 DECL_ATTRIBUTES (callee));
11768
11769 /* If the architectural features match up and the callee is always_inline
11770 then the other attributes don't matter. */
11771 if (always_inline)
11772 return true;
11773
11774 if (caller_opts->x_aarch64_cmodel_var
11775 != callee_opts->x_aarch64_cmodel_var)
11776 return false;
11777
11778 if (caller_opts->x_aarch64_tls_dialect
11779 != callee_opts->x_aarch64_tls_dialect)
11780 return false;
11781
11782 	  /* Honour explicit requests to work around errata. */
11783 if (!aarch64_tribools_ok_for_inlining_p (
11784 caller_opts->x_aarch64_fix_a53_err835769,
11785 callee_opts->x_aarch64_fix_a53_err835769,
11786 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
11787 return false;
11788
11789 if (!aarch64_tribools_ok_for_inlining_p (
11790 caller_opts->x_aarch64_fix_a53_err843419,
11791 callee_opts->x_aarch64_fix_a53_err843419,
11792 2, TARGET_FIX_ERR_A53_843419))
11793 return false;
11794
11795 /* If the user explicitly specified -momit-leaf-frame-pointer for the
11796 	     caller and callee and they don't match up, reject inlining. */
11797 if (!aarch64_tribools_ok_for_inlining_p (
11798 caller_opts->x_flag_omit_leaf_frame_pointer,
11799 callee_opts->x_flag_omit_leaf_frame_pointer,
11800 2, 1))
11801 return false;
11802
11803 /* If the callee has specific tuning overrides, respect them. */
11804 if (callee_opts->x_aarch64_override_tune_string != NULL
11805 && caller_opts->x_aarch64_override_tune_string == NULL)
11806 return false;
11807
11808 /* If the user specified tuning override strings for the
11809 caller and callee and they don't match up, reject inlining.
11810 We just do a string compare here, we don't analyze the meaning
11811 of the string, as it would be too costly for little gain. */
11812 if (callee_opts->x_aarch64_override_tune_string
11813 && caller_opts->x_aarch64_override_tune_string
11814 && (strcmp (callee_opts->x_aarch64_override_tune_string,
11815 caller_opts->x_aarch64_override_tune_string) != 0))
11816 return false;
11817
11818 return true;
11819 }
11820
11821 /* Return true if SYMBOL_REF X binds locally. */
11822
11823 static bool
11824 aarch64_symbol_binds_local_p (const_rtx x)
11825 {
11826 return (SYMBOL_REF_DECL (x)
11827 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
11828 : SYMBOL_REF_LOCAL_P (x));
11829 }
11830
11831 /* Return true if SYMBOL_REF X is thread-local. */
11832 static bool
11833 aarch64_tls_symbol_p (rtx x)
11834 {
11835 if (! TARGET_HAVE_TLS)
11836 return false;
11837
11838 if (GET_CODE (x) != SYMBOL_REF)
11839 return false;
11840
11841 return SYMBOL_REF_TLS_MODEL (x) != 0;
11842 }
11843
11844 /* Classify a TLS symbol into one of the TLS kinds. */
11845 enum aarch64_symbol_type
11846 aarch64_classify_tls_symbol (rtx x)
11847 {
11848 enum tls_model tls_kind = tls_symbolic_operand_type (x);
11849
11850 switch (tls_kind)
11851 {
11852 case TLS_MODEL_GLOBAL_DYNAMIC:
11853 case TLS_MODEL_LOCAL_DYNAMIC:
11854 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
11855
11856 case TLS_MODEL_INITIAL_EXEC:
11857 switch (aarch64_cmodel)
11858 {
11859 case AARCH64_CMODEL_TINY:
11860 case AARCH64_CMODEL_TINY_PIC:
11861 return SYMBOL_TINY_TLSIE;
11862 default:
11863 return SYMBOL_SMALL_TLSIE;
11864 }
11865
11866 case TLS_MODEL_LOCAL_EXEC:
11867 if (aarch64_tls_size == 12)
11868 return SYMBOL_TLSLE12;
11869 else if (aarch64_tls_size == 24)
11870 return SYMBOL_TLSLE24;
11871 else if (aarch64_tls_size == 32)
11872 return SYMBOL_TLSLE32;
11873 else if (aarch64_tls_size == 48)
11874 return SYMBOL_TLSLE48;
11875 else
11876 gcc_unreachable ();
11877
11878 case TLS_MODEL_EMULATED:
11879 case TLS_MODEL_NONE:
11880 return SYMBOL_FORCE_TO_MEM;
11881
11882 default:
11883 gcc_unreachable ();
11884 }
11885 }
11886
11887 /* Return the correct method for accessing X + OFFSET, where X is either
11888 a SYMBOL_REF or LABEL_REF. */
11889
11890 enum aarch64_symbol_type
11891 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
11892 {
11893 if (GET_CODE (x) == LABEL_REF)
11894 {
11895 switch (aarch64_cmodel)
11896 {
11897 case AARCH64_CMODEL_LARGE:
11898 return SYMBOL_FORCE_TO_MEM;
11899
11900 case AARCH64_CMODEL_TINY_PIC:
11901 case AARCH64_CMODEL_TINY:
11902 return SYMBOL_TINY_ABSOLUTE;
11903
11904 case AARCH64_CMODEL_SMALL_SPIC:
11905 case AARCH64_CMODEL_SMALL_PIC:
11906 case AARCH64_CMODEL_SMALL:
11907 return SYMBOL_SMALL_ABSOLUTE;
11908
11909 default:
11910 gcc_unreachable ();
11911 }
11912 }
11913
11914 if (GET_CODE (x) == SYMBOL_REF)
11915 {
11916 if (aarch64_tls_symbol_p (x))
11917 return aarch64_classify_tls_symbol (x);
11918
11919 switch (aarch64_cmodel)
11920 {
11921 case AARCH64_CMODEL_TINY:
11922 /* When we retrieve symbol + offset address, we have to make sure
11923 the offset does not cause overflow of the final address. But
11924 we have no way of knowing the address of symbol at compile time
11925 so we can't accurately say if the distance between the PC and
11926 	 symbol + offset is outside the addressable range of +/-1M in the
11927 TINY code model. So we rely on images not being greater than
11928 1M and cap the offset at 1M and anything beyond 1M will have to
11929 be loaded using an alternative mechanism. Furthermore if the
11930 symbol is a weak reference to something that isn't known to
11931 resolve to a symbol in this module, then force to memory. */
11932 if ((SYMBOL_REF_WEAK (x)
11933 && !aarch64_symbol_binds_local_p (x))
11934 || !IN_RANGE (offset, -1048575, 1048575))
11935 return SYMBOL_FORCE_TO_MEM;
11936 return SYMBOL_TINY_ABSOLUTE;
11937
11938 case AARCH64_CMODEL_SMALL:
11939 /* Same reasoning as the tiny code model, but the offset cap here is
11940 4G. */
11941 if ((SYMBOL_REF_WEAK (x)
11942 && !aarch64_symbol_binds_local_p (x))
11943 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
11944 HOST_WIDE_INT_C (4294967264)))
11945 return SYMBOL_FORCE_TO_MEM;
11946 return SYMBOL_SMALL_ABSOLUTE;
11947
11948 case AARCH64_CMODEL_TINY_PIC:
11949 if (!aarch64_symbol_binds_local_p (x))
11950 return SYMBOL_TINY_GOT;
11951 return SYMBOL_TINY_ABSOLUTE;
11952
11953 case AARCH64_CMODEL_SMALL_SPIC:
11954 case AARCH64_CMODEL_SMALL_PIC:
11955 if (!aarch64_symbol_binds_local_p (x))
11956 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
11957 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
11958 return SYMBOL_SMALL_ABSOLUTE;
11959
11960 case AARCH64_CMODEL_LARGE:
11961 /* This is alright even in PIC code as the constant
11962 pool reference is always PC relative and within
11963 the same translation unit. */
11964 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
11965 return SYMBOL_SMALL_ABSOLUTE;
11966 else
11967 return SYMBOL_FORCE_TO_MEM;
11968
11969 default:
11970 gcc_unreachable ();
11971 }
11972 }
11973
11974 /* By default push everything into the constant pool. */
11975 return SYMBOL_FORCE_TO_MEM;
11976 }
11977
11978 bool
11979 aarch64_constant_address_p (rtx x)
11980 {
11981 return (CONSTANT_P (x) && memory_address_p (DImode, x));
11982 }
11983
11984 bool
11985 aarch64_legitimate_pic_operand_p (rtx x)
11986 {
11987 if (GET_CODE (x) == SYMBOL_REF
11988 || (GET_CODE (x) == CONST
11989 && GET_CODE (XEXP (x, 0)) == PLUS
11990 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
11991 return false;
11992
11993 return true;
11994 }
11995
11996 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
11997 that should be rematerialized rather than spilled. */
11998
11999 static bool
12000 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
12001 {
12002 /* Support CSE and rematerialization of common constants. */
12003 if (CONST_INT_P (x)
12004 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
12005 || GET_CODE (x) == CONST_VECTOR)
12006 return true;
12007
12008 /* Do not allow vector struct mode constants for Advanced SIMD.
12009 We could support 0 and -1 easily, but they need support in
12010 aarch64-simd.md. */
12011 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12012 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
12013 return false;
12014
12015 /* Only accept variable-length vector constants if they can be
12016 handled directly.
12017
12018 ??? It would be possible to handle rematerialization of other
12019 constants via secondary reloads. */
12020 if (vec_flags & VEC_ANY_SVE)
12021 return aarch64_simd_valid_immediate (x, NULL);
12022
12023 if (GET_CODE (x) == HIGH)
12024 x = XEXP (x, 0);
12025
12026 /* Accept polynomial constants that can be calculated by using the
12027 destination of a move as the sole temporary. Constants that
12028 require a second temporary cannot be rematerialized (they can't be
12029 forced to memory and also aren't legitimate constants). */
12030 poly_int64 offset;
12031 if (poly_int_rtx_p (x, &offset))
12032 return aarch64_offset_temporaries (false, offset) <= 1;
12033
12034 /* If an offset is being added to something else, we need to allow the
12035 base to be moved into the destination register, meaning that there
12036 are no free temporaries for the offset. */
12037 x = strip_offset (x, &offset);
12038 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
12039 return false;
12040
12041 /* Do not allow const (plus (anchor_symbol, const_int)). */
12042 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
12043 return false;
12044
12045 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
12046 so spilling them is better than rematerialization. */
12047 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
12048 return true;
12049
12050 /* Label references are always constant. */
12051 if (GET_CODE (x) == LABEL_REF)
12052 return true;
12053
12054 return false;
12055 }
12056
12057 rtx
12058 aarch64_load_tp (rtx target)
12059 {
12060 if (!target
12061 || GET_MODE (target) != Pmode
12062 || !register_operand (target, Pmode))
12063 target = gen_reg_rtx (Pmode);
12064
12065 /* Can return in any reg. */
12066 emit_insn (gen_aarch64_load_tp_hard (target));
12067 return target;
12068 }
12069
12070 /* On AAPCS systems, this is the "struct __va_list". */
12071 static GTY(()) tree va_list_type;
12072
12073 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
12074 Return the type to use as __builtin_va_list.
12075
12076 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
12077
12078 struct __va_list
12079 {
12080 void *__stack;
12081 void *__gr_top;
12082 void *__vr_top;
12083 int __gr_offs;
12084 int __vr_offs;
12085 }; */
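/* In this layout __gr_offs and __vr_offs are initialized (in
   aarch64_expand_builtin_va_start below) to the negated sizes of the GR
   and VR save areas and climb towards zero as register arguments are
   consumed; a non-negative offset therefore means the next argument is
   taken from the stack via __stack.  */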
12086
12087 static tree
12088 aarch64_build_builtin_va_list (void)
12089 {
12090 tree va_list_name;
12091 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12092
12093 /* Create the type. */
12094 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
12095 /* Give it the required name. */
12096 va_list_name = build_decl (BUILTINS_LOCATION,
12097 TYPE_DECL,
12098 get_identifier ("__va_list"),
12099 va_list_type);
12100 DECL_ARTIFICIAL (va_list_name) = 1;
12101 TYPE_NAME (va_list_type) = va_list_name;
12102 TYPE_STUB_DECL (va_list_type) = va_list_name;
12103
12104 /* Create the fields. */
12105 f_stack = build_decl (BUILTINS_LOCATION,
12106 FIELD_DECL, get_identifier ("__stack"),
12107 ptr_type_node);
12108 f_grtop = build_decl (BUILTINS_LOCATION,
12109 FIELD_DECL, get_identifier ("__gr_top"),
12110 ptr_type_node);
12111 f_vrtop = build_decl (BUILTINS_LOCATION,
12112 FIELD_DECL, get_identifier ("__vr_top"),
12113 ptr_type_node);
12114 f_groff = build_decl (BUILTINS_LOCATION,
12115 FIELD_DECL, get_identifier ("__gr_offs"),
12116 integer_type_node);
12117 f_vroff = build_decl (BUILTINS_LOCATION,
12118 FIELD_DECL, get_identifier ("__vr_offs"),
12119 integer_type_node);
12120
12121 /* Tell tree-stdarg pass about our internal offset fields.
12122 	 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
12123 	 purposes, to identify whether the code is updating va_list internal
12124 	 offset fields in an irregular way. */
12125 va_list_gpr_counter_field = f_groff;
12126 va_list_fpr_counter_field = f_vroff;
12127
12128 DECL_ARTIFICIAL (f_stack) = 1;
12129 DECL_ARTIFICIAL (f_grtop) = 1;
12130 DECL_ARTIFICIAL (f_vrtop) = 1;
12131 DECL_ARTIFICIAL (f_groff) = 1;
12132 DECL_ARTIFICIAL (f_vroff) = 1;
12133
12134 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
12135 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
12136 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
12137 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
12138 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
12139
12140 TYPE_FIELDS (va_list_type) = f_stack;
12141 DECL_CHAIN (f_stack) = f_grtop;
12142 DECL_CHAIN (f_grtop) = f_vrtop;
12143 DECL_CHAIN (f_vrtop) = f_groff;
12144 DECL_CHAIN (f_groff) = f_vroff;
12145
12146 /* Compute its layout. */
12147 layout_type (va_list_type);
12148
12149 return va_list_type;
12150 }
12151
12152 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
12153 static void
12154 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
12155 {
12156 const CUMULATIVE_ARGS *cum;
12157 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12158 tree stack, grtop, vrtop, groff, vroff;
12159 tree t;
12160 int gr_save_area_size = cfun->va_list_gpr_size;
12161 int vr_save_area_size = cfun->va_list_fpr_size;
12162 int vr_offset;
12163
12164 cum = &crtl->args.info;
12165 if (cfun->va_list_gpr_size)
12166 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
12167 cfun->va_list_gpr_size);
12168 if (cfun->va_list_fpr_size)
12169 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
12170 * UNITS_PER_VREG, cfun->va_list_fpr_size);
12171
12172 if (!TARGET_FLOAT)
12173 {
12174 gcc_assert (cum->aapcs_nvrn == 0);
12175 vr_save_area_size = 0;
12176 }
12177
12178 f_stack = TYPE_FIELDS (va_list_type_node);
12179 f_grtop = DECL_CHAIN (f_stack);
12180 f_vrtop = DECL_CHAIN (f_grtop);
12181 f_groff = DECL_CHAIN (f_vrtop);
12182 f_vroff = DECL_CHAIN (f_groff);
12183
12184 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
12185 NULL_TREE);
12186 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
12187 NULL_TREE);
12188 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
12189 NULL_TREE);
12190 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
12191 NULL_TREE);
12192 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
12193 NULL_TREE);
12194
12195 /* Emit code to initialize STACK, which points to the next varargs stack
12196 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
12197 by named arguments. STACK is 8-byte aligned. */
12198 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
12199 if (cum->aapcs_stack_size > 0)
12200 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
12201 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
12202 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12203
12204 /* Emit code to initialize GRTOP, the top of the GR save area.
12205 virtual_incoming_args_rtx should have been 16 byte aligned. */
12206 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
12207 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
12208 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12209
12210 /* Emit code to initialize VRTOP, the top of the VR save area.
12211 This address is gr_save_area_bytes below GRTOP, rounded
12212 down to the next 16-byte boundary. */
12213 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
12214 vr_offset = ROUND_UP (gr_save_area_size,
12215 STACK_BOUNDARY / BITS_PER_UNIT);
12216
12217 if (vr_offset)
12218 t = fold_build_pointer_plus_hwi (t, -vr_offset);
12219 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
12220 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12221
12222 /* Emit code to initialize GROFF, the offset from GRTOP of the
12223 next GPR argument. */
12224 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
12225 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
12226 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12227
12228 	  /* Likewise emit code to initialize VROFF, the offset from VRTOP
12229 of the next VR argument. */
12230 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
12231 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
12232 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12233 }
12234
12235 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
12236
12237 static tree
12238 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
12239 gimple_seq *post_p ATTRIBUTE_UNUSED)
12240 {
12241 tree addr;
12242 bool indirect_p;
12243 bool is_ha; /* is HFA or HVA. */
12244 bool dw_align; /* double-word align. */
12245 machine_mode ag_mode = VOIDmode;
12246 int nregs;
12247 machine_mode mode;
12248
12249 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12250 tree stack, f_top, f_off, off, arg, roundup, on_stack;
12251 HOST_WIDE_INT size, rsize, adjust, align;
12252 tree t, u, cond1, cond2;
12253
12254 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
12255 if (indirect_p)
12256 type = build_pointer_type (type);
12257
12258 mode = TYPE_MODE (type);
12259
12260 f_stack = TYPE_FIELDS (va_list_type_node);
12261 f_grtop = DECL_CHAIN (f_stack);
12262 f_vrtop = DECL_CHAIN (f_grtop);
12263 f_groff = DECL_CHAIN (f_vrtop);
12264 f_vroff = DECL_CHAIN (f_groff);
12265
12266 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
12267 f_stack, NULL_TREE);
12268 size = int_size_in_bytes (type);
12269 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
12270
12271 dw_align = false;
12272 adjust = 0;
12273 if (aarch64_vfp_is_call_or_return_candidate (mode,
12274 type,
12275 &ag_mode,
12276 &nregs,
12277 &is_ha))
12278 {
12279 /* No frontends can create types with variable-sized modes, so we
12280 shouldn't be asked to pass or return them. */
12281 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
12282
12283 /* TYPE passed in fp/simd registers. */
12284 if (!TARGET_FLOAT)
12285 aarch64_err_no_fpadvsimd (mode);
12286
12287 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
12288 unshare_expr (valist), f_vrtop, NULL_TREE);
12289 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
12290 unshare_expr (valist), f_vroff, NULL_TREE);
12291
12292 rsize = nregs * UNITS_PER_VREG;
12293
12294 if (is_ha)
12295 {
12296 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
12297 adjust = UNITS_PER_VREG - ag_size;
12298 }
12299 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12300 && size < UNITS_PER_VREG)
12301 {
12302 adjust = UNITS_PER_VREG - size;
12303 }
12304 }
12305 else
12306 {
12307 /* TYPE passed in general registers. */
12308 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
12309 unshare_expr (valist), f_grtop, NULL_TREE);
12310 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
12311 unshare_expr (valist), f_groff, NULL_TREE);
12312 rsize = ROUND_UP (size, UNITS_PER_WORD);
12313 nregs = rsize / UNITS_PER_WORD;
12314
12315 if (align > 8)
12316 dw_align = true;
12317
12318 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12319 && size < UNITS_PER_WORD)
12320 {
12321 adjust = UNITS_PER_WORD - size;
12322 }
12323 }
12324
12325 /* Get a local temporary for the field value. */
12326 off = get_initialized_tmp_var (f_off, pre_p, NULL);
12327
12328 /* Emit code to branch if off >= 0. */
12329 t = build2 (GE_EXPR, boolean_type_node, off,
12330 build_int_cst (TREE_TYPE (off), 0));
12331 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
12332
12333 if (dw_align)
12334 {
12335 /* Emit: offs = (offs + 15) & -16. */
12336 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12337 build_int_cst (TREE_TYPE (off), 15));
12338 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
12339 build_int_cst (TREE_TYPE (off), -16));
12340 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
12341 }
12342 else
12343 roundup = NULL;
12344
12345 /* Update ap.__[g|v]r_offs */
12346 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12347 build_int_cst (TREE_TYPE (off), rsize));
12348 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
12349
12350 /* String up. */
12351 if (roundup)
12352 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12353
12354 /* [cond2] if (ap.__[g|v]r_offs > 0) */
12355 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
12356 build_int_cst (TREE_TYPE (f_off), 0));
12357 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
12358
12359 /* String up: make sure the assignment happens before the use. */
12360 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
12361 COND_EXPR_ELSE (cond1) = t;
12362
12363 /* Prepare the trees handling the argument that is passed on the stack;
12364 	     the top-level node will be stored in ON_STACK. */
12365 arg = get_initialized_tmp_var (stack, pre_p, NULL);
12366 if (align > 8)
12367 {
12368 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
12369 t = fold_build_pointer_plus_hwi (arg, 15);
12370 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12371 build_int_cst (TREE_TYPE (t), -16));
12372 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
12373 }
12374 else
12375 roundup = NULL;
12376 /* Advance ap.__stack */
12377 t = fold_build_pointer_plus_hwi (arg, size + 7);
12378 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12379 build_int_cst (TREE_TYPE (t), -8));
12380 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
12381 /* String up roundup and advance. */
12382 if (roundup)
12383 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12384 /* String up with arg */
12385 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
12386 /* Big-endianness related address adjustment. */
12387 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12388 && size < UNITS_PER_WORD)
12389 {
12390 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
12391 size_int (UNITS_PER_WORD - size));
12392 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
12393 }
12394
12395 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
12396 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
12397
12398 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
12399 t = off;
12400 if (adjust)
12401 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
12402 build_int_cst (TREE_TYPE (off), adjust));
12403
12404 t = fold_convert (sizetype, t);
12405 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
12406
12407 if (is_ha)
12408 {
12409 /* type ha; // treat as "struct {ftype field[n];}"
12410 ... [computing offs]
12411 for (i = 0; i <nregs; ++i, offs += 16)
12412 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12413 return ha; */
12414 int i;
12415 tree tmp_ha, field_t, field_ptr_t;
12416
12417 /* Declare a local variable. */
12418 tmp_ha = create_tmp_var_raw (type, "ha");
12419 gimple_add_tmp_var (tmp_ha);
12420
12421 /* Establish the base type. */
12422 switch (ag_mode)
12423 {
12424 case E_SFmode:
12425 field_t = float_type_node;
12426 field_ptr_t = float_ptr_type_node;
12427 break;
12428 case E_DFmode:
12429 field_t = double_type_node;
12430 field_ptr_t = double_ptr_type_node;
12431 break;
12432 case E_TFmode:
12433 field_t = long_double_type_node;
12434 field_ptr_t = long_double_ptr_type_node;
12435 break;
12436 case E_HFmode:
12437 field_t = aarch64_fp16_type_node;
12438 field_ptr_t = aarch64_fp16_ptr_type_node;
12439 break;
12440 case E_V2SImode:
12441 case E_V4SImode:
12442 {
12443 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
12444 field_t = build_vector_type_for_mode (innertype, ag_mode);
12445 field_ptr_t = build_pointer_type (field_t);
12446 }
12447 break;
12448 default:
12449 gcc_assert (0);
12450 }
12451
12452 	      /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
12453 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
12454 addr = t;
12455 t = fold_convert (field_ptr_t, addr);
12456 t = build2 (MODIFY_EXPR, field_t,
12457 build1 (INDIRECT_REF, field_t, tmp_ha),
12458 build1 (INDIRECT_REF, field_t, t));
12459
12460 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
12461 for (i = 1; i < nregs; ++i)
12462 {
12463 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
12464 u = fold_convert (field_ptr_t, addr);
12465 u = build2 (MODIFY_EXPR, field_t,
12466 build2 (MEM_REF, field_t, tmp_ha,
12467 build_int_cst (field_ptr_t,
12468 (i *
12469 int_size_in_bytes (field_t)))),
12470 build1 (INDIRECT_REF, field_t, u));
12471 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
12472 }
12473
12474 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
12475 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
12476 }
12477
12478 COND_EXPR_ELSE (cond2) = t;
12479 addr = fold_convert (build_pointer_type (type), cond1);
12480 addr = build_va_arg_indirect_ref (addr);
12481
12482 if (indirect_p)
12483 addr = build_va_arg_indirect_ref (addr);
12484
12485 return addr;
12486 }
12487
12488 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
12489
12490 static void
12491 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
12492 tree type, int *pretend_size ATTRIBUTE_UNUSED,
12493 int no_rtl)
12494 {
12495 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
12496 CUMULATIVE_ARGS local_cum;
12497 int gr_saved = cfun->va_list_gpr_size;
12498 int vr_saved = cfun->va_list_fpr_size;
12499
12500 /* The caller has advanced CUM up to, but not beyond, the last named
12501 argument. Advance a local copy of CUM past the last "real" named
12502 argument, to find out how many registers are left over. */
12503 local_cum = *cum;
12504 	  aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
12505
12506 	  /* Find out how many registers we need to save.
12507 	     Honor tree-stdarg analysis results. */
12508 if (cfun->va_list_gpr_size)
12509 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
12510 cfun->va_list_gpr_size / UNITS_PER_WORD);
12511 if (cfun->va_list_fpr_size)
12512 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
12513 cfun->va_list_fpr_size / UNITS_PER_VREG);
12514
12515 if (!TARGET_FLOAT)
12516 {
12517 gcc_assert (local_cum.aapcs_nvrn == 0);
12518 vr_saved = 0;
12519 }
12520
12521 if (!no_rtl)
12522 {
12523 if (gr_saved > 0)
12524 {
12525 rtx ptr, mem;
12526
12527 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
12528 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
12529 - gr_saved * UNITS_PER_WORD);
12530 mem = gen_frame_mem (BLKmode, ptr);
12531 set_mem_alias_set (mem, get_varargs_alias_set ());
12532
12533 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
12534 mem, gr_saved);
12535 }
12536 if (vr_saved > 0)
12537 {
12538 /* We can't use move_block_from_reg, because it will use
12539 the wrong mode, storing D regs only. */
12540 machine_mode mode = TImode;
12541 int off, i, vr_start;
12542
12543 /* Set OFF to the offset from virtual_incoming_args_rtx of
12544 the first vector register. The VR save area lies below
12545 the GR one, and is aligned to 16 bytes. */
12546 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
12547 STACK_BOUNDARY / BITS_PER_UNIT);
12548 off -= vr_saved * UNITS_PER_VREG;
12549
12550 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
12551 for (i = 0; i < vr_saved; ++i)
12552 {
12553 rtx ptr, mem;
12554
12555 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
12556 mem = gen_frame_mem (mode, ptr);
12557 set_mem_alias_set (mem, get_varargs_alias_set ());
12558 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
12559 off += UNITS_PER_VREG;
12560 }
12561 }
12562 }
12563
12564 /* We don't save the size into *PRETEND_SIZE because we want to avoid
12565 any complication of having crtl->args.pretend_args_size changed. */
12566 cfun->machine->frame.saved_varargs_size
12567 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
12568 STACK_BOUNDARY / BITS_PER_UNIT)
12569 + vr_saved * UNITS_PER_VREG);
12570 }
12571
12572 static void
12573 aarch64_conditional_register_usage (void)
12574 {
12575 int i;
12576 if (!TARGET_FLOAT)
12577 {
12578 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
12579 {
12580 fixed_regs[i] = 1;
12581 call_used_regs[i] = 1;
12582 }
12583 }
12584 if (!TARGET_SVE)
12585 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
12586 {
12587 fixed_regs[i] = 1;
12588 call_used_regs[i] = 1;
12589 }
12590 }
12591
12592 /* Walk down the type tree of TYPE counting consecutive base elements.
12593 If *MODEP is VOIDmode, then set it to the first valid floating point
12594 type. If a non-floating point type is found, or if a floating point
12595 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12596 otherwise return the count in the sub-tree. */
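/* As an illustration (types invented for the example), for
     struct hfa { double x; double y; double z; };
   the walk finds three consecutive DFmode elements, sets *MODEP to DFmode
   and returns 3, whereas struct { double x; float y; } mixes base modes
   and yields -1.  */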
12597 static int
12598 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
12599 {
12600 machine_mode mode;
12601 HOST_WIDE_INT size;
12602
12603 switch (TREE_CODE (type))
12604 {
12605 case REAL_TYPE:
12606 mode = TYPE_MODE (type);
12607 if (mode != DFmode && mode != SFmode
12608 && mode != TFmode && mode != HFmode)
12609 return -1;
12610
12611 if (*modep == VOIDmode)
12612 *modep = mode;
12613
12614 if (*modep == mode)
12615 return 1;
12616
12617 break;
12618
12619 case COMPLEX_TYPE:
12620 mode = TYPE_MODE (TREE_TYPE (type));
12621 if (mode != DFmode && mode != SFmode
12622 && mode != TFmode && mode != HFmode)
12623 return -1;
12624
12625 if (*modep == VOIDmode)
12626 *modep = mode;
12627
12628 if (*modep == mode)
12629 return 2;
12630
12631 break;
12632
12633 case VECTOR_TYPE:
12634 /* Use V2SImode and V4SImode as representatives of all 64-bit
12635 and 128-bit vector types. */
12636 size = int_size_in_bytes (type);
12637 switch (size)
12638 {
12639 case 8:
12640 mode = V2SImode;
12641 break;
12642 case 16:
12643 mode = V4SImode;
12644 break;
12645 default:
12646 return -1;
12647 }
12648
12649 if (*modep == VOIDmode)
12650 *modep = mode;
12651
12652 /* Vector modes are considered to be opaque: two vectors are
12653 equivalent for the purposes of being homogeneous aggregates
12654 if they are the same size. */
12655 if (*modep == mode)
12656 return 1;
12657
12658 break;
12659
12660 case ARRAY_TYPE:
12661 {
12662 int count;
12663 tree index = TYPE_DOMAIN (type);
12664
12665 /* Can't handle incomplete types nor sizes that are not
12666 fixed. */
12667 if (!COMPLETE_TYPE_P (type)
12668 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12669 return -1;
12670
12671 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
12672 if (count == -1
12673 || !index
12674 || !TYPE_MAX_VALUE (index)
12675 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
12676 || !TYPE_MIN_VALUE (index)
12677 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
12678 || count < 0)
12679 return -1;
12680
12681 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
12682 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
12683
12684 /* There must be no padding. */
12685 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12686 count * GET_MODE_BITSIZE (*modep)))
12687 return -1;
12688
12689 return count;
12690 }
12691
12692 case RECORD_TYPE:
12693 {
12694 int count = 0;
12695 int sub_count;
12696 tree field;
12697
12698 /* Can't handle incomplete types nor sizes that are not
12699 fixed. */
12700 if (!COMPLETE_TYPE_P (type)
12701 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12702 return -1;
12703
12704 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12705 {
12706 if (TREE_CODE (field) != FIELD_DECL)
12707 continue;
12708
12709 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12710 if (sub_count < 0)
12711 return -1;
12712 count += sub_count;
12713 }
12714
12715 /* There must be no padding. */
12716 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12717 count * GET_MODE_BITSIZE (*modep)))
12718 return -1;
12719
12720 return count;
12721 }
12722
12723 case UNION_TYPE:
12724 case QUAL_UNION_TYPE:
12725 {
12726 /* These aren't very interesting except in a degenerate case. */
12727 int count = 0;
12728 int sub_count;
12729 tree field;
12730
12731 /* Can't handle incomplete types nor sizes that are not
12732 fixed. */
12733 if (!COMPLETE_TYPE_P (type)
12734 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12735 return -1;
12736
12737 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12738 {
12739 if (TREE_CODE (field) != FIELD_DECL)
12740 continue;
12741
12742 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12743 if (sub_count < 0)
12744 return -1;
12745 count = count > sub_count ? count : sub_count;
12746 }
12747
12748 /* There must be no padding. */
12749 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12750 count * GET_MODE_BITSIZE (*modep)))
12751 return -1;
12752
12753 return count;
12754 }
12755
12756 default:
12757 break;
12758 }
12759
12760 return -1;
12761 }
12762
12763 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12764 type as described in AAPCS64 \S 4.1.2.
12765
12766 See the comment above aarch64_composite_type_p for the notes on MODE. */
12767
12768 static bool
12769 aarch64_short_vector_p (const_tree type,
12770 machine_mode mode)
12771 {
12772 poly_int64 size = -1;
12773
12774 if (type && TREE_CODE (type) == VECTOR_TYPE)
12775 size = int_size_in_bytes (type);
12776 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
12777 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
12778 size = GET_MODE_SIZE (mode);
12779
12780 return known_eq (size, 8) || known_eq (size, 16);
12781 }
12782
12783 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12784 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
12785 array types. The C99 floating-point complex types are also considered
12786 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
12787 types, which are GCC extensions and out of the scope of AAPCS64, are
12788 treated as composite types here as well.
12789
12790 Note that MODE itself is not sufficient in determining whether a type
12791 is such a composite type or not. This is because
12792 stor-layout.c:compute_record_mode may have already changed the MODE
12793 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
12794 structure with only one field may have its MODE set to the mode of the
12795 field. Also an integer mode whose size matches the size of the
12796 RECORD_TYPE type may be used to substitute the original mode
12797 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
12798 solely relied on. */
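/* Illustrative example (assumption, not from the original source):
     struct wrapper { float f; };
   may end up with TYPE_MODE == SFmode after compute_record_mode, yet it
   is still a composite type under AAPCS64, which is why the function
   below checks TYPE before falling back on MODE.  */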
12799
12800 static bool
12801 aarch64_composite_type_p (const_tree type,
12802 machine_mode mode)
12803 {
12804 if (aarch64_short_vector_p (type, mode))
12805 return false;
12806
12807 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
12808 return true;
12809
12810 if (mode == BLKmode
12811 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
12812 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
12813 return true;
12814
12815 return false;
12816 }
12817
12818 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12819 shall be passed or returned in simd/fp register(s) (providing these
12820 parameter passing registers are available).
12821
12822 Upon successful return, *COUNT returns the number of needed registers,
12823 *BASE_MODE returns the mode of the individual register and when IS_HA
12824 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
12825 floating-point aggregate or a homogeneous short-vector aggregate. */
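/* Illustrative examples (assumed, not from the original source):
     double                   -> *count == 1, *base_mode == DFmode
     _Complex float           -> *count == 2, *base_mode == SFmode, HA
     struct { float v[4]; }   -> *count == 4, *base_mode == SFmode, HA
   A struct of five floats exceeds HA_MAX_NUM_FLDS and is rejected.  */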
12826
12827 static bool
12828 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
12829 const_tree type,
12830 machine_mode *base_mode,
12831 int *count,
12832 bool *is_ha)
12833 {
12834 machine_mode new_mode = VOIDmode;
12835 bool composite_p = aarch64_composite_type_p (type, mode);
12836
12837 if (is_ha != NULL) *is_ha = false;
12838
12839 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
12840 || aarch64_short_vector_p (type, mode))
12841 {
12842 *count = 1;
12843 new_mode = mode;
12844 }
12845 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
12846 {
12847 if (is_ha != NULL) *is_ha = true;
12848 *count = 2;
12849 new_mode = GET_MODE_INNER (mode);
12850 }
12851 else if (type && composite_p)
12852 {
12853 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
12854
12855 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
12856 {
12857 if (is_ha != NULL) *is_ha = true;
12858 *count = ag_count;
12859 }
12860 else
12861 return false;
12862 }
12863 else
12864 return false;
12865
12866 *base_mode = new_mode;
12867 return true;
12868 }
12869
12870 /* Implement TARGET_STRUCT_VALUE_RTX. */
12871
12872 static rtx
12873 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
12874 int incoming ATTRIBUTE_UNUSED)
12875 {
12876 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
12877 }
12878
12879 /* Implement target hook TARGET_VECTOR_MODE_SUPPORTED_P. */
12880 static bool
12881 aarch64_vector_mode_supported_p (machine_mode mode)
12882 {
12883 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12884 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
12885 }
12886
12887 /* Return appropriate SIMD container
12888 for MODE within a vector of WIDTH bits. */
12889 static machine_mode
12890 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
12891 {
12892 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
12893 switch (mode)
12894 {
12895 case E_DFmode:
12896 return VNx2DFmode;
12897 case E_SFmode:
12898 return VNx4SFmode;
12899 case E_HFmode:
12900 return VNx8HFmode;
12901 case E_DImode:
12902 return VNx2DImode;
12903 case E_SImode:
12904 return VNx4SImode;
12905 case E_HImode:
12906 return VNx8HImode;
12907 case E_QImode:
12908 return VNx16QImode;
12909 default:
12910 return word_mode;
12911 }
12912
12913 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
12914 if (TARGET_SIMD)
12915 {
12916 if (known_eq (width, 128))
12917 switch (mode)
12918 {
12919 case E_DFmode:
12920 return V2DFmode;
12921 case E_SFmode:
12922 return V4SFmode;
12923 case E_HFmode:
12924 return V8HFmode;
12925 case E_SImode:
12926 return V4SImode;
12927 case E_HImode:
12928 return V8HImode;
12929 case E_QImode:
12930 return V16QImode;
12931 case E_DImode:
12932 return V2DImode;
12933 default:
12934 break;
12935 }
12936 else
12937 switch (mode)
12938 {
12939 case E_SFmode:
12940 return V2SFmode;
12941 case E_HFmode:
12942 return V4HFmode;
12943 case E_SImode:
12944 return V2SImode;
12945 case E_HImode:
12946 return V4HImode;
12947 case E_QImode:
12948 return V8QImode;
12949 default:
12950 break;
12951 }
12952 }
12953 return word_mode;
12954 }
12955
12956 /* Return 128-bit container as the preferred SIMD mode for MODE. */
12957 static machine_mode
12958 aarch64_preferred_simd_mode (scalar_mode mode)
12959 {
12960 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
12961 return aarch64_simd_container_mode (mode, bits);
12962 }
12963
12964 /* Return a list of possible vector sizes for the vectorizer
12965 to iterate over. */
12966 static void
12967 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
12968 {
12969 if (TARGET_SVE)
12970 sizes->safe_push (BYTES_PER_SVE_VECTOR);
12971 sizes->safe_push (16);
12972 sizes->safe_push (8);
12973 }
12974
12975 /* Implement TARGET_MANGLE_TYPE. */
12976
12977 static const char *
12978 aarch64_mangle_type (const_tree type)
12979 {
12980 /* The AArch64 ABI documents say that "__va_list" has to be
12981 mangled as if it is in the "std" namespace. */
12982 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
12983 return "St9__va_list";
12984
12985 /* Half-precision float. */
12986 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
12987 return "Dh";
12988
12989 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
12990 builtin types. */
12991 if (TYPE_NAME (type) != NULL)
12992 return aarch64_mangle_builtin_type (type);
12993
12994 /* Use the default mangling. */
12995 return NULL;
12996 }
12997
12998 /* Find the first rtx_insn before insn that will generate an assembly
12999 instruction. */
13000
13001 static rtx_insn *
13002 aarch64_prev_real_insn (rtx_insn *insn)
13003 {
13004 if (!insn)
13005 return NULL;
13006
13007 do
13008 {
13009 insn = prev_real_insn (insn);
13010 }
13011 while (insn && recog_memoized (insn) < 0);
13012
13013 return insn;
13014 }
13015
13016 static bool
13017 is_madd_op (enum attr_type t1)
13018 {
13019 unsigned int i;
13020 /* A number of these may be AArch32 only. */
13021 enum attr_type mlatypes[] = {
13022 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
13023 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
13024 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
13025 };
13026
13027 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
13028 {
13029 if (t1 == mlatypes[i])
13030 return true;
13031 }
13032
13033 return false;
13034 }
13035
13036 /* Check if there is a register dependency between a load and the insn
13037 for which we hold recog_data. */
13038
13039 static bool
13040 dep_between_memop_and_curr (rtx memop)
13041 {
13042 rtx load_reg;
13043 int opno;
13044
13045 gcc_assert (GET_CODE (memop) == SET);
13046
13047 if (!REG_P (SET_DEST (memop)))
13048 return false;
13049
13050 load_reg = SET_DEST (memop);
13051 for (opno = 1; opno < recog_data.n_operands; opno++)
13052 {
13053 rtx operand = recog_data.operand[opno];
13054 if (REG_P (operand)
13055 && reg_overlap_mentioned_p (load_reg, operand))
13056 return true;
13057
13058 }
13059 return false;
13060 }
13061
13062
13063 /* When working around the Cortex-A53 erratum 835769,
13064 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
13065 instruction and has a preceding memory instruction such that a NOP
13066 should be inserted between them. */
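/* Illustrative (assumed) assembly for the effect of the workaround; see
   aarch64_final_prescan_insn below:

     ldr   x1, [x2]            // preceding 64-bit memory operation
     nop                       // between mem op and mult-accumulate
     madd  x0, x3, x4, x5      // 64-bit multiply-accumulate

   The NOP keeps the two instructions from being issued back to back on
   affected Cortex-A53 parts.  */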
13067
13068 bool
13069 aarch64_madd_needs_nop (rtx_insn* insn)
13070 {
13071 enum attr_type attr_type;
13072 rtx_insn *prev;
13073 rtx body;
13074
13075 if (!TARGET_FIX_ERR_A53_835769)
13076 return false;
13077
13078 if (!INSN_P (insn) || recog_memoized (insn) < 0)
13079 return false;
13080
13081 attr_type = get_attr_type (insn);
13082 if (!is_madd_op (attr_type))
13083 return false;
13084
13085 prev = aarch64_prev_real_insn (insn);
13086 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
13087 Restore recog state to INSN to avoid state corruption. */
13088 extract_constrain_insn_cached (insn);
13089
13090 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
13091 return false;
13092
13093 body = single_set (prev);
13094
13095 /* If the previous insn is a memory op and there is no dependency between
13096 it and the DImode madd, emit a NOP between them. If body is NULL then we
13097 have a complex memory operation, probably a load/store pair.
13098 Be conservative for now and emit a NOP. */
13099 if (GET_MODE (recog_data.operand[0]) == DImode
13100 && (!body || !dep_between_memop_and_curr (body)))
13101 return true;
13102
13103 return false;
13104
13105 }
13106
13107
13108 /* Implement FINAL_PRESCAN_INSN. */
13109
13110 void
13111 aarch64_final_prescan_insn (rtx_insn *insn)
13112 {
13113 if (aarch64_madd_needs_nop (insn))
13114 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
13115 }
13116
13117
13118 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
13119 instruction. */
13120
13121 bool
13122 aarch64_sve_index_immediate_p (rtx base_or_step)
13123 {
13124 return (CONST_INT_P (base_or_step)
13125 && IN_RANGE (INTVAL (base_or_step), -16, 15));
13126 }
13127
13128 /* Return true if X is a valid immediate for the SVE ADD and SUB
13129 instructions. Negate X first if NEGATE_P is true. */
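/* For illustration (assumption): a vector duplicating 0x23 or 0x2300 is
   accepted, since the immediate is an 8-bit value optionally shifted
   left by 8, but a duplicated 0x123 is rejected because it needs
   nonzero bits in both bytes.  */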
13130
13131 bool
13132 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
13133 {
13134 rtx elt;
13135
13136 if (!const_vec_duplicate_p (x, &elt)
13137 || !CONST_INT_P (elt))
13138 return false;
13139
13140 HOST_WIDE_INT val = INTVAL (elt);
13141 if (negate_p)
13142 val = -val;
13143 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
13144
13145 if (val & 0xff)
13146 return IN_RANGE (val, 0, 0xff);
13147 return IN_RANGE (val, 0, 0xff00);
13148 }
13149
13150 /* Return true if X is a valid immediate operand for an SVE logical
13151 instruction such as AND. */
13152
13153 bool
13154 aarch64_sve_bitmask_immediate_p (rtx x)
13155 {
13156 rtx elt;
13157
13158 return (const_vec_duplicate_p (x, &elt)
13159 && CONST_INT_P (elt)
13160 && aarch64_bitmask_imm (INTVAL (elt),
13161 GET_MODE_INNER (GET_MODE (x))));
13162 }
13163
13164 /* Return true if X is a valid immediate for the SVE DUP and CPY
13165 instructions. */
13166
13167 bool
13168 aarch64_sve_dup_immediate_p (rtx x)
13169 {
13170 rtx elt;
13171
13172 if (!const_vec_duplicate_p (x, &elt)
13173 || !CONST_INT_P (elt))
13174 return false;
13175
13176 HOST_WIDE_INT val = INTVAL (elt);
13177 if (val & 0xff)
13178 return IN_RANGE (val, -0x80, 0x7f);
13179 return IN_RANGE (val, -0x8000, 0x7f00);
13180 }
13181
13182 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
13183 SIGNED_P says whether the operand is signed rather than unsigned. */
13184
13185 bool
13186 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
13187 {
13188 rtx elt;
13189
13190 return (const_vec_duplicate_p (x, &elt)
13191 && CONST_INT_P (elt)
13192 && (signed_p
13193 ? IN_RANGE (INTVAL (elt), -16, 15)
13194 : IN_RANGE (INTVAL (elt), 0, 127)));
13195 }
13196
13197 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
13198 instruction. Negate X first if NEGATE_P is true. */
13199
13200 bool
13201 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
13202 {
13203 rtx elt;
13204 REAL_VALUE_TYPE r;
13205
13206 if (!const_vec_duplicate_p (x, &elt)
13207 || GET_CODE (elt) != CONST_DOUBLE)
13208 return false;
13209
13210 r = *CONST_DOUBLE_REAL_VALUE (elt);
13211
13212 if (negate_p)
13213 r = real_value_negate (&r);
13214
13215 if (real_equal (&r, &dconst1))
13216 return true;
13217 if (real_equal (&r, &dconsthalf))
13218 return true;
13219 return false;
13220 }
13221
13222 /* Return true if X is a valid immediate operand for an SVE FMUL
13223 instruction. */
13224
13225 bool
13226 aarch64_sve_float_mul_immediate_p (rtx x)
13227 {
13228 rtx elt;
13229
13230 /* GCC will never generate a multiply with an immediate of 2, so there is no
13231 point testing for it (even though it is a valid constant). */
13232 return (const_vec_duplicate_p (x, &elt)
13233 && GET_CODE (elt) == CONST_DOUBLE
13234 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
13235 }
13236
13237 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13238 for the Advanced SIMD operation described by WHICH and INSN. If INFO
13239 is nonnull, use it to describe valid immediates. */
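/* Illustrative examples (assumed): 0x0000ab00 is a 4-byte immediate with
   LSL #8; 0xab00ab00 replicates a 2-byte immediate (0xab with LSL #8);
   0x0012ffff needs MSL #16, shifting in ones, and so is only accepted
   for AARCH64_CHECK_MOV.  */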
13240 static bool
13241 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
13242 simd_immediate_info *info,
13243 enum simd_immediate_check which,
13244 simd_immediate_info::insn_type insn)
13245 {
13246 /* Try a 4-byte immediate with LSL. */
13247 for (unsigned int shift = 0; shift < 32; shift += 8)
13248 if ((val32 & (0xff << shift)) == val32)
13249 {
13250 if (info)
13251 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13252 simd_immediate_info::LSL, shift);
13253 return true;
13254 }
13255
13256 /* Try a 2-byte immediate with LSL. */
13257 unsigned int imm16 = val32 & 0xffff;
13258 if (imm16 == (val32 >> 16))
13259 for (unsigned int shift = 0; shift < 16; shift += 8)
13260 if ((imm16 & (0xff << shift)) == imm16)
13261 {
13262 if (info)
13263 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
13264 simd_immediate_info::LSL, shift);
13265 return true;
13266 }
13267
13268 /* Try a 4-byte immediate with MSL, except for cases that MVN
13269 can handle. */
13270 if (which == AARCH64_CHECK_MOV)
13271 for (unsigned int shift = 8; shift < 24; shift += 8)
13272 {
13273 unsigned int low = (1 << shift) - 1;
13274 if (((val32 & (0xff << shift)) | low) == val32)
13275 {
13276 if (info)
13277 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13278 simd_immediate_info::MSL, shift);
13279 return true;
13280 }
13281 }
13282
13283 return false;
13284 }
13285
13286 /* Return true if replicating VAL64 is a valid immediate for the
13287 Advanced SIMD operation described by WHICH. If INFO is nonnull,
13288 use it to describe valid immediates. */
13289 static bool
13290 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
13291 simd_immediate_info *info,
13292 enum simd_immediate_check which)
13293 {
13294 unsigned int val32 = val64 & 0xffffffff;
13295 unsigned int val16 = val64 & 0xffff;
13296 unsigned int val8 = val64 & 0xff;
13297
13298 if (val32 == (val64 >> 32))
13299 {
13300 if ((which & AARCH64_CHECK_ORR) != 0
13301 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
13302 simd_immediate_info::MOV))
13303 return true;
13304
13305 if ((which & AARCH64_CHECK_BIC) != 0
13306 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
13307 simd_immediate_info::MVN))
13308 return true;
13309
13310 /* Try using a replicated byte. */
13311 if (which == AARCH64_CHECK_MOV
13312 && val16 == (val32 >> 16)
13313 && val8 == (val16 >> 8))
13314 {
13315 if (info)
13316 *info = simd_immediate_info (QImode, val8);
13317 return true;
13318 }
13319 }
13320
13321 /* Try using a bit-to-bytemask. */
13322 if (which == AARCH64_CHECK_MOV)
13323 {
13324 unsigned int i;
13325 for (i = 0; i < 64; i += 8)
13326 {
13327 unsigned char byte = (val64 >> i) & 0xff;
13328 if (byte != 0 && byte != 0xff)
13329 break;
13330 }
13331 if (i == 64)
13332 {
13333 if (info)
13334 *info = simd_immediate_info (DImode, val64);
13335 return true;
13336 }
13337 }
13338 return false;
13339 }
13340
13341 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13342 instruction. If INFO is nonnull, use it to describe valid immediates. */
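/* Illustrative examples (assumed): 0x2929292929292929 collapses to a
   QImode DUP of 0x29; 0x1200120012001200 collapses to HImode and is a
   DUP of 0x12 with LSL #8; 0x00ff00ff00ff00ff is neither, but is a
   valid HImode bitmask immediate and so matches DUPM.  */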
13343
13344 static bool
13345 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
13346 simd_immediate_info *info)
13347 {
13348 scalar_int_mode mode = DImode;
13349 unsigned int val32 = val64 & 0xffffffff;
13350 if (val32 == (val64 >> 32))
13351 {
13352 mode = SImode;
13353 unsigned int val16 = val32 & 0xffff;
13354 if (val16 == (val32 >> 16))
13355 {
13356 mode = HImode;
13357 unsigned int val8 = val16 & 0xff;
13358 if (val8 == (val16 >> 8))
13359 mode = QImode;
13360 }
13361 }
13362 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
13363 if (IN_RANGE (val, -0x80, 0x7f))
13364 {
13365 /* DUP with no shift. */
13366 if (info)
13367 *info = simd_immediate_info (mode, val);
13368 return true;
13369 }
13370 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
13371 {
13372 /* DUP with LSL #8. */
13373 if (info)
13374 *info = simd_immediate_info (mode, val);
13375 return true;
13376 }
13377 if (aarch64_bitmask_imm (val64, mode))
13378 {
13379 /* DUPM. */
13380 if (info)
13381 *info = simd_immediate_info (mode, val);
13382 return true;
13383 }
13384 return false;
13385 }
13386
13387 /* Return true if OP is a valid SIMD immediate for the operation
13388 described by WHICH. If INFO is nonnull, use it to describe valid
13389 immediates. */
13390 bool
13391 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
13392 enum simd_immediate_check which)
13393 {
13394 machine_mode mode = GET_MODE (op);
13395 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13396 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13397 return false;
13398
13399 scalar_mode elt_mode = GET_MODE_INNER (mode);
13400 rtx base, step;
13401 unsigned int n_elts;
13402 if (GET_CODE (op) == CONST_VECTOR
13403 && CONST_VECTOR_DUPLICATE_P (op))
13404 n_elts = CONST_VECTOR_NPATTERNS (op);
13405 else if ((vec_flags & VEC_SVE_DATA)
13406 && const_vec_series_p (op, &base, &step))
13407 {
13408 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
13409 if (!aarch64_sve_index_immediate_p (base)
13410 || !aarch64_sve_index_immediate_p (step))
13411 return false;
13412
13413 if (info)
13414 *info = simd_immediate_info (elt_mode, base, step);
13415 return true;
13416 }
13417 else if (GET_CODE (op) == CONST_VECTOR
13418 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
13419 /* N_ELTS set above. */;
13420 else
13421 return false;
13422
13423 /* Handle PFALSE and PTRUE. */
13424 if (vec_flags & VEC_SVE_PRED)
13425 return (op == CONST0_RTX (mode)
13426 || op == CONSTM1_RTX (mode));
13427
13428 scalar_float_mode elt_float_mode;
13429 if (n_elts == 1
13430 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
13431 {
13432 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
13433 if (aarch64_float_const_zero_rtx_p (elt)
13434 || aarch64_float_const_representable_p (elt))
13435 {
13436 if (info)
13437 *info = simd_immediate_info (elt_float_mode, elt);
13438 return true;
13439 }
13440 }
13441
13442 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
13443 if (elt_size > 8)
13444 return false;
13445
13446 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
13447
13448 /* Expand the vector constant out into a byte vector, with the least
13449 significant byte of the register first. */
13450 auto_vec<unsigned char, 16> bytes;
13451 bytes.reserve (n_elts * elt_size);
13452 for (unsigned int i = 0; i < n_elts; i++)
13453 {
13454 /* The vector is provided in gcc endian-neutral fashion.
13455 For aarch64_be Advanced SIMD, it must be laid out in the vector
13456 register in reverse order. */
13457 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
13458 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
13459
13460 if (elt_mode != elt_int_mode)
13461 elt = gen_lowpart (elt_int_mode, elt);
13462
13463 if (!CONST_INT_P (elt))
13464 return false;
13465
13466 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
13467 for (unsigned int byte = 0; byte < elt_size; byte++)
13468 {
13469 bytes.quick_push (elt_val & 0xff);
13470 elt_val >>= BITS_PER_UNIT;
13471 }
13472 }
13473
13474 /* The immediate must repeat every eight bytes. */
13475 unsigned int nbytes = bytes.length ();
13476 for (unsigned i = 8; i < nbytes; ++i)
13477 if (bytes[i] != bytes[i - 8])
13478 return false;
13479
13480 /* Get the repeating 8-byte value as an integer. No endian correction
13481 is needed here because bytes is already in lsb-first order. */
13482 unsigned HOST_WIDE_INT val64 = 0;
13483 for (unsigned int i = 0; i < 8; i++)
13484 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
13485 << (i * BITS_PER_UNIT));
13486
13487 if (vec_flags & VEC_SVE_DATA)
13488 return aarch64_sve_valid_immediate (val64, info);
13489 else
13490 return aarch64_advsimd_valid_immediate (val64, info, which);
13491 }
13492
13493 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13494 has a step in the range of an SVE INDEX immediate. Return the index expression if so,
13495 otherwise return null. */
13496 rtx
13497 aarch64_check_zero_based_sve_index_immediate (rtx x)
13498 {
13499 rtx base, step;
13500 if (const_vec_series_p (x, &base, &step)
13501 && base == const0_rtx
13502 && aarch64_sve_index_immediate_p (step))
13503 return step;
13504 return NULL_RTX;
13505 }
13506
13507 /* Check if immediate shift constants are within range. */
13508 bool
13509 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
13510 {
13511 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
13512 if (left)
13513 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
13514 else
13515 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
13516 }
13517
13518 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13519 operation of width WIDTH at bit position POS. */
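/* For example (illustrative), WIDTH == 8 and POS == 16 give the mask
   0xff0000.  */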
13520
13521 rtx
13522 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
13523 {
13524 gcc_assert (CONST_INT_P (width));
13525 gcc_assert (CONST_INT_P (pos));
13526
13527 unsigned HOST_WIDE_INT mask
13528 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
13529 return GEN_INT (mask << UINTVAL (pos));
13530 }
13531
13532 bool
13533 aarch64_mov_operand_p (rtx x, machine_mode mode)
13534 {
13535 if (GET_CODE (x) == HIGH
13536 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
13537 return true;
13538
13539 if (CONST_INT_P (x))
13540 return true;
13541
13542 if (VECTOR_MODE_P (GET_MODE (x)))
13543 return aarch64_simd_valid_immediate (x, NULL);
13544
13545 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
13546 return true;
13547
13548 if (aarch64_sve_cnt_immediate_p (x))
13549 return true;
13550
13551 return aarch64_classify_symbolic_expression (x)
13552 == SYMBOL_TINY_ABSOLUTE;
13553 }
13554
13555 /* Return a const_int vector of VAL. */
13556 rtx
13557 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
13558 {
13559 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
13560 return gen_const_vec_duplicate (mode, c);
13561 }
13562
13563 /* Check OP is a legal scalar immediate for the MOVI instruction. */
13564
13565 bool
13566 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
13567 {
13568 machine_mode vmode;
13569
13570 vmode = aarch64_simd_container_mode (mode, 64);
13571 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
13572 return aarch64_simd_valid_immediate (op_v, NULL);
13573 }
13574
13575 /* Construct and return a PARALLEL RTX vector with elements numbering the
13576 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13577 the vector - from the perspective of the architecture. This does not
13578 line up with GCC's perspective on lane numbers, so we end up with
13579 different masks depending on our target endian-ness. The diagram
13580 below may help. We must draw the distinction when building masks
13581 which select one half of the vector. An instruction selecting
13582 architectural low-lanes for a big-endian target must be described using
13583 a mask selecting GCC high-lanes.
13584
13585 Big-Endian Little-Endian
13586
13587 GCC 0 1 2 3 3 2 1 0
13588 | x | x | x | x | | x | x | x | x |
13589 Architecture 3 2 1 0 3 2 1 0
13590
13591 Low Mask: { 2, 3 } { 0, 1 }
13592 High Mask: { 0, 1 } { 2, 3 }
13593
13594 MODE is the mode of the vector and NUNITS is the number of units in it. */
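/* For illustration (assumed): for V4SImode (NUNITS == 4) on a
   little-endian target, HIGH == true gives (parallel [2 3]) and
   HIGH == false gives (parallel [0 1]); on big-endian the two masks are
   swapped, as in the table above.  */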
13595
13596 rtx
13597 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
13598 {
13599 rtvec v = rtvec_alloc (nunits / 2);
13600 int high_base = nunits / 2;
13601 int low_base = 0;
13602 int base;
13603 rtx t1;
13604 int i;
13605
13606 if (BYTES_BIG_ENDIAN)
13607 base = high ? low_base : high_base;
13608 else
13609 base = high ? high_base : low_base;
13610
13611 for (i = 0; i < nunits / 2; i++)
13612 RTVEC_ELT (v, i) = GEN_INT (base + i);
13613
13614 t1 = gen_rtx_PARALLEL (mode, v);
13615 return t1;
13616 }
13617
13618 /* Check OP for validity as a PARALLEL RTX vector with elements
13619 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13620 from the perspective of the architecture. See the diagram above
13621 aarch64_simd_vect_par_cnst_half for more details. */
13622
13623 bool
13624 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
13625 bool high)
13626 {
13627 int nelts;
13628 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
13629 return false;
13630
13631 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
13632 HOST_WIDE_INT count_op = XVECLEN (op, 0);
13633 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
13634 int i = 0;
13635
13636 if (count_op != count_ideal)
13637 return false;
13638
13639 for (i = 0; i < count_ideal; i++)
13640 {
13641 rtx elt_op = XVECEXP (op, 0, i);
13642 rtx elt_ideal = XVECEXP (ideal, 0, i);
13643
13644 if (!CONST_INT_P (elt_op)
13645 || INTVAL (elt_ideal) != INTVAL (elt_op))
13646 return false;
13647 }
13648 return true;
13649 }
13650
13651 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
13652 HIGH (exclusive). */
13653 void
13654 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
13655 const_tree exp)
13656 {
13657 HOST_WIDE_INT lane;
13658 gcc_assert (CONST_INT_P (operand));
13659 lane = INTVAL (operand);
13660
13661 if (lane < low || lane >= high)
13662 {
13663 if (exp)
13664 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
13665 else
13666 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
13667 }
13668 }
13669
13670 /* Perform endian correction on lane number N, which indexes a vector
13671 of mode MODE, and return the result as an SImode rtx. */
13672
13673 rtx
13674 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
13675 {
13676 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
13677 }
13678
13679 /* Return TRUE if OP is a valid vector addressing mode. */
13680
13681 bool
13682 aarch64_simd_mem_operand_p (rtx op)
13683 {
13684 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
13685 || REG_P (XEXP (op, 0)));
13686 }
13687
13688 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
13689
13690 bool
13691 aarch64_sve_ld1r_operand_p (rtx op)
13692 {
13693 struct aarch64_address_info addr;
13694 scalar_mode mode;
13695
13696 return (MEM_P (op)
13697 && is_a <scalar_mode> (GET_MODE (op), &mode)
13698 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
13699 && addr.type == ADDRESS_REG_IMM
13700 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
13701 }
13702
13703 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13704 The conditions for STR are the same. */
13705 bool
13706 aarch64_sve_ldr_operand_p (rtx op)
13707 {
13708 struct aarch64_address_info addr;
13709
13710 return (MEM_P (op)
13711 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
13712 false, ADDR_QUERY_ANY)
13713 && addr.type == ADDRESS_REG_IMM);
13714 }
13715
13716 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13717 We need to be able to access the individual pieces, so the range
13718 is different from LD[234] and ST[234]. */
13719 bool
13720 aarch64_sve_struct_memory_operand_p (rtx op)
13721 {
13722 if (!MEM_P (op))
13723 return false;
13724
13725 machine_mode mode = GET_MODE (op);
13726 struct aarch64_address_info addr;
13727 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
13728 ADDR_QUERY_ANY)
13729 || addr.type != ADDRESS_REG_IMM)
13730 return false;
13731
13732 poly_int64 first = addr.const_offset;
13733 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
13734 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
13735 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
13736 }
13737
13738 /* Emit a register copy from operand to operand, taking care not to
13739 early-clobber source registers in the process.
13740
13741 COUNT is the number of components into which the copy needs to be
13742 decomposed. */
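/* For example (illustrative), copying an OImode pair from V1-V2 to
   V2-V3 must move V2 into V3 before V1 into V2, which is why the loop
   below runs in reverse when the destination overlaps the source and
   has the higher register number.  */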
13743 void
13744 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
13745 unsigned int count)
13746 {
13747 unsigned int i;
13748 int rdest = REGNO (operands[0]);
13749 int rsrc = REGNO (operands[1]);
13750
13751 if (!reg_overlap_mentioned_p (operands[0], operands[1])
13752 || rdest < rsrc)
13753 for (i = 0; i < count; i++)
13754 emit_move_insn (gen_rtx_REG (mode, rdest + i),
13755 gen_rtx_REG (mode, rsrc + i));
13756 else
13757 for (i = 0; i < count; i++)
13758 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
13759 gen_rtx_REG (mode, rsrc + count - i - 1));
13760 }
13761
13762 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13763 one of the VSTRUCT modes: OI, CI, or XI. */
13764 int
13765 aarch64_simd_attr_length_rglist (machine_mode mode)
13766 {
13767 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
13768 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
13769 }
13770
13771 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
13772 alignment of a vector to 128 bits. SVE predicates have an alignment of
13773 16 bits. */
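/* For illustration (assumed): a fixed-length V8QI vector gets 64-bit
   alignment, a 256-bit GNU vector is capped at 128 bits, a
   variable-length SVE data vector gets 128 bits and an SVE predicate
   gets 16 bits.  */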
13774 static HOST_WIDE_INT
13775 aarch64_simd_vector_alignment (const_tree type)
13776 {
13777 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13778 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13779 be set for non-predicate vectors of booleans. Modes are the most
13780 direct way we have of identifying real SVE predicate types. */
13781 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
13782 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
13783 return MIN (align, 128);
13784 }
13785
13786 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
13787 static HOST_WIDE_INT
13788 aarch64_vectorize_preferred_vector_alignment (const_tree type)
13789 {
13790 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
13791 {
13792 /* If the length of the vector is fixed, try to align to that length,
13793 otherwise don't try to align at all. */
13794 HOST_WIDE_INT result;
13795 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
13796 result = TYPE_ALIGN (TREE_TYPE (type));
13797 return result;
13798 }
13799 return TYPE_ALIGN (type);
13800 }
13801
13802 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13803 static bool
13804 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
13805 {
13806 if (is_packed)
13807 return false;
13808
13809 /* For fixed-length vectors, check that the vectorizer will aim for
13810 full-vector alignment. This isn't true for generic GCC vectors
13811 that are wider than the ABI maximum of 128 bits. */
13812 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
13813 && (wi::to_widest (TYPE_SIZE (type))
13814 != aarch64_vectorize_preferred_vector_alignment (type)))
13815 return false;
13816
13817 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13818 return true;
13819 }
13820
13821 /* Return true if the vector misalignment factor is supported by the
13822 target. */
13823 static bool
13824 aarch64_builtin_support_vector_misalignment (machine_mode mode,
13825 const_tree type, int misalignment,
13826 bool is_packed)
13827 {
13828 if (TARGET_SIMD && STRICT_ALIGNMENT)
13829 {
13830 /* Return if movmisalign pattern is not supported for this mode. */
13831 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
13832 return false;
13833
13834 /* Misalignment factor is unknown at compile time. */
13835 if (misalignment == -1)
13836 return false;
13837 }
13838 return default_builtin_support_vector_misalignment (mode, type, misalignment,
13839 is_packed);
13840 }
13841
13842 /* If VALS is a vector constant that can be loaded into a register
13843 using DUP, generate instructions to do so and return an RTX to
13844 assign to the register. Otherwise return NULL_RTX. */
13845 static rtx
13846 aarch64_simd_dup_constant (rtx vals)
13847 {
13848 machine_mode mode = GET_MODE (vals);
13849 machine_mode inner_mode = GET_MODE_INNER (mode);
13850 rtx x;
13851
13852 if (!const_vec_duplicate_p (vals, &x))
13853 return NULL_RTX;
13854
13855 /* We can load this constant by using DUP and a constant in a
13856 single ARM register. This will be cheaper than a vector
13857 load. */
13858 x = copy_to_mode_reg (inner_mode, x);
13859 return gen_vec_duplicate (mode, x);
13860 }
13861
13862
13863 /* Generate code to load VALS, which is a PARALLEL containing only
13864 constants (for vec_init) or CONST_VECTOR, efficiently into a
13865 register. Returns an RTX to copy into the register, or NULL_RTX
13866 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
13867 static rtx
13868 aarch64_simd_make_constant (rtx vals)
13869 {
13870 machine_mode mode = GET_MODE (vals);
13871 rtx const_dup;
13872 rtx const_vec = NULL_RTX;
13873 int n_const = 0;
13874 int i;
13875
13876 if (GET_CODE (vals) == CONST_VECTOR)
13877 const_vec = vals;
13878 else if (GET_CODE (vals) == PARALLEL)
13879 {
13880 /* A CONST_VECTOR must contain only CONST_INTs and
13881 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13882 Only store valid constants in a CONST_VECTOR. */
13883 int n_elts = XVECLEN (vals, 0);
13884 for (i = 0; i < n_elts; ++i)
13885 {
13886 rtx x = XVECEXP (vals, 0, i);
13887 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13888 n_const++;
13889 }
13890 if (n_const == n_elts)
13891 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
13892 }
13893 else
13894 gcc_unreachable ();
13895
13896 if (const_vec != NULL_RTX
13897 && aarch64_simd_valid_immediate (const_vec, NULL))
13898 /* Load using MOVI/MVNI. */
13899 return const_vec;
13900 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
13901 /* Loaded using DUP. */
13902 return const_dup;
13903 else if (const_vec != NULL_RTX)
13904 /* Load from constant pool. We cannot take advantage of single-cycle
13905 LD1 because we need a PC-relative addressing mode. */
13906 return const_vec;
13907 else
13908 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13909 We cannot construct an initializer. */
13910 return NULL_RTX;
13911 }
13912
13913 /* Expand a vector initialisation sequence, such that TARGET is
13914 initialised to contain VALS. */
13915
13916 void
13917 aarch64_expand_vector_init (rtx target, rtx vals)
13918 {
13919 machine_mode mode = GET_MODE (target);
13920 scalar_mode inner_mode = GET_MODE_INNER (mode);
13921 /* The number of vector elements. */
13922 int n_elts = XVECLEN (vals, 0);
13923 /* The number of vector elements which are not constant. */
13924 int n_var = 0;
13925 rtx any_const = NULL_RTX;
13926 /* The first element of vals. */
13927 rtx v0 = XVECEXP (vals, 0, 0);
13928 bool all_same = true;
13929
13930 /* Count the number of variable elements to initialise. */
13931 for (int i = 0; i < n_elts; ++i)
13932 {
13933 rtx x = XVECEXP (vals, 0, i);
13934 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
13935 ++n_var;
13936 else
13937 any_const = x;
13938
13939 all_same &= rtx_equal_p (x, v0);
13940 }
13941
13942 /* No variable elements, hand off to aarch64_simd_make_constant which knows
13943 how best to handle this. */
13944 if (n_var == 0)
13945 {
13946 rtx constant = aarch64_simd_make_constant (vals);
13947 if (constant != NULL_RTX)
13948 {
13949 emit_move_insn (target, constant);
13950 return;
13951 }
13952 }
13953
13954 /* Splat a single non-constant element if we can. */
13955 if (all_same)
13956 {
13957 rtx x = copy_to_mode_reg (inner_mode, v0);
13958 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13959 return;
13960 }
13961
13962 enum insn_code icode = optab_handler (vec_set_optab, mode);
13963 gcc_assert (icode != CODE_FOR_nothing);
13964
13965 /* If there are only variable elements, try to optimize
13966 the insertion using dup for the most common element
13967 followed by insertions. */
13968
13969 /* The algorithm will fill matches[*][0] with the earliest matching element,
13970 and matches[X][1] with the count of duplicate elements (if X is the
13971 earliest element which has duplicates). */
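/* For example (illustrative), for VALS == { x, y, x, x } the loop below
   leaves matches[0] == {0, 3}, matches[1] == {1, 1} and matches[2] ==
   matches[3] == {0, 0}, so element 0 wins and x is used for the initial
   DUP.  */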
13972
13973 if (n_var == n_elts && n_elts <= 16)
13974 {
13975 int matches[16][2] = {0};
13976 for (int i = 0; i < n_elts; i++)
13977 {
13978 for (int j = 0; j <= i; j++)
13979 {
13980 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
13981 {
13982 matches[i][0] = j;
13983 matches[j][1]++;
13984 break;
13985 }
13986 }
13987 }
13988 int maxelement = 0;
13989 int maxv = 0;
13990 for (int i = 0; i < n_elts; i++)
13991 if (matches[i][1] > maxv)
13992 {
13993 maxelement = i;
13994 maxv = matches[i][1];
13995 }
13996
13997 /* Create a duplicate of the most common element, unless all elements
13998 are equally useless to us, in which case just immediately set the
13999 vector register using the first element. */
14000
14001 if (maxv == 1)
14002 {
14003 /* For vectors of two 64-bit elements, we can do even better. */
14004 if (n_elts == 2
14005 && (inner_mode == E_DImode
14006 || inner_mode == E_DFmode))
14007
14008 {
14009 rtx x0 = XVECEXP (vals, 0, 0);
14010 rtx x1 = XVECEXP (vals, 0, 1);
14011 /* Combine can pick up this case, but handling it directly
14012 here leaves clearer RTL.
14013
14014 This is load_pair_lanes<mode>, and also gives us a clean-up
14015 for store_pair_lanes<mode>. */
14016 if (memory_operand (x0, inner_mode)
14017 && memory_operand (x1, inner_mode)
14018 && !STRICT_ALIGNMENT
14019 && rtx_equal_p (XEXP (x1, 0),
14020 plus_constant (Pmode,
14021 XEXP (x0, 0),
14022 GET_MODE_SIZE (inner_mode))))
14023 {
14024 rtx t;
14025 if (inner_mode == DFmode)
14026 t = gen_load_pair_lanesdf (target, x0, x1);
14027 else
14028 t = gen_load_pair_lanesdi (target, x0, x1);
14029 emit_insn (t);
14030 return;
14031 }
14032 }
14033 /* The subreg-move sequence below will move into lane zero of the
14034 vector register. For big-endian we want that position to hold
14035 the last element of VALS. */
14036 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
14037 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
14038 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
14039 }
14040 else
14041 {
14042 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
14043 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
14044 }
14045
14046 /* Insert the rest. */
14047 for (int i = 0; i < n_elts; i++)
14048 {
14049 rtx x = XVECEXP (vals, 0, i);
14050 if (matches[i][0] == maxelement)
14051 continue;
14052 x = copy_to_mode_reg (inner_mode, x);
14053 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
14054 }
14055 return;
14056 }
14057
14058 /* Initialise a vector which is part-variable. We want to first try
14059 to build those lanes which are constant in the most efficient way we
14060 can. */
14061 if (n_var != n_elts)
14062 {
14063 rtx copy = copy_rtx (vals);
14064
14065 /* Load constant part of vector. We really don't care what goes into the
14066 parts we will overwrite, but we're more likely to be able to load the
14067 constant efficiently if it has fewer, larger, repeating parts
14068 (see aarch64_simd_valid_immediate). */
14069 for (int i = 0; i < n_elts; i++)
14070 {
14071 rtx x = XVECEXP (vals, 0, i);
14072 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14073 continue;
14074 rtx subst = any_const;
14075 for (int bit = n_elts / 2; bit > 0; bit /= 2)
14076 {
14077 /* Look in the copied vector, as more elements are const. */
14078 rtx test = XVECEXP (copy, 0, i ^ bit);
14079 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
14080 {
14081 subst = test;
14082 break;
14083 }
14084 }
14085 XVECEXP (copy, 0, i) = subst;
14086 }
14087 aarch64_expand_vector_init (target, copy);
14088 }
14089
14090 /* Insert the variable lanes directly. */
14091 for (int i = 0; i < n_elts; i++)
14092 {
14093 rtx x = XVECEXP (vals, 0, i);
14094 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14095 continue;
14096 x = copy_to_mode_reg (inner_mode, x);
14097 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
14098 }
14099 }
14100
14101 static unsigned HOST_WIDE_INT
14102 aarch64_shift_truncation_mask (machine_mode mode)
14103 {
14104 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
14105 return 0;
14106 return GET_MODE_UNIT_BITSIZE (mode) - 1;
14107 }
14108
14109 /* Select a format to encode pointers in exception handling data. */
14110 int
14111 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
14112 {
14113 int type;
14114 switch (aarch64_cmodel)
14115 {
14116 case AARCH64_CMODEL_TINY:
14117 case AARCH64_CMODEL_TINY_PIC:
14118 case AARCH64_CMODEL_SMALL:
14119 case AARCH64_CMODEL_SMALL_PIC:
14120 case AARCH64_CMODEL_SMALL_SPIC:
14121 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
14122 for everything. */
14123 type = DW_EH_PE_sdata4;
14124 break;
14125 default:
14126 /* No assumptions here. 8-byte relocs required. */
14127 type = DW_EH_PE_sdata8;
14128 break;
14129 }
14130 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
14131 }
14132
14133 /* The last .arch and .tune assembly strings that we printed. */
14134 static std::string aarch64_last_printed_arch_string;
14135 static std::string aarch64_last_printed_tune_string;
14136
14137 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
14138 by the function fndecl. */
14139
14140 void
14141 aarch64_declare_function_name (FILE *stream, const char* name,
14142 tree fndecl)
14143 {
14144 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14145
14146 struct cl_target_option *targ_options;
14147 if (target_parts)
14148 targ_options = TREE_TARGET_OPTION (target_parts);
14149 else
14150 targ_options = TREE_TARGET_OPTION (target_option_current_node);
14151 gcc_assert (targ_options);
14152
14153 const struct processor *this_arch
14154 = aarch64_get_arch (targ_options->x_explicit_arch);
14155
14156 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
14157 std::string extension
14158 = aarch64_get_extension_string_for_isa_flags (isa_flags,
14159 this_arch->flags);
14160 /* Only update the assembler .arch string if it is distinct from the last
14161 such string we printed. */
14162 std::string to_print = this_arch->name + extension;
14163 if (to_print != aarch64_last_printed_arch_string)
14164 {
14165 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
14166 aarch64_last_printed_arch_string = to_print;
14167 }
14168
14169 /* Print the cpu name we're tuning for in the comments; it might be
14170 useful to readers of the generated asm. Do it only when it changes
14171 from function to function and verbose assembly is requested. */
14172 const struct processor *this_tune
14173 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
14174
14175 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
14176 {
14177 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
14178 this_tune->name);
14179 aarch64_last_printed_tune_string = this_tune->name;
14180 }
14181
14182 /* Don't forget the type directive for ELF. */
14183 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
14184 ASM_OUTPUT_LABEL (stream, name);
14185 }
14186
14187 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
14188
14189 static void
14190 aarch64_start_file (void)
14191 {
14192 struct cl_target_option *default_options
14193 = TREE_TARGET_OPTION (target_option_default_node);
14194
14195 const struct processor *default_arch
14196 = aarch64_get_arch (default_options->x_explicit_arch);
14197 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
14198 std::string extension
14199 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
14200 default_arch->flags);
14201
14202 aarch64_last_printed_arch_string = default_arch->name + extension;
14203 aarch64_last_printed_tune_string = "";
14204 asm_fprintf (asm_out_file, "\t.arch %s\n",
14205 aarch64_last_printed_arch_string.c_str ());
14206
14207 default_file_start ();
14208 }
14209
14210 /* Emit load exclusive. */
14211
14212 static void
14213 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
14214 rtx mem, rtx model_rtx)
14215 {
14216 rtx (*gen) (rtx, rtx, rtx);
14217
14218 switch (mode)
14219 {
14220 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
14221 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
14222 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
14223 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
14224 default:
14225 gcc_unreachable ();
14226 }
14227
14228 emit_insn (gen (rval, mem, model_rtx));
14229 }
14230
14231 /* Emit store exclusive. */
14232
14233 static void
14234 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
14235 rtx rval, rtx mem, rtx model_rtx)
14236 {
14237 rtx (*gen) (rtx, rtx, rtx, rtx);
14238
14239 switch (mode)
14240 {
14241 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
14242 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
14243 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
14244 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
14245 default:
14246 gcc_unreachable ();
14247 }
14248
14249 emit_insn (gen (bval, rval, mem, model_rtx));
14250 }
14251
14252 /* Mark the previous jump instruction as unlikely. */
14253
14254 static void
14255 aarch64_emit_unlikely_jump (rtx insn)
14256 {
14257 rtx_insn *jump = emit_jump_insn (insn);
14258 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
14259 }
14260
14261 /* Expand a compare and swap pattern. */
14262
14263 void
14264 aarch64_expand_compare_and_swap (rtx operands[])
14265 {
14266 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
14267 machine_mode mode, cmp_mode;
14268 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
14269 int idx;
14270 gen_cas_fn gen;
14271 const gen_cas_fn split_cas[] =
14272 {
14273 gen_aarch64_compare_and_swapqi,
14274 gen_aarch64_compare_and_swaphi,
14275 gen_aarch64_compare_and_swapsi,
14276 gen_aarch64_compare_and_swapdi
14277 };
14278 const gen_cas_fn atomic_cas[] =
14279 {
14280 gen_aarch64_compare_and_swapqi_lse,
14281 gen_aarch64_compare_and_swaphi_lse,
14282 gen_aarch64_compare_and_swapsi_lse,
14283 gen_aarch64_compare_and_swapdi_lse
14284 };
14285
14286 bval = operands[0];
14287 rval = operands[1];
14288 mem = operands[2];
14289 oldval = operands[3];
14290 newval = operands[4];
14291 is_weak = operands[5];
14292 mod_s = operands[6];
14293 mod_f = operands[7];
14294 mode = GET_MODE (mem);
14295 cmp_mode = mode;
14296
14297 /* Normally the succ memory model must be stronger than fail, but in the
14298 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14299 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
14300
14301 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
14302 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
14303 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
14304
14305 switch (mode)
14306 {
14307 case E_QImode:
14308 case E_HImode:
14309 /* For short modes, we're going to perform the comparison in SImode,
14310 so do the zero-extension now. */
14311 cmp_mode = SImode;
14312 rval = gen_reg_rtx (SImode);
14313 oldval = convert_modes (SImode, mode, oldval, true);
14314 /* Fall through. */
14315
14316 case E_SImode:
14317 case E_DImode:
14318 /* Force the value into a register if needed. */
14319 if (!aarch64_plus_operand (oldval, mode))
14320 oldval = force_reg (cmp_mode, oldval);
14321 break;
14322
14323 default:
14324 gcc_unreachable ();
14325 }
14326
14327 switch (mode)
14328 {
14329 case E_QImode: idx = 0; break;
14330 case E_HImode: idx = 1; break;
14331 case E_SImode: idx = 2; break;
14332 case E_DImode: idx = 3; break;
14333 default:
14334 gcc_unreachable ();
14335 }
14336 if (TARGET_LSE)
14337 gen = atomic_cas[idx];
14338 else
14339 gen = split_cas[idx];
14340
14341 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
14342
14343 if (mode == QImode || mode == HImode)
14344 emit_move_insn (operands[1], gen_lowpart (mode, rval));
14345
14346 x = gen_rtx_REG (CCmode, CC_REGNUM);
14347 x = gen_rtx_EQ (SImode, x, const0_rtx);
14348 emit_insn (gen_rtx_SET (bval, x));
14349 }
14350
14351 /* Test whether the target supports using an atomic load-operate instruction.
14352 CODE is the operation and AFTER is TRUE if the data in memory after the
14353 operation should be returned and FALSE if the data before the operation
14354 should be returned. Returns FALSE if the operation isn't supported by the
14355 architecture. */
14356
14357 bool
14358 aarch64_atomic_ldop_supported_p (enum rtx_code code)
14359 {
14360 if (!TARGET_LSE)
14361 return false;
14362
14363 switch (code)
14364 {
14365 case SET:
14366 case AND:
14367 case IOR:
14368 case XOR:
14369 case MINUS:
14370 case PLUS:
14371 return true;
14372 default:
14373 return false;
14374 }
14375 }
14376
14377 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
14378 sequence implementing an atomic operation. */
14379
14380 static void
14381 aarch64_emit_post_barrier (enum memmodel model)
14382 {
14383 const enum memmodel base_model = memmodel_base (model);
14384
14385 if (is_mm_sync (model)
14386 && (base_model == MEMMODEL_ACQUIRE
14387 || base_model == MEMMODEL_ACQ_REL
14388 || base_model == MEMMODEL_SEQ_CST))
14389 {
14390 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
14391 }
14392 }
14393
14394 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
14395 for the data in memory. EXPECTED is the value expected to be in memory.
14396 DESIRED is the value to store to memory. MEM is the memory location. MODEL
14397 is the memory ordering to use. */
14398
14399 void
14400 aarch64_gen_atomic_cas (rtx rval, rtx mem,
14401 rtx expected, rtx desired,
14402 rtx model)
14403 {
14404 rtx (*gen) (rtx, rtx, rtx, rtx);
14405 machine_mode mode;
14406
14407 mode = GET_MODE (mem);
14408
14409 switch (mode)
14410 {
14411 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
14412 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
14413 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
14414 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
14415 default:
14416 gcc_unreachable ();
14417 }
14418
14419 /* Move the expected value into the CAS destination register. */
14420 emit_insn (gen_rtx_SET (rval, expected));
14421
14422 /* Emit the CAS. */
14423 emit_insn (gen (rval, mem, desired, model));
14424
14425 /* Compare the expected value with the value loaded by the CAS, to establish
14426 whether the swap was made. */
14427 aarch64_gen_compare_reg (EQ, rval, expected);
14428 }
14429
14430 /* Split a compare and swap pattern. */
14431
14432 void
14433 aarch64_split_compare_and_swap (rtx operands[])
14434 {
14435 rtx rval, mem, oldval, newval, scratch;
14436 machine_mode mode;
14437 bool is_weak;
14438 rtx_code_label *label1, *label2;
14439 rtx x, cond;
14440 enum memmodel model;
14441 rtx model_rtx;
14442
14443 rval = operands[0];
14444 mem = operands[1];
14445 oldval = operands[2];
14446 newval = operands[3];
14447 is_weak = (operands[4] != const0_rtx);
14448 model_rtx = operands[5];
14449 scratch = operands[7];
14450 mode = GET_MODE (mem);
14451 model = memmodel_from_int (INTVAL (model_rtx));
14452
14453 /* When OLDVAL is zero and we want the strong version we can emit a tighter
14454 loop:
14455 .label1:
14456 LD[A]XR rval, [mem]
14457 CBNZ rval, .label2
14458 ST[L]XR scratch, newval, [mem]
14459 CBNZ scratch, .label1
14460 .label2:
14461 CMP rval, 0. */
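/* For the strong compare-and-swap with a nonzero OLDVAL the sequence is
   roughly (illustrative sketch, not emitted verbatim):
   .label1:
     LD[A]XR rval, [mem]
     CMP     rval, oldval
     B.NE    .label2
     ST[L]XR scratch, newval, [mem]
     CBNZ    scratch, .label1
   .label2:  */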
14462 bool strong_zero_p = !is_weak && oldval == const0_rtx;
14463
14464 label1 = NULL;
14465 if (!is_weak)
14466 {
14467 label1 = gen_label_rtx ();
14468 emit_label (label1);
14469 }
14470 label2 = gen_label_rtx ();
14471
14472 /* The initial load can be relaxed for a __sync operation since a final
14473 barrier will be emitted to stop code hoisting. */
14474 if (is_mm_sync (model))
14475 aarch64_emit_load_exclusive (mode, rval, mem,
14476 GEN_INT (MEMMODEL_RELAXED));
14477 else
14478 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
14479
14480 if (strong_zero_p)
14481 {
14482 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
14483 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14484 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14485 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14486 }
14487 else
14488 {
14489 cond = aarch64_gen_compare_reg (NE, rval, oldval);
14490 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14491 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14492 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14493 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14494 }
14495
14496 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
14497
14498 if (!is_weak)
14499 {
14500 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
14501 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14502 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
14503 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14504 }
14505 else
14506 {
14507 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14508 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
14509 emit_insn (gen_rtx_SET (cond, x));
14510 }
14511
14512 emit_label (label2);
14513 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
14514 to set the condition flags. If this is not used it will be removed by
14515 later passes. */
14516 if (strong_zero_p)
14517 {
14518 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14519 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
14520 emit_insn (gen_rtx_SET (cond, x));
14521 }
14522 /* Emit any final barrier needed for a __sync operation. */
14523 if (is_mm_sync (model))
14524 aarch64_emit_post_barrier (model);
14525 }
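
/* For reference, the general (non-zero OLDVAL) strong sequence split out
   above is roughly, with size and acquire/release suffixes chosen from MODE
   and MODEL:

	.label1:
	  LD[A]XR	rval, [mem]
	  CMP		rval, oldval
	  B.NE		.label2
	  ST[L]XR	scratch, newval, [mem]
	  CBNZ		scratch, .label1
	.label2:

   The weak form omits the retry loop and instead just sets the condition
   flags from SCRATCH.  */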
14526
14527 /* Emit a BIC instruction, computing DST = S1 & ~(S2 >> SHIFT).  */
14528
14529 static void
14530 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
14531 {
14532 rtx shift_rtx = GEN_INT (shift);
14533 rtx (*gen) (rtx, rtx, rtx, rtx);
14534
14535 switch (mode)
14536 {
14537 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
14538 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
14539 default:
14540 gcc_unreachable ();
14541 }
14542
14543 emit_insn (gen (dst, s2, shift_rtx, s1));
14544 }
14545
14546 /* Emit an atomic swap. */
14547
14548 static void
14549 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
14550 rtx mem, rtx model)
14551 {
14552 rtx (*gen) (rtx, rtx, rtx, rtx);
14553
14554 switch (mode)
14555 {
14556 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
14557 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
14558 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
14559 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
14560 default:
14561 gcc_unreachable ();
14562 }
14563
14564 emit_insn (gen (dst, mem, value, model));
14565 }
14566
14567 /* Operations supported by aarch64_emit_atomic_load_op. */
14568
14569 enum aarch64_atomic_load_op_code
14570 {
14571 AARCH64_LDOP_PLUS, /* A + B */
14572 AARCH64_LDOP_XOR, /* A ^ B */
14573 AARCH64_LDOP_OR, /* A | B */
14574 AARCH64_LDOP_BIC /* A & ~B */
14575 };
14576
14577 /* Emit an atomic load-operate. */
14578
14579 static void
14580 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
14581 machine_mode mode, rtx dst, rtx src,
14582 rtx mem, rtx model)
14583 {
14584 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
14585 const aarch64_atomic_load_op_fn plus[] =
14586 {
14587 gen_aarch64_atomic_loadaddqi,
14588 gen_aarch64_atomic_loadaddhi,
14589 gen_aarch64_atomic_loadaddsi,
14590 gen_aarch64_atomic_loadadddi
14591 };
14592 const aarch64_atomic_load_op_fn eor[] =
14593 {
14594 gen_aarch64_atomic_loadeorqi,
14595 gen_aarch64_atomic_loadeorhi,
14596 gen_aarch64_atomic_loadeorsi,
14597 gen_aarch64_atomic_loadeordi
14598 };
14599 const aarch64_atomic_load_op_fn ior[] =
14600 {
14601 gen_aarch64_atomic_loadsetqi,
14602 gen_aarch64_atomic_loadsethi,
14603 gen_aarch64_atomic_loadsetsi,
14604 gen_aarch64_atomic_loadsetdi
14605 };
14606 const aarch64_atomic_load_op_fn bic[] =
14607 {
14608 gen_aarch64_atomic_loadclrqi,
14609 gen_aarch64_atomic_loadclrhi,
14610 gen_aarch64_atomic_loadclrsi,
14611 gen_aarch64_atomic_loadclrdi
14612 };
14613 aarch64_atomic_load_op_fn gen;
14614 int idx = 0;
14615
14616 switch (mode)
14617 {
14618 case E_QImode: idx = 0; break;
14619 case E_HImode: idx = 1; break;
14620 case E_SImode: idx = 2; break;
14621 case E_DImode: idx = 3; break;
14622 default:
14623 gcc_unreachable ();
14624 }
14625
14626 switch (code)
14627 {
14628 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
14629 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
14630 case AARCH64_LDOP_OR: gen = ior[idx]; break;
14631 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
14632 default:
14633 gcc_unreachable ();
14634 }
14635
14636 emit_insn (gen (dst, mem, src, model));
14637 }
14638
14639 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
14640 location to store the data read from memory. OUT_RESULT is the location to
14641 store the result of the operation. MEM is the memory location to read and
14642 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
14643 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
14644 be NULL. */
14645
14646 void
14647 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
14648 rtx mem, rtx value, rtx model_rtx)
14649 {
14650 machine_mode mode = GET_MODE (mem);
14651 machine_mode wmode = (mode == DImode ? DImode : SImode);
14652 const bool short_mode = (mode < SImode);
14653 aarch64_atomic_load_op_code ldop_code;
14654 rtx src;
14655 rtx x;
14656
14657 if (out_data)
14658 out_data = gen_lowpart (mode, out_data);
14659
14660 if (out_result)
14661 out_result = gen_lowpart (mode, out_result);
14662
14663 /* Make sure the value is in a register, putting it into a destination
14664 register if it needs to be manipulated. */
14665 if (!register_operand (value, mode)
14666 || code == AND || code == MINUS)
14667 {
14668 src = out_result ? out_result : out_data;
14669 emit_move_insn (src, gen_lowpart (mode, value));
14670 }
14671 else
14672 src = value;
14673 gcc_assert (register_operand (src, mode));
14674
14675 /* Preprocess the data for the operation as necessary. If the operation is
14676 a SET then emit a swap instruction and finish. */
14677 switch (code)
14678 {
14679 case SET:
14680 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
14681 return;
14682
14683 case MINUS:
14684 /* Negate the value and treat it as a PLUS. */
14685 {
14686 rtx neg_src;
14687
14688 /* Resize the value if necessary. */
14689 if (short_mode)
14690 src = gen_lowpart (wmode, src);
14691
14692 neg_src = gen_rtx_NEG (wmode, src);
14693 emit_insn (gen_rtx_SET (src, neg_src));
14694
14695 if (short_mode)
14696 src = gen_lowpart (mode, src);
14697 }
14698 /* Fall-through. */
14699 case PLUS:
14700 ldop_code = AARCH64_LDOP_PLUS;
14701 break;
14702
14703 case IOR:
14704 ldop_code = AARCH64_LDOP_OR;
14705 break;
14706
14707 case XOR:
14708 ldop_code = AARCH64_LDOP_XOR;
14709 break;
14710
14711 case AND:
14712 {
14713 rtx not_src;
14714
14715 /* Resize the value if necessary. */
14716 if (short_mode)
14717 src = gen_lowpart (wmode, src);
14718
14719 not_src = gen_rtx_NOT (wmode, src);
14720 emit_insn (gen_rtx_SET (src, not_src));
14721
14722 if (short_mode)
14723 src = gen_lowpart (mode, src);
14724 }
14725 ldop_code = AARCH64_LDOP_BIC;
14726 break;
14727
14728 default:
14729 /* The operation can't be done with atomic instructions. */
14730 gcc_unreachable ();
14731 }
14732
14733 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
14734
14735 /* If necessary, calculate the data in memory after the update by redoing the
14736 operation from values in registers. */
14737 if (!out_result)
14738 return;
14739
14740 if (short_mode)
14741 {
14742 src = gen_lowpart (wmode, src);
14743 out_data = gen_lowpart (wmode, out_data);
14744 out_result = gen_lowpart (wmode, out_result);
14745 }
14746
14747 x = NULL_RTX;
14748
14749 switch (code)
14750 {
14751 case MINUS:
14752 case PLUS:
14753 x = gen_rtx_PLUS (wmode, out_data, src);
14754 break;
14755 case IOR:
14756 x = gen_rtx_IOR (wmode, out_data, src);
14757 break;
14758 case XOR:
14759 x = gen_rtx_XOR (wmode, out_data, src);
14760 break;
14761 case AND:
14762 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
14763 return;
14764 default:
14765 gcc_unreachable ();
14766 }
14767
14768 emit_set_insn (out_result, x);
14769
14770 return;
14771 }
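
/* For illustration: with LSE (ARMv8.1-A atomics), a __atomic_fetch_and on an
   int is handled here by inverting VALUE in a register and emitting LDCLR
   (AARCH64_LDOP_BIC), since the architecture provides "load and bit clear"
   rather than a plain atomic AND.  If the updated memory value is also
   required, it is recomputed afterwards as OUT_DATA & ~SRC via
   aarch64_emit_bic.  Similarly, a fetch-and-subtract negates VALUE and uses
   LDADD.  */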
14772
14773 /* Split an atomic operation. */
14774
14775 void
14776 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
14777 rtx value, rtx model_rtx, rtx cond)
14778 {
14779 machine_mode mode = GET_MODE (mem);
14780 machine_mode wmode = (mode == DImode ? DImode : SImode);
14781 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
14782 const bool is_sync = is_mm_sync (model);
14783 rtx_code_label *label;
14784 rtx x;
14785
14786 /* Split the atomic operation into a sequence. */
14787 label = gen_label_rtx ();
14788 emit_label (label);
14789
14790 if (new_out)
14791 new_out = gen_lowpart (wmode, new_out);
14792 if (old_out)
14793 old_out = gen_lowpart (wmode, old_out);
14794 else
14795 old_out = new_out;
14796 value = simplify_gen_subreg (wmode, value, mode, 0);
14797
14798 /* The initial load can be relaxed for a __sync operation since a final
14799 barrier will be emitted to stop code hoisting. */
14800 if (is_sync)
14801 aarch64_emit_load_exclusive (mode, old_out, mem,
14802 GEN_INT (MEMMODEL_RELAXED));
14803 else
14804 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
14805
14806 switch (code)
14807 {
14808 case SET:
14809 new_out = value;
14810 break;
14811
14812 case NOT:
14813 x = gen_rtx_AND (wmode, old_out, value);
14814 emit_insn (gen_rtx_SET (new_out, x));
14815 x = gen_rtx_NOT (wmode, new_out);
14816 emit_insn (gen_rtx_SET (new_out, x));
14817 break;
14818
14819 case MINUS:
14820 if (CONST_INT_P (value))
14821 {
14822 value = GEN_INT (-INTVAL (value));
14823 code = PLUS;
14824 }
14825 /* Fall through. */
14826
14827 default:
14828 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
14829 emit_insn (gen_rtx_SET (new_out, x));
14830 break;
14831 }
14832
14833 aarch64_emit_store_exclusive (mode, cond, mem,
14834 gen_lowpart (mode, new_out), model_rtx);
14835
14836 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14837 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14838 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
14839 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14840
14841 /* Emit any final barrier needed for a __sync operation. */
14842 if (is_sync)
14843 aarch64_emit_post_barrier (model);
14844 }
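
/* For example, without LSE a relaxed __atomic_fetch_add on an int is split
   into roughly:

	.loop:
	  LDXR	old, [mem]
	  ADD	new, old, value
	  STXR	cond, new, [mem]
	  CBNZ	cond, .loop

   with acquire/release variants of the exclusives used for stronger memory
   models, and a trailing memory barrier added for __sync operations.  */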
14845
14846 static void
14847 aarch64_init_libfuncs (void)
14848 {
14849 /* Half-precision float operations. The compiler handles all operations
14850 with NULL libfuncs by converting to SFmode. */
14851
14852 /* Conversions. */
14853 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
14854 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
14855
14856 /* Arithmetic. */
14857 set_optab_libfunc (add_optab, HFmode, NULL);
14858 set_optab_libfunc (sdiv_optab, HFmode, NULL);
14859 set_optab_libfunc (smul_optab, HFmode, NULL);
14860 set_optab_libfunc (neg_optab, HFmode, NULL);
14861 set_optab_libfunc (sub_optab, HFmode, NULL);
14862
14863 /* Comparisons. */
14864 set_optab_libfunc (eq_optab, HFmode, NULL);
14865 set_optab_libfunc (ne_optab, HFmode, NULL);
14866 set_optab_libfunc (lt_optab, HFmode, NULL);
14867 set_optab_libfunc (le_optab, HFmode, NULL);
14868 set_optab_libfunc (ge_optab, HFmode, NULL);
14869 set_optab_libfunc (gt_optab, HFmode, NULL);
14870 set_optab_libfunc (unord_optab, HFmode, NULL);
14871 }
14872
14873 /* Target hook for c_mode_for_suffix. */
14874 static machine_mode
14875 aarch64_c_mode_for_suffix (char suffix)
14876 {
14877 if (suffix == 'q')
14878 return TFmode;
14879
14880 return VOIDmode;
14881 }
14882
14883 /* We can only represent floating point constants which will fit in
14884 "quarter-precision" values. These values are characterised by
14885    "quarter-precision" values.  These values are characterised by
14886    a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
14887
14888 (-1)^s * (n/16) * 2^r
14889
14890 Where:
14891 's' is the sign bit.
14892 'n' is an integer in the range 16 <= n <= 31.
14893 'r' is an integer in the range -3 <= r <= 4. */
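
/* For example, 1.0 is (+1) * (16/16) * 2^0 and 0.3125 is (+1) * (20/16) * 2^-2,
   so both are representable; the representable magnitudes run from
   0.125 (16/16 * 2^-3) up to 31.0 (31/16 * 2^4), and values such as 0.1
   that are not of this form are rejected.  */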
14894
14895 /* Return true iff X can be represented as a quarter-precision
14896    floating point immediate operand.  Note that we cannot represent 0.0.  */
14897 bool
14898 aarch64_float_const_representable_p (rtx x)
14899 {
14900 /* This represents our current view of how many bits
14901 make up the mantissa. */
14902 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
14903 int exponent;
14904 unsigned HOST_WIDE_INT mantissa, mask;
14905 REAL_VALUE_TYPE r, m;
14906 bool fail;
14907
14908 if (!CONST_DOUBLE_P (x))
14909 return false;
14910
14911 /* We don't support HFmode constants yet. */
14912 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
14913 return false;
14914
14915 r = *CONST_DOUBLE_REAL_VALUE (x);
14916
14917 /* We cannot represent infinities, NaNs or +/-zero. We won't
14918 know if we have +zero until we analyse the mantissa, but we
14919 can reject the other invalid values. */
14920 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
14921 || REAL_VALUE_MINUS_ZERO (r))
14922 return false;
14923
14924 /* Extract exponent. */
14925 r = real_value_abs (&r);
14926 exponent = REAL_EXP (&r);
14927
14928   /* For the mantissa, we expand into two HOST_WIDE_INTs, apart from the
14929      highest (sign) bit, with a fixed binary point at bit point_pos.
14930      The low element of W holds the low part of the mantissa, the high element the high part.
14931      WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14932      bits for the mantissa, this can fail (low bits will be lost). */
14933 real_ldexp (&m, &r, point_pos - exponent);
14934 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
14935
14936 /* If the low part of the mantissa has bits set we cannot represent
14937 the value. */
14938 if (w.ulow () != 0)
14939 return false;
14940 /* We have rejected the lower HOST_WIDE_INT, so update our
14941 understanding of how many bits lie in the mantissa and
14942 look only at the high HOST_WIDE_INT. */
14943 mantissa = w.elt (1);
14944 point_pos -= HOST_BITS_PER_WIDE_INT;
14945
14946 /* We can only represent values with a mantissa of the form 1.xxxx. */
14947 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
14948 if ((mantissa & mask) != 0)
14949 return false;
14950
14951 /* Having filtered unrepresentable values, we may now remove all
14952 but the highest 5 bits. */
14953 mantissa >>= point_pos - 5;
14954
14955 /* We cannot represent the value 0.0, so reject it. This is handled
14956 elsewhere. */
14957 if (mantissa == 0)
14958 return false;
14959
14960 /* Then, as bit 4 is always set, we can mask it off, leaving
14961 the mantissa in the range [0, 15]. */
14962 mantissa &= ~(1 << 4);
14963 gcc_assert (mantissa <= 15);
14964
14965 /* GCC internally does not use IEEE754-like encoding (where normalized
14966 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
14967 Our mantissa values are shifted 4 places to the left relative to
14968 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14969 by 5 places to correct for GCC's representation. */
14970 exponent = 5 - exponent;
14971
14972 return (exponent >= 0 && exponent <= 7);
14973 }
14974
14975 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14976 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
14977 output MOVI/MVNI, ORR or BIC immediate. */
14978 char*
14979 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
14980 enum simd_immediate_check which)
14981 {
14982 bool is_valid;
14983 static char templ[40];
14984 const char *mnemonic;
14985 const char *shift_op;
14986 unsigned int lane_count = 0;
14987 char element_char;
14988
14989 struct simd_immediate_info info;
14990
14991 /* This will return true to show const_vector is legal for use as either
14992      an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14993 It will also update INFO to show how the immediate should be generated.
14994 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
14995 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
14996 gcc_assert (is_valid);
14997
14998 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14999 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
15000
15001 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15002 {
15003 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
15004 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
15005 move immediate path. */
15006 if (aarch64_float_const_zero_rtx_p (info.value))
15007 info.value = GEN_INT (0);
15008 else
15009 {
15010 const unsigned int buf_size = 20;
15011 char float_buf[buf_size] = {'\0'};
15012 real_to_decimal_for_mode (float_buf,
15013 CONST_DOUBLE_REAL_VALUE (info.value),
15014 buf_size, buf_size, 1, info.elt_mode);
15015
15016 if (lane_count == 1)
15017 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
15018 else
15019 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
15020 lane_count, element_char, float_buf);
15021 return templ;
15022 }
15023 }
15024
15025 gcc_assert (CONST_INT_P (info.value));
15026
15027 if (which == AARCH64_CHECK_MOV)
15028 {
15029 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
15030 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
15031 if (lane_count == 1)
15032 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
15033 mnemonic, UINTVAL (info.value));
15034 else if (info.shift)
15035 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15036 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
15037 element_char, UINTVAL (info.value), shift_op, info.shift);
15038 else
15039 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15040 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
15041 element_char, UINTVAL (info.value));
15042 }
15043 else
15044 {
15045 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
15046 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
15047 if (info.shift)
15048 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15049 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
15050 element_char, UINTVAL (info.value), "lsl", info.shift);
15051 else
15052 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15053 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
15054 element_char, UINTVAL (info.value));
15055 }
15056 return templ;
15057 }
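
/* For example, a V4SImode vector with all elements equal to 0x100 is
   recognised as the 8-bit value 0x1 shifted left by 8 and printed as
   "movi\t%0.4s, 0x1, lsl 8", while a V16QImode vector of all-ones-valued
   bytes (0x01) prints as "movi\t%0.16b, 0x1".  */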
15058
15059 char*
15060 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
15061 {
15062
15063 /* If a floating point number was passed and we desire to use it in an
15064      integer mode, do the conversion to integer.  */
15065 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
15066 {
15067 unsigned HOST_WIDE_INT ival;
15068 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
15069 gcc_unreachable ();
15070 immediate = gen_int_mode (ival, mode);
15071 }
15072
15073 machine_mode vmode;
15074   /* Use a 64-bit mode for everything except for DI/DF mode, where we use
15075      a 128-bit vector mode.  */
15076 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
15077
15078 vmode = aarch64_simd_container_mode (mode, width);
15079 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
15080 return aarch64_output_simd_mov_immediate (v_op, width);
15081 }
15082
15083 /* Return the output string to use for moving immediate CONST_VECTOR
15084 into an SVE register. */
15085
15086 char *
15087 aarch64_output_sve_mov_immediate (rtx const_vector)
15088 {
15089 static char templ[40];
15090 struct simd_immediate_info info;
15091 char element_char;
15092
15093 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
15094 gcc_assert (is_valid);
15095
15096 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15097
15098 if (info.step)
15099 {
15100 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
15101 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
15102 element_char, INTVAL (info.value), INTVAL (info.step));
15103 return templ;
15104 }
15105
15106 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15107 {
15108 if (aarch64_float_const_zero_rtx_p (info.value))
15109 info.value = GEN_INT (0);
15110 else
15111 {
15112 const int buf_size = 20;
15113 char float_buf[buf_size] = {};
15114 real_to_decimal_for_mode (float_buf,
15115 CONST_DOUBLE_REAL_VALUE (info.value),
15116 buf_size, buf_size, 1, info.elt_mode);
15117
15118 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
15119 element_char, float_buf);
15120 return templ;
15121 }
15122 }
15123
15124 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
15125 element_char, INTVAL (info.value));
15126 return templ;
15127 }
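
/* For example, an SVE constant { 0, 1, 2, 3, ... } of 32-bit elements is
   emitted as "index\t%0.s, #0, #1", and a splat of the byte value 1 as
   "mov\t%0.b, #1".  */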
15128
15129 /* Return the asm format for a PTRUE instruction whose destination has
15130 mode MODE. SUFFIX is the element size suffix. */
15131
15132 char *
15133 aarch64_output_ptrue (machine_mode mode, char suffix)
15134 {
15135 unsigned int nunits;
15136 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
15137 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
15138 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
15139 else
15140 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
15141 return buf;
15142 }
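
/* For example, this yields "ptrue\t%0.s, vl8" when the number of elements is
   known at compile time (e.g. with a fixed SVE vector length) and
   "ptrue\t%0.s, all" otherwise.  */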
15143
15144 /* Split operands into moves from op[1] + op[2] into op[0]. */
15145
15146 void
15147 aarch64_split_combinev16qi (rtx operands[3])
15148 {
15149 unsigned int dest = REGNO (operands[0]);
15150 unsigned int src1 = REGNO (operands[1]);
15151 unsigned int src2 = REGNO (operands[2]);
15152 machine_mode halfmode = GET_MODE (operands[1]);
15153 unsigned int halfregs = REG_NREGS (operands[1]);
15154 rtx destlo, desthi;
15155
15156 gcc_assert (halfmode == V16QImode);
15157
15158 if (src1 == dest && src2 == dest + halfregs)
15159 {
15160 /* No-op move. Can't split to nothing; emit something. */
15161 emit_note (NOTE_INSN_DELETED);
15162 return;
15163 }
15164
15165 /* Preserve register attributes for variable tracking. */
15166 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
15167 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
15168 GET_MODE_SIZE (halfmode));
15169
15170 /* Special case of reversed high/low parts. */
15171 if (reg_overlap_mentioned_p (operands[2], destlo)
15172 && reg_overlap_mentioned_p (operands[1], desthi))
15173 {
15174 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15175 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
15176 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15177 }
15178 else if (!reg_overlap_mentioned_p (operands[2], destlo))
15179 {
15180 /* Try to avoid unnecessary moves if part of the result
15181 is in the right place already. */
15182 if (src1 != dest)
15183 emit_move_insn (destlo, operands[1]);
15184 if (src2 != dest + halfregs)
15185 emit_move_insn (desthi, operands[2]);
15186 }
15187 else
15188 {
15189 if (src2 != dest + halfregs)
15190 emit_move_insn (desthi, operands[2]);
15191 if (src1 != dest)
15192 emit_move_insn (destlo, operands[1]);
15193 }
15194 }
15195
15196 /* vec_perm support. */
15197
15198 struct expand_vec_perm_d
15199 {
15200 rtx target, op0, op1;
15201 vec_perm_indices perm;
15202 machine_mode vmode;
15203 unsigned int vec_flags;
15204 bool one_vector_p;
15205 bool testing_p;
15206 };
15207
15208 /* Generate a variable permutation. */
15209
15210 static void
15211 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
15212 {
15213 machine_mode vmode = GET_MODE (target);
15214 bool one_vector_p = rtx_equal_p (op0, op1);
15215
15216 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
15217 gcc_checking_assert (GET_MODE (op0) == vmode);
15218 gcc_checking_assert (GET_MODE (op1) == vmode);
15219 gcc_checking_assert (GET_MODE (sel) == vmode);
15220 gcc_checking_assert (TARGET_SIMD);
15221
15222 if (one_vector_p)
15223 {
15224 if (vmode == V8QImode)
15225 {
15226 /* Expand the argument to a V16QI mode by duplicating it. */
15227 rtx pair = gen_reg_rtx (V16QImode);
15228 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
15229 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15230 }
15231 else
15232 {
15233 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
15234 }
15235 }
15236 else
15237 {
15238 rtx pair;
15239
15240 if (vmode == V8QImode)
15241 {
15242 pair = gen_reg_rtx (V16QImode);
15243 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
15244 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15245 }
15246 else
15247 {
15248 pair = gen_reg_rtx (OImode);
15249 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
15250 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
15251 }
15252 }
15253 }
15254
15255 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15256 NELT is the number of elements in the vector. */
15257
15258 void
15259 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
15260 unsigned int nelt)
15261 {
15262 machine_mode vmode = GET_MODE (target);
15263 bool one_vector_p = rtx_equal_p (op0, op1);
15264 rtx mask;
15265
15266 /* The TBL instruction does not use a modulo index, so we must take care
15267 of that ourselves. */
15268 mask = aarch64_simd_gen_const_vector_dup (vmode,
15269 one_vector_p ? nelt - 1 : 2 * nelt - 1);
15270 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
15271
15272 /* For big-endian, we also need to reverse the index within the vector
15273 (but not which vector). */
15274 if (BYTES_BIG_ENDIAN)
15275 {
15276 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
15277 if (!one_vector_p)
15278 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
15279 sel = expand_simple_binop (vmode, XOR, sel, mask,
15280 NULL, 0, OPTAB_LIB_WIDEN);
15281 }
15282 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
15283 }
15284
15285 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
15286
15287 static void
15288 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
15289 {
15290 emit_insn (gen_rtx_SET (target,
15291 gen_rtx_UNSPEC (GET_MODE (target),
15292 gen_rtvec (2, op0, op1), code)));
15293 }
15294
15295 /* Expand an SVE vec_perm with the given operands. */
15296
15297 void
15298 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15299 {
15300 machine_mode data_mode = GET_MODE (target);
15301 machine_mode sel_mode = GET_MODE (sel);
15302 /* Enforced by the pattern condition. */
15303 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15304
15305 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15306 size of the two value vectors, i.e. the upper bits of the indices
15307 are effectively ignored. SVE TBL instead produces 0 for any
15308 out-of-range indices, so we need to modulo all the vec_perm indices
15309 to ensure they are all in range. */
15310 rtx sel_reg = force_reg (sel_mode, sel);
15311
15312 /* Check if the sel only references the first values vector. */
15313 if (GET_CODE (sel) == CONST_VECTOR
15314 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15315 {
15316 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15317 return;
15318 }
15319
15320 /* Check if the two values vectors are the same. */
15321 if (rtx_equal_p (op0, op1))
15322 {
15323 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15324 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15325 NULL, 0, OPTAB_DIRECT);
15326 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15327 return;
15328 }
15329
15330   /* Run TBL on each value vector and combine the results.  */
15331
15332 rtx res0 = gen_reg_rtx (data_mode);
15333 rtx res1 = gen_reg_rtx (data_mode);
15334 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15335 if (GET_CODE (sel) != CONST_VECTOR
15336 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15337 {
15338 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15339 2 * nunits - 1);
15340 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15341 NULL, 0, OPTAB_DIRECT);
15342 }
15343 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15344 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15345 NULL, 0, OPTAB_DIRECT);
15346 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15347 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15348 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15349 else
15350 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
15351 }
15352
15353 /* Recognize patterns suitable for the TRN instructions. */
15354 static bool
15355 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15356 {
15357 HOST_WIDE_INT odd;
15358 poly_uint64 nelt = d->perm.length ();
15359 rtx out, in0, in1, x;
15360 machine_mode vmode = d->vmode;
15361
15362 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15363 return false;
15364
15365 /* Note that these are little-endian tests.
15366 We correct for big-endian later. */
15367 if (!d->perm[0].is_constant (&odd)
15368 || (odd != 0 && odd != 1)
15369 || !d->perm.series_p (0, 2, odd, 2)
15370 || !d->perm.series_p (1, 2, nelt + odd, 2))
15371 return false;
15372
15373 /* Success! */
15374 if (d->testing_p)
15375 return true;
15376
15377 in0 = d->op0;
15378 in1 = d->op1;
15379 /* We don't need a big-endian lane correction for SVE; see the comment
15380 at the head of aarch64-sve.md for details. */
15381 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15382 {
15383 x = in0, in0 = in1, in1 = x;
15384 odd = !odd;
15385 }
15386 out = d->target;
15387
15388 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15389 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
15390 return true;
15391 }
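
/* For example, on V4SImode the permutation { 0, 4, 2, 6 } is matched above as
   TRN1 and { 1, 5, 3, 7 } as TRN2 (modulo the big-endian correction).  */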
15392
15393 /* Recognize patterns suitable for the UZP instructions. */
15394 static bool
15395 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
15396 {
15397 HOST_WIDE_INT odd;
15398 rtx out, in0, in1, x;
15399 machine_mode vmode = d->vmode;
15400
15401 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15402 return false;
15403
15404 /* Note that these are little-endian tests.
15405 We correct for big-endian later. */
15406 if (!d->perm[0].is_constant (&odd)
15407 || (odd != 0 && odd != 1)
15408 || !d->perm.series_p (0, 1, odd, 2))
15409 return false;
15410
15411 /* Success! */
15412 if (d->testing_p)
15413 return true;
15414
15415 in0 = d->op0;
15416 in1 = d->op1;
15417 /* We don't need a big-endian lane correction for SVE; see the comment
15418 at the head of aarch64-sve.md for details. */
15419 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15420 {
15421 x = in0, in0 = in1, in1 = x;
15422 odd = !odd;
15423 }
15424 out = d->target;
15425
15426 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15427 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
15428 return true;
15429 }
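
/* For example, on V4SImode the permutation { 0, 2, 4, 6 } is matched above as
   UZP1 and { 1, 3, 5, 7 } as UZP2.  */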
15430
15431 /* Recognize patterns suitable for the ZIP instructions. */
15432 static bool
15433 aarch64_evpc_zip (struct expand_vec_perm_d *d)
15434 {
15435 unsigned int high;
15436 poly_uint64 nelt = d->perm.length ();
15437 rtx out, in0, in1, x;
15438 machine_mode vmode = d->vmode;
15439
15440 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15441 return false;
15442
15443 /* Note that these are little-endian tests.
15444 We correct for big-endian later. */
15445 poly_uint64 first = d->perm[0];
15446 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
15447 || !d->perm.series_p (0, 2, first, 1)
15448 || !d->perm.series_p (1, 2, first + nelt, 1))
15449 return false;
15450 high = maybe_ne (first, 0U);
15451
15452 /* Success! */
15453 if (d->testing_p)
15454 return true;
15455
15456 in0 = d->op0;
15457 in1 = d->op1;
15458 /* We don't need a big-endian lane correction for SVE; see the comment
15459 at the head of aarch64-sve.md for details. */
15460 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15461 {
15462 x = in0, in0 = in1, in1 = x;
15463 high = !high;
15464 }
15465 out = d->target;
15466
15467 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15468 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
15469 return true;
15470 }
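
/* For example, on V4SImode the permutation { 0, 4, 1, 5 } is matched above as
   ZIP1 and { 2, 6, 3, 7 } as ZIP2.  */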
15471
15472 /* Recognize patterns for the EXT insn. */
15473
15474 static bool
15475 aarch64_evpc_ext (struct expand_vec_perm_d *d)
15476 {
15477 HOST_WIDE_INT location;
15478 rtx offset;
15479
15480 /* The first element always refers to the first vector.
15481 Check if the extracted indices are increasing by one. */
15482 if (d->vec_flags == VEC_SVE_PRED
15483 || !d->perm[0].is_constant (&location)
15484 || !d->perm.series_p (0, 1, location, 1))
15485 return false;
15486
15487 /* Success! */
15488 if (d->testing_p)
15489 return true;
15490
15491 /* The case where (location == 0) is a no-op for both big- and little-endian,
15492 and is removed by the mid-end at optimization levels -O1 and higher.
15493
15494 We don't need a big-endian lane correction for SVE; see the comment
15495 at the head of aarch64-sve.md for details. */
15496 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
15497 {
15498 /* After setup, we want the high elements of the first vector (stored
15499 at the LSB end of the register), and the low elements of the second
15500 vector (stored at the MSB end of the register). So swap. */
15501 std::swap (d->op0, d->op1);
15502 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15503 to_constant () is safe since this is restricted to Advanced SIMD
15504 vectors. */
15505 location = d->perm.length ().to_constant () - location;
15506 }
15507
15508 offset = GEN_INT (location);
15509 emit_set_insn (d->target,
15510 gen_rtx_UNSPEC (d->vmode,
15511 gen_rtvec (3, d->op0, d->op1, offset),
15512 UNSPEC_EXT));
15513 return true;
15514 }
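
/* For example, on V4SImode the permutation { 1, 2, 3, 4 } selects elements
   1-3 of the first vector followed by element 0 of the second, and is
   matched above as an EXT with an element offset of 1.  */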
15515
15516 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15517 within each 64-bit, 32-bit or 16-bit granule. */
15518
15519 static bool
15520 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
15521 {
15522 HOST_WIDE_INT diff;
15523 unsigned int i, size, unspec;
15524 machine_mode pred_mode;
15525
15526 if (d->vec_flags == VEC_SVE_PRED
15527 || !d->one_vector_p
15528 || !d->perm[0].is_constant (&diff))
15529 return false;
15530
15531 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
15532 if (size == 8)
15533 {
15534 unspec = UNSPEC_REV64;
15535 pred_mode = VNx2BImode;
15536 }
15537 else if (size == 4)
15538 {
15539 unspec = UNSPEC_REV32;
15540 pred_mode = VNx4BImode;
15541 }
15542 else if (size == 2)
15543 {
15544 unspec = UNSPEC_REV16;
15545 pred_mode = VNx8BImode;
15546 }
15547 else
15548 return false;
15549
15550 unsigned int step = diff + 1;
15551 for (i = 0; i < step; ++i)
15552 if (!d->perm.series_p (i, step, diff - i, step))
15553 return false;
15554
15555 /* Success! */
15556 if (d->testing_p)
15557 return true;
15558
15559 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
15560 if (d->vec_flags == VEC_SVE_DATA)
15561 {
15562 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15563 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
15564 UNSPEC_MERGE_PTRUE);
15565 }
15566 emit_set_insn (d->target, src);
15567 return true;
15568 }
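
/* For example, on V8HImode the permutation { 3, 2, 1, 0, 7, 6, 5, 4 }
   reverses the elements within each 64-bit granule and is matched above
   as REV64.  */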
15569
15570 /* Recognize patterns for the REV insn, which reverses elements within
15571 a full vector. */
15572
15573 static bool
15574 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
15575 {
15576 poly_uint64 nelt = d->perm.length ();
15577
15578 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
15579 return false;
15580
15581 if (!d->perm.series_p (0, 1, nelt - 1, -1))
15582 return false;
15583
15584 /* Success! */
15585 if (d->testing_p)
15586 return true;
15587
15588 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
15589 emit_set_insn (d->target, src);
15590 return true;
15591 }
15592
15593 static bool
15594 aarch64_evpc_dup (struct expand_vec_perm_d *d)
15595 {
15596 rtx out = d->target;
15597 rtx in0;
15598 HOST_WIDE_INT elt;
15599 machine_mode vmode = d->vmode;
15600 rtx lane;
15601
15602 if (d->vec_flags == VEC_SVE_PRED
15603 || d->perm.encoding ().encoded_nelts () != 1
15604 || !d->perm[0].is_constant (&elt))
15605 return false;
15606
15607   if (d->vec_flags == VEC_SVE_DATA && elt * GET_MODE_UNIT_SIZE (vmode) >= 64)
15608 return false;
15609
15610 /* Success! */
15611 if (d->testing_p)
15612 return true;
15613
15614 /* The generic preparation in aarch64_expand_vec_perm_const_1
15615 swaps the operand order and the permute indices if it finds
15616 d->perm[0] to be in the second operand. Thus, we can always
15617 use d->op0 and need not do any extra arithmetic to get the
15618 correct lane number. */
15619 in0 = d->op0;
15620 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
15621
15622 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
15623 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
15624 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
15625 return true;
15626 }
15627
15628 static bool
15629 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
15630 {
15631 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
15632 machine_mode vmode = d->vmode;
15633
15634 /* Make sure that the indices are constant. */
15635 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
15636 for (unsigned int i = 0; i < encoded_nelts; ++i)
15637 if (!d->perm[i].is_constant ())
15638 return false;
15639
15640 if (d->testing_p)
15641 return true;
15642
15643   /* Generic code will try constant permutation twice: once with the
15644      original mode and again with the elements lowered to QImode.
15645 So wait and don't do the selector expansion ourselves. */
15646 if (vmode != V8QImode && vmode != V16QImode)
15647 return false;
15648
15649 /* to_constant is safe since this routine is specific to Advanced SIMD
15650 vectors. */
15651 unsigned int nelt = d->perm.length ().to_constant ();
15652 for (unsigned int i = 0; i < nelt; ++i)
15653     /* If big-endian and two vectors, we end up with a weird mixed-endian
15654 mode on NEON. Reverse the index within each word but not the word
15655 itself. to_constant is safe because we checked is_constant above. */
15656 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
15657 ? d->perm[i].to_constant () ^ (nelt - 1)
15658 : d->perm[i].to_constant ());
15659
15660 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
15661 sel = force_reg (vmode, sel);
15662
15663 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
15664 return true;
15665 }
15666
15667 /* Try to implement D using an SVE TBL instruction. */
15668
15669 static bool
15670 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
15671 {
15672 unsigned HOST_WIDE_INT nelt;
15673
15674 /* Permuting two variable-length vectors could overflow the
15675 index range. */
15676 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
15677 return false;
15678
15679 if (d->testing_p)
15680 return true;
15681
15682 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
15683 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
15684 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
15685 return true;
15686 }
15687
15688 static bool
15689 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
15690 {
15691 /* The pattern matching functions above are written to look for a small
15692 number to begin the sequence (0, 1, N/2). If we begin with an index
15693 from the second operand, we can swap the operands. */
15694 poly_int64 nelt = d->perm.length ();
15695 if (known_ge (d->perm[0], nelt))
15696 {
15697 d->perm.rotate_inputs (1);
15698 std::swap (d->op0, d->op1);
15699 }
15700
15701 if ((d->vec_flags == VEC_ADVSIMD
15702 || d->vec_flags == VEC_SVE_DATA
15703 || d->vec_flags == VEC_SVE_PRED)
15704 && known_gt (nelt, 1))
15705 {
15706 if (aarch64_evpc_rev_local (d))
15707 return true;
15708 else if (aarch64_evpc_rev_global (d))
15709 return true;
15710 else if (aarch64_evpc_ext (d))
15711 return true;
15712 else if (aarch64_evpc_dup (d))
15713 return true;
15714 else if (aarch64_evpc_zip (d))
15715 return true;
15716 else if (aarch64_evpc_uzp (d))
15717 return true;
15718 else if (aarch64_evpc_trn (d))
15719 return true;
15720 if (d->vec_flags == VEC_SVE_DATA)
15721 return aarch64_evpc_sve_tbl (d);
15722       else if (d->vec_flags == VEC_ADVSIMD)
15723 return aarch64_evpc_tbl (d);
15724 }
15725 return false;
15726 }
15727
15728 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
15729
15730 static bool
15731 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
15732 rtx op1, const vec_perm_indices &sel)
15733 {
15734 struct expand_vec_perm_d d;
15735
15736 /* Check whether the mask can be applied to a single vector. */
15737 if (op0 && rtx_equal_p (op0, op1))
15738 d.one_vector_p = true;
15739 else if (sel.all_from_input_p (0))
15740 {
15741 d.one_vector_p = true;
15742 op1 = op0;
15743 }
15744 else if (sel.all_from_input_p (1))
15745 {
15746 d.one_vector_p = true;
15747 op0 = op1;
15748 }
15749 else
15750 d.one_vector_p = false;
15751
15752 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
15753 sel.nelts_per_input ());
15754 d.vmode = vmode;
15755 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
15756 d.target = target;
15757 d.op0 = op0;
15758 d.op1 = op1;
15759 d.testing_p = !target;
15760
15761 if (!d.testing_p)
15762 return aarch64_expand_vec_perm_const_1 (&d);
15763
15764 rtx_insn *last = get_last_insn ();
15765 bool ret = aarch64_expand_vec_perm_const_1 (&d);
15766 gcc_assert (last == get_last_insn ());
15767
15768 return ret;
15769 }
15770
15771 /* Generate a byte permute mask for a register of mode MODE,
15772 which has NUNITS units. */
15773
15774 rtx
15775 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
15776 {
15777   /* We have to reverse each vector because we don't have
15778      a permuted load that can reverse-load according to ABI rules.  */
15779 rtx mask;
15780 rtvec v = rtvec_alloc (16);
15781 unsigned int i, j;
15782 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
15783
15784 gcc_assert (BYTES_BIG_ENDIAN);
15785 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
15786
15787 for (i = 0; i < nunits; i++)
15788 for (j = 0; j < usize; j++)
15789 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
15790 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
15791 return force_reg (V16QImode, mask);
15792 }
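
/* For example, for V8HImode (NUNITS == 8, unit size 2) the mask built above
   is the byte sequence { 1, 0, 3, 2, 5, 4, ..., 15, 14 }, which swaps the
   two bytes of each 16-bit element.  */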
15793
15794 /* Return true if X is a valid second operand for the SVE instruction
15795 that implements integer comparison OP_CODE. */
15796
15797 static bool
15798 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
15799 {
15800 if (register_operand (x, VOIDmode))
15801 return true;
15802
15803 switch (op_code)
15804 {
15805 case LTU:
15806 case LEU:
15807 case GEU:
15808 case GTU:
15809 return aarch64_sve_cmp_immediate_p (x, false);
15810 case LT:
15811 case LE:
15812 case GE:
15813 case GT:
15814 case NE:
15815 case EQ:
15816 return aarch64_sve_cmp_immediate_p (x, true);
15817 default:
15818 gcc_unreachable ();
15819 }
15820 }
15821
15822 /* Use predicated SVE instructions to implement the equivalent of:
15823
15824 (set TARGET OP)
15825
15826 given that PTRUE is an all-true predicate of the appropriate mode. */
15827
15828 static void
15829 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
15830 {
15831 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15832 gen_rtvec (2, ptrue, op),
15833 UNSPEC_MERGE_PTRUE);
15834 rtx_insn *insn = emit_set_insn (target, unspec);
15835 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15836 }
15837
15838 /* Likewise, but also clobber the condition codes. */
15839
15840 static void
15841 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
15842 {
15843 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15844 gen_rtvec (2, ptrue, op),
15845 UNSPEC_MERGE_PTRUE);
15846 rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
15847 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15848 }
15849
15850 /* Return the UNSPEC_COND_* code for comparison CODE. */
15851
15852 static unsigned int
15853 aarch64_unspec_cond_code (rtx_code code)
15854 {
15855 switch (code)
15856 {
15857 case NE:
15858 return UNSPEC_COND_NE;
15859 case EQ:
15860 return UNSPEC_COND_EQ;
15861 case LT:
15862 return UNSPEC_COND_LT;
15863 case GT:
15864 return UNSPEC_COND_GT;
15865 case LE:
15866 return UNSPEC_COND_LE;
15867 case GE:
15868 return UNSPEC_COND_GE;
15869 default:
15870 gcc_unreachable ();
15871 }
15872 }
15873
15874 /* Emit:
15875
15876 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
15877
15878 where <X> is the operation associated with comparison CODE. This form
15879 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
15880 semantics, such as when PRED might not be all-true and when comparing
15881 inactive lanes could have side effects. */
15882
15883 static void
15884 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
15885 rtx pred, rtx op0, rtx op1)
15886 {
15887 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
15888 gen_rtvec (3, pred, op0, op1),
15889 aarch64_unspec_cond_code (code));
15890 emit_set_insn (target, unspec);
15891 }
15892
15893 /* Expand an SVE integer comparison using the SVE equivalent of:
15894
15895 (set TARGET (CODE OP0 OP1)). */
15896
15897 void
15898 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
15899 {
15900 machine_mode pred_mode = GET_MODE (target);
15901 machine_mode data_mode = GET_MODE (op0);
15902
15903 if (!aarch64_sve_cmp_operand_p (code, op1))
15904 op1 = force_reg (data_mode, op1);
15905
15906 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15907 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15908 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
15909 }
15910
15911 /* Emit the SVE equivalent of:
15912
15913 (set TMP1 (CODE1 OP0 OP1))
15914 (set TMP2 (CODE2 OP0 OP1))
15915 (set TARGET (ior:PRED_MODE TMP1 TMP2))
15916
15917 PTRUE is an all-true predicate with the same mode as TARGET. */
15918
15919 static void
15920 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
15921 rtx ptrue, rtx op0, rtx op1)
15922 {
15923 machine_mode pred_mode = GET_MODE (ptrue);
15924 rtx tmp1 = gen_reg_rtx (pred_mode);
15925 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
15926 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
15927 rtx tmp2 = gen_reg_rtx (pred_mode);
15928 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
15929 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
15930 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
15931 }
15932
15933 /* Emit the SVE equivalent of:
15934
15935 (set TMP (CODE OP0 OP1))
15936 (set TARGET (not TMP))
15937
15938 PTRUE is an all-true predicate with the same mode as TARGET. */
15939
15940 static void
15941 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
15942 rtx op0, rtx op1)
15943 {
15944 machine_mode pred_mode = GET_MODE (ptrue);
15945 rtx tmp = gen_reg_rtx (pred_mode);
15946 aarch64_emit_sve_ptrue_op (tmp, ptrue,
15947 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
15948 aarch64_emit_unop (target, one_cmpl_optab, tmp);
15949 }
15950
15951 /* Expand an SVE floating-point comparison using the SVE equivalent of:
15952
15953 (set TARGET (CODE OP0 OP1))
15954
15955 If CAN_INVERT_P is true, the caller can also handle inverted results;
15956 return true if the result is in fact inverted. */
15957
15958 bool
15959 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
15960 rtx op0, rtx op1, bool can_invert_p)
15961 {
15962 machine_mode pred_mode = GET_MODE (target);
15963 machine_mode data_mode = GET_MODE (op0);
15964
15965 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15966 switch (code)
15967 {
15968 case UNORDERED:
15969 /* UNORDERED has no immediate form. */
15970 op1 = force_reg (data_mode, op1);
15971 /* fall through */
15972 case LT:
15973 case LE:
15974 case GT:
15975 case GE:
15976 case EQ:
15977 case NE:
15978 {
15979 /* There is native support for the comparison. */
15980 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15981 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
15982 return false;
15983 }
15984
15985 case LTGT:
15986 /* This is a trapping operation (LT or GT). */
15987 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
15988 return false;
15989
15990 case UNEQ:
15991 if (!flag_trapping_math)
15992 {
15993 /* This would trap for signaling NaNs. */
15994 op1 = force_reg (data_mode, op1);
15995 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
15996 return false;
15997 }
15998 /* fall through */
15999 case UNLT:
16000 case UNLE:
16001 case UNGT:
16002 case UNGE:
16003 if (flag_trapping_math)
16004 {
16005 /* Work out which elements are ordered. */
16006 rtx ordered = gen_reg_rtx (pred_mode);
16007 op1 = force_reg (data_mode, op1);
16008 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
16009
16010 /* Test the opposite condition for the ordered elements,
16011 then invert the result. */
16012 if (code == UNEQ)
16013 code = NE;
16014 else
16015 code = reverse_condition_maybe_unordered (code);
16016 if (can_invert_p)
16017 {
16018 aarch64_emit_sve_predicated_cond (target, code,
16019 ordered, op0, op1);
16020 return true;
16021 }
16022 rtx tmp = gen_reg_rtx (pred_mode);
16023 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
16024 aarch64_emit_unop (target, one_cmpl_optab, tmp);
16025 return false;
16026 }
16027 break;
16028
16029 case ORDERED:
16030 /* ORDERED has no immediate form. */
16031 op1 = force_reg (data_mode, op1);
16032 break;
16033
16034 default:
16035 gcc_unreachable ();
16036 }
16037
16038 /* There is native support for the inverse comparison. */
16039 code = reverse_condition_maybe_unordered (code);
16040 if (can_invert_p)
16041 {
16042 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16043 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16044 return true;
16045 }
16046 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
16047 return false;
16048 }
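
/* For example, with -ftrapping-math an UNGE comparison is expanded above by
   first computing the predicate of ordered lanes (the inverse of UNORDERED)
   and then testing the reversed condition LT on just those lanes; the caller
   either accepts the inverted result or it is inverted here with NOT.  */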
16049
16050 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
16051 of the data being selected and CMP_MODE is the mode of the values being
16052 compared. */
16053
16054 void
16055 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
16056 rtx *ops)
16057 {
16058 machine_mode pred_mode
16059 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
16060 GET_MODE_SIZE (cmp_mode)).require ();
16061 rtx pred = gen_reg_rtx (pred_mode);
16062 if (FLOAT_MODE_P (cmp_mode))
16063 {
16064 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
16065 ops[4], ops[5], true))
16066 std::swap (ops[1], ops[2]);
16067 }
16068 else
16069 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
16070
16071 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
16072 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
16073 }
16074
16075 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
16076    true.  However, due to issues with register allocation it is preferable
16077    to avoid tying integer scalar and FP scalar modes.  Executing integer
16078 operations in general registers is better than treating them as scalar
16079 vector operations. This reduces latency and avoids redundant int<->FP
16080 moves. So tie modes if they are either the same class, or vector modes
16081 with other vector modes, vector structs or any scalar mode. */
16082
16083 static bool
16084 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
16085 {
16086 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
16087 return true;
16088
16089 /* We specifically want to allow elements of "structure" modes to
16090 be tieable to the structure. This more general condition allows
16091 other rarer situations too. The reason we don't extend this to
16092 predicate modes is that there are no predicate structure modes
16093 nor any specific instructions for extracting part of a predicate
16094 register. */
16095 if (aarch64_vector_data_mode_p (mode1)
16096 && aarch64_vector_data_mode_p (mode2))
16097 return true;
16098
16099 /* Also allow any scalar modes with vectors. */
16100 if (aarch64_vector_mode_supported_p (mode1)
16101 || aarch64_vector_mode_supported_p (mode2))
16102 return true;
16103
16104 return false;
16105 }
16106
16107 /* Return a new RTX holding the result of moving POINTER forward by
16108 AMOUNT bytes. */
16109
16110 static rtx
16111 aarch64_move_pointer (rtx pointer, poly_int64 amount)
16112 {
16113 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
16114
16115 return adjust_automodify_address (pointer, GET_MODE (pointer),
16116 next, amount);
16117 }
16118
16119 /* Return a new RTX holding the result of moving POINTER forward by the
16120 size of the mode it points to. */
16121
16122 static rtx
16123 aarch64_progress_pointer (rtx pointer)
16124 {
16125 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
16126 }
16127
16128 /* Copy one MODE-sized block from SRC to DST, then progress SRC and DST by
16129 MODE bytes. */
16130
16131 static void
16132 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
16133 machine_mode mode)
16134 {
16135 rtx reg = gen_reg_rtx (mode);
16136
16137 /* "Cast" the pointers to the correct mode. */
16138 *src = adjust_address (*src, mode, 0);
16139 *dst = adjust_address (*dst, mode, 0);
16140 /* Emit the memcpy. */
16141 emit_move_insn (reg, *src);
16142 emit_move_insn (*dst, reg);
16143 /* Move the pointers forward. */
16144 *src = aarch64_progress_pointer (*src);
16145 *dst = aarch64_progress_pointer (*dst);
16146 }
16147
16148 /* Expand movmem, as if from a __builtin_memcpy. Return true if
16149 we succeed, otherwise return false. */
16150
16151 bool
16152 aarch64_expand_movmem (rtx *operands)
16153 {
16154 int n, mode_bits;
16155 rtx dst = operands[0];
16156 rtx src = operands[1];
16157 rtx base;
16158 machine_mode cur_mode = BLKmode, next_mode;
16159 bool speed_p = !optimize_function_for_size_p (cfun);
16160
16161   /* When optimizing for size, give a better estimate of the length of a
16162      memcpy call, but use the default otherwise.  Moves larger than 8 bytes
16163      will always require an even number of instructions.  Each
16164      operation requires both a load and a store, so divide the max number by 2.  */
16165 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
16166
16167 /* We can't do anything smart if the amount to copy is not constant. */
16168 if (!CONST_INT_P (operands[2]))
16169 return false;
16170
16171 n = INTVAL (operands[2]);
16172
16173 /* Try to keep the number of instructions low. For all cases we will do at
16174 most two moves for the residual amount, since we'll always overlap the
16175 remainder. */
16176 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
16177 return false;
16178
16179 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
16180 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
16181
16182 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
16183 src = adjust_automodify_address (src, VOIDmode, base, 0);
16184
16185 /* Convert n to bits to make the rest of the code simpler. */
16186 n = n * BITS_PER_UNIT;
16187
16188 while (n > 0)
16189 {
16190       /* Find the largest mode in which to do the copy without over-reading
16191 	 or over-writing.  */
16192 opt_scalar_int_mode mode_iter;
16193 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
16194 if (GET_MODE_BITSIZE (mode_iter.require ()) <= n)
16195 cur_mode = mode_iter.require ();
16196
16197 gcc_assert (cur_mode != BLKmode);
16198
16199 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
16200 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
16201
16202 n -= mode_bits;
16203
16204       /* Do certain trailing copies as overlapping if it's going to be
16205 	 cheaper, i.e. fewer instructions to do so.  For instance, for a 15
16206 	 byte copy it's more efficient to do two overlapping 8 byte copies than
16207 	 8 + 6 + 1.  */
16208 next_mode = smallest_mode_for_size (n, MODE_INT);
16209 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
16210 if (n > 0 && n_bits > n && n_bits <= 8 * BITS_PER_UNIT)
16211 {
16212 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
16213 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
16214 n = n_bits;
16215 }
16216 }
16217
16218 return true;
16219 }
16220
16221 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16222 SImode stores. Handle the case when the constant has identical
16223 bottom and top halves. This is beneficial when the two stores can be
16224 merged into an STP and we avoid synthesising potentially expensive
16225 immediates twice. Return true if such a split is possible. */
16226
16227 bool
16228 aarch64_split_dimode_const_store (rtx dst, rtx src)
16229 {
16230 rtx lo = gen_lowpart (SImode, src);
16231 rtx hi = gen_highpart_mode (SImode, DImode, src);
16232
16233 bool size_p = optimize_function_for_size_p (cfun);
16234
16235 if (!rtx_equal_p (lo, hi))
16236 return false;
16237
16238 unsigned int orig_cost
16239 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
16240 unsigned int lo_cost
16241 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
16242
16243 /* We want to transform:
16244 MOV x1, 49370
16245 MOVK x1, 0x140, lsl 16
16246 MOVK x1, 0xc0da, lsl 32
16247 MOVK x1, 0x140, lsl 48
16248 STR x1, [x0]
16249 into:
16250 MOV w1, 49370
16251 MOVK w1, 0x140, lsl 16
16252 STP w1, w1, [x0]
16253 So we want to perform this only when we save two instructions
16254 or more. When optimizing for size, however, accept any code size
16255 savings we can. */
16256 if (size_p && orig_cost <= lo_cost)
16257 return false;
16258
16259 if (!size_p
16260 && (orig_cost <= lo_cost + 1))
16261 return false;
16262
16263 rtx mem_lo = adjust_address (dst, SImode, 0);
16264 if (!aarch64_mem_pair_operand (mem_lo, SImode))
16265 return false;
16266
16267 rtx tmp_reg = gen_reg_rtx (SImode);
16268 aarch64_expand_mov_immediate (tmp_reg, lo);
16269 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
16270 /* Don't emit an explicit store pair as this may not be always profitable.
16271 Let the sched-fusion logic decide whether to merge them. */
16272 emit_move_insn (mem_lo, tmp_reg);
16273 emit_move_insn (mem_hi, tmp_reg);
16274
16275 return true;
16276 }
16277
16278 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
16279
16280 static unsigned HOST_WIDE_INT
16281 aarch64_asan_shadow_offset (void)
16282 {
16283 return (HOST_WIDE_INT_1 << 36);
16284 }
16285
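 /* Implement TARGET_GEN_CCMP_FIRST.  Emit the compare that starts a
    conditional-compare sequence for comparison CODE of TREEOP0 and TREEOP1,
    recording the preparation and compare instructions in PREP_SEQ and
    GEN_SEQ.  Return the comparison against the CC register, or NULL_RTX if
    the comparison cannot be handled.  */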
16286 static rtx
16287 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
16288 int code, tree treeop0, tree treeop1)
16289 {
16290 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16291 rtx op0, op1;
16292 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16293 insn_code icode;
16294 struct expand_operand ops[4];
16295
16296 start_sequence ();
16297 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16298
16299 op_mode = GET_MODE (op0);
16300 if (op_mode == VOIDmode)
16301 op_mode = GET_MODE (op1);
16302
16303 switch (op_mode)
16304 {
16305 case E_QImode:
16306 case E_HImode:
16307 case E_SImode:
16308 cmp_mode = SImode;
16309 icode = CODE_FOR_cmpsi;
16310 break;
16311
16312 case E_DImode:
16313 cmp_mode = DImode;
16314 icode = CODE_FOR_cmpdi;
16315 break;
16316
16317 case E_SFmode:
16318 cmp_mode = SFmode;
16319 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16320 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
16321 break;
16322
16323 case E_DFmode:
16324 cmp_mode = DFmode;
16325 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16326 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
16327 break;
16328
16329 default:
16330 end_sequence ();
16331 return NULL_RTX;
16332 }
16333
16334 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
16335 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
16336 if (!op0 || !op1)
16337 {
16338 end_sequence ();
16339 return NULL_RTX;
16340 }
16341 *prep_seq = get_insns ();
16342 end_sequence ();
16343
16344 create_fixed_operand (&ops[0], op0);
16345 create_fixed_operand (&ops[1], op1);
16346
16347 start_sequence ();
16348 if (!maybe_expand_insn (icode, 2, ops))
16349 {
16350 end_sequence ();
16351 return NULL_RTX;
16352 }
16353 *gen_seq = get_insns ();
16354 end_sequence ();
16355
16356 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
16357 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
16358 }
16359
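 /* Implement TARGET_GEN_CCMP_NEXT.  Emit a conditional compare that
    continues the sequence started by PREV, combining it with comparison
    CMP_CODE of TREEOP0 and TREEOP1 under BIT_CODE (AND or IOR).  Return the
    new comparison against the CC register, or NULL_RTX if it cannot be
    handled.  */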
16360 static rtx
16361 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
16362 int cmp_code, tree treeop0, tree treeop1, int bit_code)
16363 {
16364 rtx op0, op1, target;
16365 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16366 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16367 insn_code icode;
16368 struct expand_operand ops[6];
16369 int aarch64_cond;
16370
16371 push_to_sequence (*prep_seq);
16372 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16373
16374 op_mode = GET_MODE (op0);
16375 if (op_mode == VOIDmode)
16376 op_mode = GET_MODE (op1);
16377
16378 switch (op_mode)
16379 {
16380 case E_QImode:
16381 case E_HImode:
16382 case E_SImode:
16383 cmp_mode = SImode;
16384 icode = CODE_FOR_ccmpsi;
16385 break;
16386
16387 case E_DImode:
16388 cmp_mode = DImode;
16389 icode = CODE_FOR_ccmpdi;
16390 break;
16391
16392 case E_SFmode:
16393 cmp_mode = SFmode;
16394 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16395 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
16396 break;
16397
16398 case E_DFmode:
16399 cmp_mode = DFmode;
16400 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16401 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
16402 break;
16403
16404 default:
16405 end_sequence ();
16406 return NULL_RTX;
16407 }
16408
16409 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
16410 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
16411 if (!op0 || !op1)
16412 {
16413 end_sequence ();
16414 return NULL_RTX;
16415 }
16416 *prep_seq = get_insns ();
16417 end_sequence ();
16418
16419 target = gen_rtx_REG (cc_mode, CC_REGNUM);
16420 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
16421
16422 if (bit_code != AND)
16423 {
16424 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
16425 GET_MODE (XEXP (prev, 0))),
16426 VOIDmode, XEXP (prev, 0), const0_rtx);
16427 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
16428 }
16429
16430 create_fixed_operand (&ops[0], XEXP (prev, 0));
16431 create_fixed_operand (&ops[1], target);
16432 create_fixed_operand (&ops[2], op0);
16433 create_fixed_operand (&ops[3], op1);
16434 create_fixed_operand (&ops[4], prev);
16435 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
16436
16437 push_to_sequence (*gen_seq);
16438 if (!maybe_expand_insn (icode, 6, ops))
16439 {
16440 end_sequence ();
16441 return NULL_RTX;
16442 }
16443
16444 *gen_seq = get_insns ();
16445 end_sequence ();
16446
16447 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
16448 }
16449
16450 #undef TARGET_GEN_CCMP_FIRST
16451 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16452
16453 #undef TARGET_GEN_CCMP_NEXT
16454 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16455
16456 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
16457 instruction fusion of some sort. */
16458
16459 static bool
16460 aarch64_macro_fusion_p (void)
16461 {
16462 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
16463 }
16464
16465
16466 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
16467 should be kept together during scheduling. */
16468
16469 static bool
16470 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
16471 {
16472 rtx set_dest;
16473 rtx prev_set = single_set (prev);
16474 rtx curr_set = single_set (curr);
16475 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
16476 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
16477
16478 if (!aarch64_macro_fusion_p ())
16479 return false;
16480
16481 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
16482 {
16483 /* We are trying to match:
16484 prev (mov) == (set (reg r0) (const_int imm16))
16485 curr (movk) == (set (zero_extract (reg r0)
16486 (const_int 16)
16487 (const_int 16))
16488 (const_int imm16_1)) */
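      /* In assembly terms this is roughly (illustrative registers and
	 immediates):
	     mov  w0, #imm16
	     movk w0, #imm16_1, lsl 16  */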
16489
16490 set_dest = SET_DEST (curr_set);
16491
16492 if (GET_CODE (set_dest) == ZERO_EXTRACT
16493 && CONST_INT_P (SET_SRC (curr_set))
16494 && CONST_INT_P (SET_SRC (prev_set))
16495 && CONST_INT_P (XEXP (set_dest, 2))
16496 && INTVAL (XEXP (set_dest, 2)) == 16
16497 && REG_P (XEXP (set_dest, 0))
16498 && REG_P (SET_DEST (prev_set))
16499 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
16500 {
16501 return true;
16502 }
16503 }
16504
16505 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
16506 {
16507
16508 /* We're trying to match:
16509 prev (adrp) == (set (reg r1)
16510 (high (symbol_ref ("SYM"))))
16511 curr (add) == (set (reg r0)
16512 (lo_sum (reg r1)
16513 (symbol_ref ("SYM"))))
16514 Note that r0 need not necessarily be the same as r1, especially
16515 during pre-regalloc scheduling. */
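      /* In assembly terms, an illustrative pairing is:
	     adrp x1, SYM
	     add  x0, x1, :lo12:SYM  */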
16516
16517 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16518 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16519 {
16520 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
16521 && REG_P (XEXP (SET_SRC (curr_set), 0))
16522 && REGNO (XEXP (SET_SRC (curr_set), 0))
16523 == REGNO (SET_DEST (prev_set))
16524 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
16525 XEXP (SET_SRC (curr_set), 1)))
16526 return true;
16527 }
16528 }
16529
16530 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
16531 {
16532
16533 /* We're trying to match:
16534 prev (movk) == (set (zero_extract (reg r0)
16535 (const_int 16)
16536 (const_int 32))
16537 (const_int imm16_1))
16538 curr (movk) == (set (zero_extract (reg r0)
16539 (const_int 16)
16540 (const_int 48))
16541 (const_int imm16_2)) */
16542
16543 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
16544 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
16545 && REG_P (XEXP (SET_DEST (prev_set), 0))
16546 && REG_P (XEXP (SET_DEST (curr_set), 0))
16547 && REGNO (XEXP (SET_DEST (prev_set), 0))
16548 == REGNO (XEXP (SET_DEST (curr_set), 0))
16549 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
16550 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
16551 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
16552 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
16553 && CONST_INT_P (SET_SRC (prev_set))
16554 && CONST_INT_P (SET_SRC (curr_set)))
16555 return true;
16556
16557 }
16558 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
16559 {
16560 /* We're trying to match:
16561 prev (adrp) == (set (reg r0)
16562 (high (symbol_ref ("SYM"))))
16563 curr (ldr) == (set (reg r1)
16564 (mem (lo_sum (reg r0)
16565 (symbol_ref ("SYM")))))
16566 or
16567 curr (ldr) == (set (reg r1)
16568 (zero_extend (mem
16569 (lo_sum (reg r0)
16570 (symbol_ref ("SYM")))))) */
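      /* In assembly terms, an illustrative pairing is:
	     adrp x0, SYM
	     ldr  x1, [x0, :lo12:SYM]  */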
16571 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16572 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16573 {
16574 rtx curr_src = SET_SRC (curr_set);
16575
16576 if (GET_CODE (curr_src) == ZERO_EXTEND)
16577 curr_src = XEXP (curr_src, 0);
16578
16579 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
16580 && REG_P (XEXP (XEXP (curr_src, 0), 0))
16581 && REGNO (XEXP (XEXP (curr_src, 0), 0))
16582 == REGNO (SET_DEST (prev_set))
16583 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
16584 XEXP (SET_SRC (prev_set), 0)))
16585 return true;
16586 }
16587 }
16588
16589 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
16590 && aarch_crypto_can_dual_issue (prev, curr))
16591 return true;
16592
16593 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
16594 && any_condjump_p (curr))
16595 {
16596 enum attr_type prev_type = get_attr_type (prev);
16597
16598 unsigned int condreg1, condreg2;
16599 rtx cc_reg_1;
16600 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
16601 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
16602
16603 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
16604 && prev
16605 && modified_in_p (cc_reg_1, prev))
16606 {
16607 /* FIXME: this misses some instructions that are considered simple
16608 arithmetic for ThunderX.  Simple shifts are missed here. */
16609 if (prev_type == TYPE_ALUS_SREG
16610 || prev_type == TYPE_ALUS_IMM
16611 || prev_type == TYPE_LOGICS_REG
16612 || prev_type == TYPE_LOGICS_IMM)
16613 return true;
16614 }
16615 }
16616
16617 if (prev_set
16618 && curr_set
16619 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
16620 && any_condjump_p (curr))
16621 {
16622 /* We're trying to match:
16623 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16624 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
16625 (const_int 0))
16626 (label_ref ("SYM"))
16627 (pc)) */
16628 if (SET_DEST (curr_set) == (pc_rtx)
16629 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
16630 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
16631 && REG_P (SET_DEST (prev_set))
16632 && REGNO (SET_DEST (prev_set))
16633 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
16634 {
16635 /* Fuse ALU operations followed by conditional branch instruction. */
16636 switch (get_attr_type (prev))
16637 {
16638 case TYPE_ALU_IMM:
16639 case TYPE_ALU_SREG:
16640 case TYPE_ADC_REG:
16641 case TYPE_ADC_IMM:
16642 case TYPE_ADCS_REG:
16643 case TYPE_ADCS_IMM:
16644 case TYPE_LOGIC_REG:
16645 case TYPE_LOGIC_IMM:
16646 case TYPE_CSEL:
16647 case TYPE_ADR:
16648 case TYPE_MOV_IMM:
16649 case TYPE_SHIFT_REG:
16650 case TYPE_SHIFT_IMM:
16651 case TYPE_BFM:
16652 case TYPE_RBIT:
16653 case TYPE_REV:
16654 case TYPE_EXTEND:
16655 return true;
16656
16657 default:;
16658 }
16659 }
16660 }
16661
16662 return false;
16663 }
16664
16665 /* Return true iff the instruction fusion described by OP is enabled. */
16666
16667 bool
16668 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
16669 {
16670 return (aarch64_tune_params.fusible_ops & op) != 0;
16671 }
16672
16673 /* If MEM is in the form of [base+offset], extract the two parts
16674 of address and set to BASE and OFFSET, otherwise return false
16675 after clearing BASE and OFFSET. */
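 /* For example, (mem (plus (reg x1) (const_int 16))) yields BASE x1 and
    OFFSET 16, while a plain (mem (reg x1)) yields BASE x1 and OFFSET 0.  */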
16676
16677 bool
16678 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
16679 {
16680 rtx addr;
16681
16682 gcc_assert (MEM_P (mem));
16683
16684 addr = XEXP (mem, 0);
16685
16686 if (REG_P (addr))
16687 {
16688 *base = addr;
16689 *offset = const0_rtx;
16690 return true;
16691 }
16692
16693 if (GET_CODE (addr) == PLUS
16694 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
16695 {
16696 *base = XEXP (addr, 0);
16697 *offset = XEXP (addr, 1);
16698 return true;
16699 }
16700
16701 *base = NULL_RTX;
16702 *offset = NULL_RTX;
16703
16704 return false;
16705 }
16706
16707 /* Types for scheduling fusion. */
16708 enum sched_fusion_type
16709 {
16710 SCHED_FUSION_NONE = 0,
16711 SCHED_FUSION_LD_SIGN_EXTEND,
16712 SCHED_FUSION_LD_ZERO_EXTEND,
16713 SCHED_FUSION_LD,
16714 SCHED_FUSION_ST,
16715 SCHED_FUSION_NUM
16716 };
16717
16718 /* If INSN is a load or store of address in the form of [base+offset],
16719 extract the two parts and set to BASE and OFFSET. Return scheduling
16720 fusion type this INSN is. */
16721
16722 static enum sched_fusion_type
16723 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
16724 {
16725 rtx x, dest, src;
16726 enum sched_fusion_type fusion = SCHED_FUSION_LD;
16727
16728 gcc_assert (INSN_P (insn));
16729 x = PATTERN (insn);
16730 if (GET_CODE (x) != SET)
16731 return SCHED_FUSION_NONE;
16732
16733 src = SET_SRC (x);
16734 dest = SET_DEST (x);
16735
16736 machine_mode dest_mode = GET_MODE (dest);
16737
16738 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
16739 return SCHED_FUSION_NONE;
16740
16741 if (GET_CODE (src) == SIGN_EXTEND)
16742 {
16743 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
16744 src = XEXP (src, 0);
16745 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16746 return SCHED_FUSION_NONE;
16747 }
16748 else if (GET_CODE (src) == ZERO_EXTEND)
16749 {
16750 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
16751 src = XEXP (src, 0);
16752 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16753 return SCHED_FUSION_NONE;
16754 }
16755
16756 if (GET_CODE (src) == MEM && REG_P (dest))
16757 extract_base_offset_in_addr (src, base, offset);
16758 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
16759 {
16760 fusion = SCHED_FUSION_ST;
16761 extract_base_offset_in_addr (dest, base, offset);
16762 }
16763 else
16764 return SCHED_FUSION_NONE;
16765
16766 if (*base == NULL_RTX || *offset == NULL_RTX)
16767 fusion = SCHED_FUSION_NONE;
16768
16769 return fusion;
16770 }
16771
16772 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16773
16774 Currently we only support fusing ldr and str instructions, so FUSION_PRI
16775 and PRI are only calculated for these instructions.  For other instructions,
16776 FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
16777 types of instruction fusion can be added by returning different priorities.
16778
16779 It's important that irrelevant instructions get the largest FUSION_PRI. */
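/* For example (illustrative operands), two stores str w0, [x1, 8] and
   str w2, [x1, 16] receive the same FUSION_PRI (same fusion type and base
   register), while their PRI values differ so that the store with the
   smaller offset comes first.  */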
16780
16781 static void
16782 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
16783 int *fusion_pri, int *pri)
16784 {
16785 int tmp, off_val;
16786 rtx base, offset;
16787 enum sched_fusion_type fusion;
16788
16789 gcc_assert (INSN_P (insn));
16790
16791 tmp = max_pri - 1;
16792 fusion = fusion_load_store (insn, &base, &offset);
16793 if (fusion == SCHED_FUSION_NONE)
16794 {
16795 *pri = tmp;
16796 *fusion_pri = tmp;
16797 return;
16798 }
16799
16800 /* Set FUSION_PRI according to fusion type and base register. */
16801 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
16802
16803 /* Calculate PRI. */
16804 tmp /= 2;
16805
16806 /* INSN with smaller offset goes first. */
16807 off_val = (int)(INTVAL (offset));
16808 if (off_val >= 0)
16809 tmp -= (off_val & 0xfffff);
16810 else
16811 tmp += ((- off_val) & 0xfffff);
16812
16813 *pri = tmp;
16814 return;
16815 }
16816
16817 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16818 Adjust priority of sha1h instructions so they are scheduled before
16819 other SHA1 instructions. */
16820
16821 static int
16822 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
16823 {
16824 rtx x = PATTERN (insn);
16825
16826 if (GET_CODE (x) == SET)
16827 {
16828 x = SET_SRC (x);
16829
16830 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
16831 return priority + 10;
16832 }
16833
16834 return priority;
16835 }
16836
16837 /* Given OPERANDS of consecutive load/store, check if we can merge
16838 them into ldp/stp. LOAD is true if they are load instructions.
16839 MODE is the mode of memory operands. */
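/* For example (illustrative only), the consecutive loads
       ldr w0, [x2]
       ldr w1, [x2, 4]
   can be merged into
       ldp w0, w1, [x2]
   provided the checks below succeed.  */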
16840
16841 bool
16842 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
16843 machine_mode mode)
16844 {
16845 HOST_WIDE_INT offval_1, offval_2, msize;
16846 enum reg_class rclass_1, rclass_2;
16847 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
16848
16849 if (load)
16850 {
16851 mem_1 = operands[1];
16852 mem_2 = operands[3];
16853 reg_1 = operands[0];
16854 reg_2 = operands[2];
16855 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
16856 if (REGNO (reg_1) == REGNO (reg_2))
16857 return false;
16858 }
16859 else
16860 {
16861 mem_1 = operands[0];
16862 mem_2 = operands[2];
16863 reg_1 = operands[1];
16864 reg_2 = operands[3];
16865 }
16866
16867 /* The mems cannot be volatile. */
16868 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
16869 return false;
16870
16871 /* If we have SImode and slow unaligned ldp,
16872 check that the alignment is at least 8 bytes. */
16873 if (mode == SImode
16874 && (aarch64_tune_params.extra_tuning_flags
16875 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16876 && !optimize_size
16877 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16878 return false;
16879
16880 /* Check if the addresses are in the form of [base+offset]. */
16881 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16882 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16883 return false;
16884 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16885 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16886 return false;
16887
16888 /* Check if the bases are same. */
16889 if (!rtx_equal_p (base_1, base_2))
16890 return false;
16891
16892 /* The operands must be of the same size. */
16893 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
16894 GET_MODE_SIZE (GET_MODE (mem_2))));
16895
16896 offval_1 = INTVAL (offset_1);
16897 offval_2 = INTVAL (offset_2);
16898 /* We should only be trying this for fixed-sized modes. There is no
16899 SVE LDP/STP instruction. */
16900 msize = GET_MODE_SIZE (mode).to_constant ();
16901 /* Check if the offsets are consecutive. */
16902 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
16903 return false;
16904
16905 /* Check if the addresses are clobbered by load. */
16906 if (load)
16907 {
16908 if (reg_mentioned_p (reg_1, mem_1))
16909 return false;
16910
16911 /* In increasing order, the last load can clobber the address. */
16912 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
16913 return false;
16914 }
16915
16916 /* One of the memory accesses must be a mempair operand.
16917 If it is not the first one, they need to be swapped by the
16918 peephole. */
16919 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
16920 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
16921 return false;
16922
16923 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16924 rclass_1 = FP_REGS;
16925 else
16926 rclass_1 = GENERAL_REGS;
16927
16928 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16929 rclass_2 = FP_REGS;
16930 else
16931 rclass_2 = GENERAL_REGS;
16932
16933 /* Check if the registers are of same class. */
16934 if (rclass_1 != rclass_2)
16935 return false;
16936
16937 return true;
16938 }
16939
16940 /* Given OPERANDS of consecutive load/store that can be merged,
16941 swap them if they are not in ascending order. */
16942 void
16943 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
16944 {
16945 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
16946 HOST_WIDE_INT offval_1, offval_2;
16947
16948 if (load)
16949 {
16950 mem_1 = operands[1];
16951 mem_2 = operands[3];
16952 }
16953 else
16954 {
16955 mem_1 = operands[0];
16956 mem_2 = operands[2];
16957 }
16958
16959 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16960 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16961
16962 offval_1 = INTVAL (offset_1);
16963 offval_2 = INTVAL (offset_2);
16964
16965 if (offval_1 > offval_2)
16966 {
16967 /* Irrespective of whether this is a load or a store,
16968 we do the same swap. */
16969 std::swap (operands[0], operands[2]);
16970 std::swap (operands[1], operands[3]);
16971 }
16972 }
16973
16974 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
16975 comparison between the two. */
16976 int
16977 aarch64_host_wide_int_compare (const void *x, const void *y)
16978 {
16979 return wi::cmps (* ((const HOST_WIDE_INT *) x),
16980 * ((const HOST_WIDE_INT *) y));
16981 }
16982
16983 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
16984 other pointing to a REG rtx containing an offset, compare the offsets
16985 of the two pairs.
16986
16987 Return:
16988
16989 1 iff offset (X) > offset (Y)
16990 0 iff offset (X) == offset (Y)
16991 -1 iff offset (X) < offset (Y) */
16992 int
16993 aarch64_ldrstr_offset_compare (const void *x, const void *y)
16994 {
16995 const rtx * operands_1 = (const rtx *) x;
16996 const rtx * operands_2 = (const rtx *) y;
16997 rtx mem_1, mem_2, base, offset_1, offset_2;
16998
16999 if (MEM_P (operands_1[0]))
17000 mem_1 = operands_1[0];
17001 else
17002 mem_1 = operands_1[1];
17003
17004 if (MEM_P (operands_2[0]))
17005 mem_2 = operands_2[0];
17006 else
17007 mem_2 = operands_2[1];
17008
17009 /* Extract the offsets. */
17010 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17011 extract_base_offset_in_addr (mem_2, &base, &offset_2);
17012
17013 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
17014
17015 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
17016 }
17017
17018 /* Given OPERANDS of consecutive load/store, check if we can merge
17019 them into ldp/stp by adjusting the offset. LOAD is true if they
17020 are load instructions. MODE is the mode of memory operands.
17021
17022 Given below consecutive stores:
17023
17024 str w1, [xb, 0x100]
17025 str w1, [xb, 0x104]
17026 str w1, [xb, 0x108]
17027 str w1, [xb, 0x10c]
17028
17029 Though the offsets are out of the range supported by stp, we can
17030 still pair them after adjusting the offset, like:
17031
17032 add scratch, xb, 0x100
17033 stp w1, w1, [scratch]
17034 stp w1, w1, [scratch, 0x8]
17035
17036 The peephole patterns detecting this opportunity should guarantee
17037 the scratch register is available. */
17038
17039 bool
17040 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
17041 scalar_mode mode)
17042 {
17043 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
17044 HOST_WIDE_INT offvals[4], msize;
17045 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
17046 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
17047
17048 if (load)
17049 {
17050 reg_1 = operands[0];
17051 mem_1 = operands[1];
17052 reg_2 = operands[2];
17053 mem_2 = operands[3];
17054 reg_3 = operands[4];
17055 mem_3 = operands[5];
17056 reg_4 = operands[6];
17057 mem_4 = operands[7];
17058 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
17059 && REG_P (reg_3) && REG_P (reg_4));
17060
17061 /* Do not attempt to merge the loads if the loads clobber each other. */
17062 for (int i = 0; i < 8; i += 2)
17063 for (int j = i + 2; j < 8; j += 2)
17064 if (reg_overlap_mentioned_p (operands[i], operands[j]))
17065 return false;
17066 }
17067 else
17068 {
17069 mem_1 = operands[0];
17070 reg_1 = operands[1];
17071 mem_2 = operands[2];
17072 reg_2 = operands[3];
17073 mem_3 = operands[4];
17074 reg_3 = operands[5];
17075 mem_4 = operands[6];
17076 reg_4 = operands[7];
17077 }
17078 /* Skip if the memory operand is by itself valid for ldp/stp. */
17079 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
17080 return false;
17081
17082 /* The mems cannot be volatile. */
17083 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
17084 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
17085 return false;
17086
17087 /* Check if the addresses are in the form of [base+offset]. */
17088 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17089 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
17090 return false;
17091 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17092 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
17093 return false;
17094 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
17095 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
17096 return false;
17097 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
17098 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
17099 return false;
17100
17101 /* Check if the bases are same. */
17102 if (!rtx_equal_p (base_1, base_2)
17103 || !rtx_equal_p (base_2, base_3)
17104 || !rtx_equal_p (base_3, base_4))
17105 return false;
17106
17107 offvals[0] = INTVAL (offset_1);
17108 offvals[1] = INTVAL (offset_2);
17109 offvals[2] = INTVAL (offset_3);
17110 offvals[3] = INTVAL (offset_4);
17111 msize = GET_MODE_SIZE (mode);
17112
17113 /* Check if the offsets can be put in the right order to do a ldp/stp. */
17114 qsort (offvals, 4, sizeof (HOST_WIDE_INT), aarch64_host_wide_int_compare);
17115
17116 if (!(offvals[1] == offvals[0] + msize
17117 && offvals[3] == offvals[2] + msize))
17118 return false;
17119
17120 /* Check that offsets are within range of each other. The ldp/stp
17121 instructions have 7 bit immediate offsets, so use 0x80. */
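  /* For SImode, for example, msize is 4, so the two resulting pairs must
     start within 4 * 0x80 = 512 bytes of each other.  */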
17122 if (offvals[2] - offvals[0] >= msize * 0x80)
17123 return false;
17124
17125 /* The offsets must be aligned with respect to each other. */
17126 if (offvals[0] % msize != offvals[2] % msize)
17127 return false;
17128
17129 /* Check if the addresses are clobbered by load. */
17130 if (load && (reg_mentioned_p (reg_1, mem_1)
17131 || reg_mentioned_p (reg_2, mem_2)
17132 || reg_mentioned_p (reg_3, mem_3)
17133 || reg_mentioned_p (reg_4, mem_4)))
17134 return false;
17135
17136 /* If we have SImode and slow unaligned ldp,
17137 check that the alignment is at least 8 bytes. */
17138 if (mode == SImode
17139 && (aarch64_tune_params.extra_tuning_flags
17140 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17141 && !optimize_size
17142 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
17143 return false;
17144
17145 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
17146 rclass_1 = FP_REGS;
17147 else
17148 rclass_1 = GENERAL_REGS;
17149
17150 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
17151 rclass_2 = FP_REGS;
17152 else
17153 rclass_2 = GENERAL_REGS;
17154
17155 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
17156 rclass_3 = FP_REGS;
17157 else
17158 rclass_3 = GENERAL_REGS;
17159
17160 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
17161 rclass_4 = FP_REGS;
17162 else
17163 rclass_4 = GENERAL_REGS;
17164
17165 /* Check if the registers are of same class. */
17166 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
17167 return false;
17168
17169 return true;
17170 }
17171
17172 /* Given OPERANDS of consecutive load/store, this function pairs them
17173 into LDP/STP after adjusting the offset. It depends on the fact
17174 that the operands can be sorted so the offsets are correct for STP.
17175 MODE is the mode of memory operands. CODE is the rtl operator
17176 which should be applied to all memory operands: SIGN_EXTEND,
17177 ZERO_EXTEND or UNKNOWN. */
17178
17179 bool
17180 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
17181 scalar_mode mode, RTX_CODE code)
17182 {
17183 rtx base, offset_1, offset_3, t1, t2;
17184 rtx mem_1, mem_2, mem_3, mem_4;
17185 rtx temp_operands[8];
17186 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
17187 stp_off_upper_limit, stp_off_lower_limit, msize;
17188
17189 /* We make changes on a copy as we may still bail out. */
17190 for (int i = 0; i < 8; i++)
17191 temp_operands[i] = operands[i];
17192
17193 /* Sort the operands. */
17194 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
17195
17196 if (load)
17197 {
17198 mem_1 = temp_operands[1];
17199 mem_2 = temp_operands[3];
17200 mem_3 = temp_operands[5];
17201 mem_4 = temp_operands[7];
17202 }
17203 else
17204 {
17205 mem_1 = temp_operands[0];
17206 mem_2 = temp_operands[2];
17207 mem_3 = temp_operands[4];
17208 mem_4 = temp_operands[6];
17209 gcc_assert (code == UNKNOWN);
17210 }
17211
17212 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17213 extract_base_offset_in_addr (mem_3, &base, &offset_3);
17214 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
17215 && offset_3 != NULL_RTX);
17216
17217 /* Adjust offset so it can fit in LDP/STP instruction. */
17218 msize = GET_MODE_SIZE (mode);
17219 stp_off_upper_limit = msize * (0x40 - 1);
17220 stp_off_lower_limit = - msize * 0x40;
17221
17222 off_val_1 = INTVAL (offset_1);
17223 off_val_3 = INTVAL (offset_3);
17224
17225 /* The base offset is optimally half way between the two STP/LDP offsets. */
17226 if (msize <= 4)
17227 base_off = (off_val_1 + off_val_3) / 2;
17228 else
17229 /* However, due to issues with negative LDP/STP offset generation for
17230 larger modes (DF, DI and vector modes), we must not use negative
17231 addresses smaller than 9 signed unadjusted bits can store.  This
17232 provides the most range in this case.
17233 base_off = off_val_1;
17234
17235 /* Adjust the base so that it is aligned with the addresses but still
17236 optimal. */
17237 if (base_off % msize != off_val_1 % msize)
17238 /* Fix the offset, bearing in mind we want to make it bigger not
17239 smaller. */
17240 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17241 else if (msize <= 4)
17242 /* The negative range of LDP/STP is one larger than the positive range. */
17243 base_off += msize;
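  /* Tracing the str example above (SImode, msize 4, pair offsets 0x100 and
     0x108): the midpoint is 0x104, which is already suitably aligned, so it
     is bumped by msize to 0x108, giving LDP/STP offsets of -8 and 0.  */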
17244
17245 /* Check if base offset is too big or too small. We can attempt to resolve
17246 this issue by setting it to the maximum value and seeing if the offsets
17247 still fit. */
17248 if (base_off >= 0x1000)
17249 {
17250 base_off = 0x1000 - 1;
17251 /* We must still make sure that the base offset is aligned with respect
17252 to the address.  But it may not be made any bigger. */
17253 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17254 }
17255
17256 /* Likewise for the case where the base is too small. */
17257 if (base_off <= -0x1000)
17258 {
17259 base_off = -0x1000 + 1;
17260 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17261 }
17262
17263 /* Offset of the first STP/LDP. */
17264 new_off_1 = off_val_1 - base_off;
17265
17266 /* Offset of the second STP/LDP. */
17267 new_off_3 = off_val_3 - base_off;
17268
17269 /* The offsets must be within the range of the LDP/STP instructions. */
17270 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
17271 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
17272 return false;
17273
17274 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
17275 new_off_1), true);
17276 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
17277 new_off_1 + msize), true);
17278 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
17279 new_off_3), true);
17280 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
17281 new_off_3 + msize), true);
17282
17283 if (!aarch64_mem_pair_operand (mem_1, mode)
17284 || !aarch64_mem_pair_operand (mem_3, mode))
17285 return false;
17286
17287 if (code == ZERO_EXTEND)
17288 {
17289 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
17290 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
17291 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
17292 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
17293 }
17294 else if (code == SIGN_EXTEND)
17295 {
17296 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
17297 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
17298 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
17299 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
17300 }
17301
17302 if (load)
17303 {
17304 operands[0] = temp_operands[0];
17305 operands[1] = mem_1;
17306 operands[2] = temp_operands[2];
17307 operands[3] = mem_2;
17308 operands[4] = temp_operands[4];
17309 operands[5] = mem_3;
17310 operands[6] = temp_operands[6];
17311 operands[7] = mem_4;
17312 }
17313 else
17314 {
17315 operands[0] = mem_1;
17316 operands[1] = temp_operands[1];
17317 operands[2] = mem_2;
17318 operands[3] = temp_operands[3];
17319 operands[4] = mem_3;
17320 operands[5] = temp_operands[5];
17321 operands[6] = mem_4;
17322 operands[7] = temp_operands[7];
17323 }
17324
17325 /* Emit adjusting instruction. */
17326 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
17327 /* Emit ldp/stp instructions. */
17328 t1 = gen_rtx_SET (operands[0], operands[1]);
17329 t2 = gen_rtx_SET (operands[2], operands[3]);
17330 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17331 t1 = gen_rtx_SET (operands[4], operands[5]);
17332 t2 = gen_rtx_SET (operands[6], operands[7]);
17333 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17334 return true;
17335 }
17336
17337 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
17338 it isn't worth branching around empty masked ops (including masked
17339 stores). */
17340
17341 static bool
17342 aarch64_empty_mask_is_expensive (unsigned)
17343 {
17344 return false;
17345 }
17346
17347 /* Return 1 if pseudo register should be created and used to hold
17348 GOT address for PIC code. */
17349
17350 bool
17351 aarch64_use_pseudo_pic_reg (void)
17352 {
17353 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
17354 }
17355
17356 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
17357
17358 static int
17359 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
17360 {
17361 switch (XINT (x, 1))
17362 {
17363 case UNSPEC_GOTSMALLPIC:
17364 case UNSPEC_GOTSMALLPIC28K:
17365 case UNSPEC_GOTTINYPIC:
17366 return 0;
17367 default:
17368 break;
17369 }
17370
17371 return default_unspec_may_trap_p (x, flags);
17372 }
17373
17374
17375 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
17376 return the log2 of that value. Otherwise return -1. */
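/* For example, 8.0 yields 3 and 1.0 yields 0, while -4.0, 0.5, 3.0, NaNs
   and infinities all yield -1.  */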
17377
17378 int
17379 aarch64_fpconst_pow_of_2 (rtx x)
17380 {
17381 const REAL_VALUE_TYPE *r;
17382
17383 if (!CONST_DOUBLE_P (x))
17384 return -1;
17385
17386 r = CONST_DOUBLE_REAL_VALUE (x);
17387
17388 if (REAL_VALUE_NEGATIVE (*r)
17389 || REAL_VALUE_ISNAN (*r)
17390 || REAL_VALUE_ISINF (*r)
17391 || !real_isinteger (r, DFmode))
17392 return -1;
17393
17394 return exact_log2 (real_to_integer (r));
17395 }
17396
17397 /* If X is a vector of equal CONST_DOUBLE values and that value is
17398 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
17399
17400 int
17401 aarch64_vec_fpconst_pow_of_2 (rtx x)
17402 {
17403 int nelts;
17404 if (GET_CODE (x) != CONST_VECTOR
17405 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
17406 return -1;
17407
17408 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
17409 return -1;
17410
17411 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
17412 if (firstval <= 0)
17413 return -1;
17414
17415 for (int i = 1; i < nelts; i++)
17416 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
17417 return -1;
17418
17419 return firstval;
17420 }
17421
17422 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
17423 to float.
17424
17425 __fp16 always promotes through this hook.
17426 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
17427 through the generic excess precision logic rather than here. */
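/* For example, given __fp16 a, b, the sum a + b is evaluated as
   (float) a + (float) b.  */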
17428
17429 static tree
17430 aarch64_promoted_type (const_tree t)
17431 {
17432 if (SCALAR_FLOAT_TYPE_P (t)
17433 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
17434 return float_type_node;
17435
17436 return NULL_TREE;
17437 }
17438
17439 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
17440
17441 static bool
17442 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
17443 optimization_type opt_type)
17444 {
17445 switch (op)
17446 {
17447 case rsqrt_optab:
17448 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
17449
17450 default:
17451 return true;
17452 }
17453 }
17454
17455 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
17456
17457 static unsigned int
17458 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
17459 int *offset)
17460 {
17461 /* Polynomial invariant 1 == (VG / 2) - 1. */
17462 gcc_assert (i == 1);
17463 *factor = 2;
17464 *offset = 1;
17465 return AARCH64_DWARF_VG;
17466 }
17467
17468 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
17469 if MODE is HFmode, and punt to the generic implementation otherwise. */
17470
17471 static bool
17472 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
17473 {
17474 return (mode == HFmode
17475 ? true
17476 : default_libgcc_floating_mode_supported_p (mode));
17477 }
17478
17479 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
17480 if MODE is HFmode, and punt to the generic implementation otherwise. */
17481
17482 static bool
17483 aarch64_scalar_mode_supported_p (scalar_mode mode)
17484 {
17485 return (mode == HFmode
17486 ? true
17487 : default_scalar_mode_supported_p (mode));
17488 }
17489
17490 /* Set the value of FLT_EVAL_METHOD.
17491 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
17492
17493 0: evaluate all operations and constants, whose semantic type has at
17494 most the range and precision of type float, to the range and
17495 precision of float; evaluate all other operations and constants to
17496 the range and precision of the semantic type;
17497
17498 N, where _FloatN is a supported interchange floating type
17499 evaluate all operations and constants, whose semantic type has at
17500 most the range and precision of _FloatN type, to the range and
17501 precision of the _FloatN type; evaluate all other operations and
17502 constants to the range and precision of the semantic type;
17503
17504 If we have the ARMv8.2-A extensions then we support _Float16 in native
17505 precision, so we should set this to 16. Otherwise, we support the type,
17506 but want to evaluate expressions in float precision, so set this to
17507 0. */
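/* For example, with the ARMv8.2-A half-precision instructions available
   (e.g. via an +fp16 architecture extension) a product of two _Float16
   values can be evaluated directly in _Float16; without them it is
   evaluated in float and converted back.  */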
17508
17509 static enum flt_eval_method
17510 aarch64_excess_precision (enum excess_precision_type type)
17511 {
17512 switch (type)
17513 {
17514 case EXCESS_PRECISION_TYPE_FAST:
17515 case EXCESS_PRECISION_TYPE_STANDARD:
17516 /* We can calculate either in 16-bit range and precision or
17517 32-bit range and precision. Make that decision based on whether
17518 we have native support for the ARMv8.2-A 16-bit floating-point
17519 instructions or not. */
17520 return (TARGET_FP_F16INST
17521 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
17522 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
17523 case EXCESS_PRECISION_TYPE_IMPLICIT:
17524 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
17525 default:
17526 gcc_unreachable ();
17527 }
17528 return FLT_EVAL_METHOD_UNPREDICTABLE;
17529 }
17530
17531 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
17532 scheduled for speculative execution. Reject the long-running division
17533 and square-root instructions. */
17534
17535 static bool
17536 aarch64_sched_can_speculate_insn (rtx_insn *insn)
17537 {
17538 switch (get_attr_type (insn))
17539 {
17540 case TYPE_SDIV:
17541 case TYPE_UDIV:
17542 case TYPE_FDIVS:
17543 case TYPE_FDIVD:
17544 case TYPE_FSQRTS:
17545 case TYPE_FSQRTD:
17546 case TYPE_NEON_FP_SQRT_S:
17547 case TYPE_NEON_FP_SQRT_D:
17548 case TYPE_NEON_FP_SQRT_S_Q:
17549 case TYPE_NEON_FP_SQRT_D_Q:
17550 case TYPE_NEON_FP_DIV_S:
17551 case TYPE_NEON_FP_DIV_D:
17552 case TYPE_NEON_FP_DIV_S_Q:
17553 case TYPE_NEON_FP_DIV_D_Q:
17554 return false;
17555 default:
17556 return true;
17557 }
17558 }
17559
17560 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
17561
17562 static int
17563 aarch64_compute_pressure_classes (reg_class *classes)
17564 {
17565 int i = 0;
17566 classes[i++] = GENERAL_REGS;
17567 classes[i++] = FP_REGS;
17568 /* PR_REGS isn't a useful pressure class because many predicate pseudo
17569 registers need to go in PR_LO_REGS at some point during their
17570 lifetime. Splitting it into two halves has the effect of making
17571 all predicates count against PR_LO_REGS, so that we try whenever
17572 possible to restrict the number of live predicates to 8. This
17573 greatly reduces the amount of spilling in certain loops. */
17574 classes[i++] = PR_LO_REGS;
17575 classes[i++] = PR_HI_REGS;
17576 return i;
17577 }
17578
17579 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
17580
17581 static bool
17582 aarch64_can_change_mode_class (machine_mode from,
17583 machine_mode to, reg_class_t)
17584 {
17585 if (BYTES_BIG_ENDIAN)
17586 {
17587 bool from_sve_p = aarch64_sve_data_mode_p (from);
17588 bool to_sve_p = aarch64_sve_data_mode_p (to);
17589
17590 /* Don't allow changes between SVE data modes and non-SVE modes.
17591 See the comment at the head of aarch64-sve.md for details. */
17592 if (from_sve_p != to_sve_p)
17593 return false;
17594
17595 /* Don't allow changes in element size: lane 0 of the new vector
17596 would not then be lane 0 of the old vector. See the comment
17597 above aarch64_maybe_expand_sve_subreg_move for a more detailed
17598 description.
17599
17600 In the worst case, this forces a register to be spilled in
17601 one mode and reloaded in the other, which handles the
17602 endianness correctly. */
17603 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
17604 return false;
17605 }
17606 return true;
17607 }
17608
17609 /* Implement TARGET_EARLY_REMAT_MODES. */
17610
17611 static void
17612 aarch64_select_early_remat_modes (sbitmap modes)
17613 {
17614 /* SVE values are not normally live across a call, so it should be
17615 worth doing early rematerialization even in VL-specific mode. */
17616 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
17617 {
17618 machine_mode mode = (machine_mode) i;
17619 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17620 if (vec_flags & VEC_ANY_SVE)
17621 bitmap_set_bit (modes, i);
17622 }
17623 }
17624
17625 /* Target-specific selftests. */
17626
17627 #if CHECKING_P
17628
17629 namespace selftest {
17630
17631 /* Selftest for the RTL loader.
17632 Verify that the RTL loader copes with a dump from
17633 print_rtx_function. This is essentially just a test that class
17634 function_reader can handle a real dump, but it also verifies
17635 that lookup_reg_by_dump_name correctly handles hard regs.
17636 The presence of hard reg names in the dump means that the test is
17637 target-specific, hence it is in this file. */
17638
17639 static void
17640 aarch64_test_loading_full_dump ()
17641 {
17642 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
17643
17644 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
17645
17646 rtx_insn *insn_1 = get_insn_by_uid (1);
17647 ASSERT_EQ (NOTE, GET_CODE (insn_1));
17648
17649 rtx_insn *insn_15 = get_insn_by_uid (15);
17650 ASSERT_EQ (INSN, GET_CODE (insn_15));
17651 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
17652
17653 /* Verify crtl->return_rtx. */
17654 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
17655 ASSERT_EQ (0, REGNO (crtl->return_rtx));
17656 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
17657 }
17658
17659 /* Run all target-specific selftests. */
17660
17661 static void
17662 aarch64_run_selftests (void)
17663 {
17664 aarch64_test_loading_full_dump ();
17665 }
17666
17667 } // namespace selftest
17668
17669 #endif /* #if CHECKING_P */
17670
17671 #undef TARGET_ADDRESS_COST
17672 #define TARGET_ADDRESS_COST aarch64_address_cost
17673
17674 /* This hook determines whether unnamed bitfields affect the alignment
17675 of the containing structure. The hook returns true if the structure
17676 should inherit the alignment requirements of an unnamed bitfield's
17677 type. */
17678 #undef TARGET_ALIGN_ANON_BITFIELD
17679 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
17680
17681 #undef TARGET_ASM_ALIGNED_DI_OP
17682 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
17683
17684 #undef TARGET_ASM_ALIGNED_HI_OP
17685 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
17686
17687 #undef TARGET_ASM_ALIGNED_SI_OP
17688 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
17689
17690 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
17691 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
17692 hook_bool_const_tree_hwi_hwi_const_tree_true
17693
17694 #undef TARGET_ASM_FILE_START
17695 #define TARGET_ASM_FILE_START aarch64_start_file
17696
17697 #undef TARGET_ASM_OUTPUT_MI_THUNK
17698 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
17699
17700 #undef TARGET_ASM_SELECT_RTX_SECTION
17701 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
17702
17703 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
17704 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
17705
17706 #undef TARGET_BUILD_BUILTIN_VA_LIST
17707 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
17708
17709 #undef TARGET_CALLEE_COPIES
17710 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
17711
17712 #undef TARGET_CAN_ELIMINATE
17713 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
17714
17715 #undef TARGET_CAN_INLINE_P
17716 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
17717
17718 #undef TARGET_CANNOT_FORCE_CONST_MEM
17719 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
17720
17721 #undef TARGET_CASE_VALUES_THRESHOLD
17722 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
17723
17724 #undef TARGET_CONDITIONAL_REGISTER_USAGE
17725 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
17726
17727 /* Only the least significant bit is used for initialization guard
17728 variables. */
17729 #undef TARGET_CXX_GUARD_MASK_BIT
17730 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
17731
17732 #undef TARGET_C_MODE_FOR_SUFFIX
17733 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
17734
17735 #ifdef TARGET_BIG_ENDIAN_DEFAULT
17736 #undef TARGET_DEFAULT_TARGET_FLAGS
17737 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
17738 #endif
17739
17740 #undef TARGET_CLASS_MAX_NREGS
17741 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
17742
17743 #undef TARGET_BUILTIN_DECL
17744 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
17745
17746 #undef TARGET_BUILTIN_RECIPROCAL
17747 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
17748
17749 #undef TARGET_C_EXCESS_PRECISION
17750 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
17751
17752 #undef TARGET_EXPAND_BUILTIN
17753 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
17754
17755 #undef TARGET_EXPAND_BUILTIN_VA_START
17756 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
17757
17758 #undef TARGET_FOLD_BUILTIN
17759 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
17760
17761 #undef TARGET_FUNCTION_ARG
17762 #define TARGET_FUNCTION_ARG aarch64_function_arg
17763
17764 #undef TARGET_FUNCTION_ARG_ADVANCE
17765 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
17766
17767 #undef TARGET_FUNCTION_ARG_BOUNDARY
17768 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
17769
17770 #undef TARGET_FUNCTION_ARG_PADDING
17771 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
17772
17773 #undef TARGET_GET_RAW_RESULT_MODE
17774 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
17775 #undef TARGET_GET_RAW_ARG_MODE
17776 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
17777
17778 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
17779 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
17780
17781 #undef TARGET_FUNCTION_VALUE
17782 #define TARGET_FUNCTION_VALUE aarch64_function_value
17783
17784 #undef TARGET_FUNCTION_VALUE_REGNO_P
17785 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
17786
17787 #undef TARGET_GIMPLE_FOLD_BUILTIN
17788 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
17789
17790 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
17791 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
17792
17793 #undef TARGET_INIT_BUILTINS
17794 #define TARGET_INIT_BUILTINS aarch64_init_builtins
17795
17796 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
17797 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
17798 aarch64_ira_change_pseudo_allocno_class
17799
17800 #undef TARGET_LEGITIMATE_ADDRESS_P
17801 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
17802
17803 #undef TARGET_LEGITIMATE_CONSTANT_P
17804 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
17805
17806 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
17807 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
17808 aarch64_legitimize_address_displacement
17809
17810 #undef TARGET_LIBGCC_CMP_RETURN_MODE
17811 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
17812
17813 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
17814 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
17815 aarch64_libgcc_floating_mode_supported_p
17816
17817 #undef TARGET_MANGLE_TYPE
17818 #define TARGET_MANGLE_TYPE aarch64_mangle_type
17819
17820 #undef TARGET_MEMORY_MOVE_COST
17821 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
17822
17823 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
17824 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
17825
17826 #undef TARGET_MUST_PASS_IN_STACK
17827 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
17828
17829 /* This target hook should return true if accesses to volatile bitfields
17830 should use the narrowest mode possible. It should return false if these
17831 accesses should use the bitfield container type. */
17832 #undef TARGET_NARROW_VOLATILE_BITFIELD
17833 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
17834
17835 #undef TARGET_OPTION_OVERRIDE
17836 #define TARGET_OPTION_OVERRIDE aarch64_override_options
17837
17838 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
17839 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
17840 aarch64_override_options_after_change
17841
17842 #undef TARGET_OPTION_SAVE
17843 #define TARGET_OPTION_SAVE aarch64_option_save
17844
17845 #undef TARGET_OPTION_RESTORE
17846 #define TARGET_OPTION_RESTORE aarch64_option_restore
17847
17848 #undef TARGET_OPTION_PRINT
17849 #define TARGET_OPTION_PRINT aarch64_option_print
17850
17851 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
17852 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
17853
17854 #undef TARGET_SET_CURRENT_FUNCTION
17855 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
17856
17857 #undef TARGET_PASS_BY_REFERENCE
17858 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
17859
17860 #undef TARGET_PREFERRED_RELOAD_CLASS
17861 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
17862
17863 #undef TARGET_SCHED_REASSOCIATION_WIDTH
17864 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
17865
17866 #undef TARGET_PROMOTED_TYPE
17867 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
17868
17869 #undef TARGET_SECONDARY_RELOAD
17870 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
17871
17872 #undef TARGET_SHIFT_TRUNCATION_MASK
17873 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
17874
17875 #undef TARGET_SETUP_INCOMING_VARARGS
17876 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
17877
17878 #undef TARGET_STRUCT_VALUE_RTX
17879 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
17880
17881 #undef TARGET_REGISTER_MOVE_COST
17882 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
17883
17884 #undef TARGET_RETURN_IN_MEMORY
17885 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
17886
17887 #undef TARGET_RETURN_IN_MSB
17888 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
17889
17890 #undef TARGET_RTX_COSTS
17891 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
17892
17893 #undef TARGET_SCALAR_MODE_SUPPORTED_P
17894 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
17895
17896 #undef TARGET_SCHED_ISSUE_RATE
17897 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
17898
17899 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
17900 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
17901 aarch64_sched_first_cycle_multipass_dfa_lookahead
17902
17903 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
17904 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
17905 aarch64_first_cycle_multipass_dfa_lookahead_guard
17906
17907 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
17908 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
17909 aarch64_get_separate_components
17910
17911 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
17912 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
17913 aarch64_components_for_bb
17914
17915 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
17916 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
17917 aarch64_disqualify_components
17918
17919 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
17920 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
17921 aarch64_emit_prologue_components
17922
17923 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
17924 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
17925 aarch64_emit_epilogue_components
17926
17927 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
17928 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
17929 aarch64_set_handled_components
17930
17931 #undef TARGET_TRAMPOLINE_INIT
17932 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
17933
17934 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
17935 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
17936
17937 #undef TARGET_VECTOR_MODE_SUPPORTED_P
17938 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
17939
17940 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
17941 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
17942 aarch64_builtin_support_vector_misalignment
17943
17944 #undef TARGET_ARRAY_MODE
17945 #define TARGET_ARRAY_MODE aarch64_array_mode
17946
17947 #undef TARGET_ARRAY_MODE_SUPPORTED_P
17948 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
17949
17950 #undef TARGET_VECTORIZE_ADD_STMT_COST
17951 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
17952
17953 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
17954 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
17955 aarch64_builtin_vectorization_cost
17956
17957 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
17958 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
17959
17960 #undef TARGET_VECTORIZE_BUILTINS
17961 #define TARGET_VECTORIZE_BUILTINS
17962
17963 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
17964 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
17965 aarch64_builtin_vectorized_function
17966
17967 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
17968 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
17969 aarch64_autovectorize_vector_sizes
17970
17971 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
17972 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
17973 aarch64_atomic_assign_expand_fenv
17974
17975 /* Section anchor support. */
17976
#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
  aarch64_vectorize_preferred_vector_alignment
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST \
  aarch64_vectorize_vec_perm_const

#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
  aarch64_empty_mask_is_expensive
#undef TARGET_PREFERRED_ELSE_VALUE
#define TARGET_PREFERRED_ELSE_VALUE \
  aarch64_preferred_else_value

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
  aarch64_dwarf_poly_indeterminate_value

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class

#undef TARGET_SELECT_EARLY_REMAT_MODES
#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes

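/* Target-specific selftests are only available when GCC itself is built
   with internal consistency checking enabled.  */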
#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */

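/* Instantiate the target hook vector.  TARGET_INITIALIZER expands to an
   initializer built from the TARGET_* macros defined above, together with
   the defaults supplied by target-def.h, producing the targetm structure
   used by the rest of the compiler.  */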
struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-aarch64.h"