1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
45 #include "alias.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "output.h"
51 #include "flags.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "reload.h"
55 #include "langhooks.h"
56 #include "opts.h"
57 #include "params.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
74
75 /* This file should be included last. */
76 #include "target-def.h"
77
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
80
81 /* Information about a legitimate vector immediate operand. */
82 struct simd_immediate_info
83 {
84 enum insn_type { MOV, MVN };
85 enum modifier_type { LSL, MSL };
86
87 simd_immediate_info () {}
88 simd_immediate_info (scalar_float_mode, rtx);
89 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
90 insn_type = MOV, modifier_type = LSL,
91 unsigned int = 0);
92 simd_immediate_info (scalar_mode, rtx, rtx);
93
94 /* The mode of the elements. */
95 scalar_mode elt_mode;
96
97 /* The value of each element if all elements are the same, or the
98 first value if the constant is a series. */
99 rtx value;
100
101 /* The value of the step if the constant is a series, null otherwise. */
102 rtx step;
103
104 /* The instruction to use to move the immediate into a vector. */
105 insn_type insn;
106
107 /* The kind of shift modifier to use, and the number of bits to shift.
108 This is (LSL, 0) if no shift is needed. */
109 modifier_type modifier;
110 unsigned int shift;
111 };
112
113 /* Construct a floating-point immediate in which each element has mode
114 ELT_MODE_IN and value VALUE_IN. */
115 inline simd_immediate_info
116 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
117 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
118 modifier (LSL), shift (0)
119 {}
120
121 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
122 and value VALUE_IN. The other parameters are as for the structure
123 fields. */
124 inline simd_immediate_info
125 ::simd_immediate_info (scalar_int_mode elt_mode_in,
126 unsigned HOST_WIDE_INT value_in,
127 insn_type insn_in, modifier_type modifier_in,
128 unsigned int shift_in)
129 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
130 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
131 {}
132
133 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
134 and where element I is equal to VALUE_IN + I * STEP_IN. */
135 inline simd_immediate_info
136 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
137 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
138 modifier (LSL), shift (0)
139 {}
140
141 /* The current code model. */
142 enum aarch64_code_model aarch64_cmodel;
143
144 /* The number of 64-bit elements in an SVE vector. */
145 poly_uint16 aarch64_sve_vg;
146
147 #ifdef HAVE_AS_TLS
148 #undef TARGET_HAVE_TLS
149 #define TARGET_HAVE_TLS 1
150 #endif
151
152 static bool aarch64_composite_type_p (const_tree, machine_mode);
153 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
154 const_tree,
155 machine_mode *, int *,
156 bool *);
157 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
158 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
159 static void aarch64_override_options_after_change (void);
160 static bool aarch64_vector_mode_supported_p (machine_mode);
161 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
162 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
163 const_tree type,
164 int misalignment,
165 bool is_packed);
166 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
167 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
168 aarch64_addr_query_type);
169
170 /* Major revision number of the ARM Architecture implemented by the target. */
171 unsigned aarch64_architecture_version;
172
173 /* The processor for which instructions should be scheduled. */
174 enum aarch64_processor aarch64_tune = cortexa53;
175
176 /* Mask to specify which instruction scheduling options should be used. */
177 unsigned long aarch64_tune_flags = 0;
178
179 /* Global flag for PC relative loads. */
180 bool aarch64_pcrelative_literal_loads;
181
182 /* Global flag for whether frame pointer is enabled. */
183 bool aarch64_use_frame_pointer;
184
185 /* Support for command line parsing of boolean flags in the tuning
186 structures. */
187 struct aarch64_flag_desc
188 {
189 const char* name;
190 unsigned int flag;
191 };
192
193 #define AARCH64_FUSION_PAIR(name, internal_name) \
194 { name, AARCH64_FUSE_##internal_name },
195 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
196 {
197 { "none", AARCH64_FUSE_NOTHING },
198 #include "aarch64-fusion-pairs.def"
199 { "all", AARCH64_FUSE_ALL },
200 { NULL, AARCH64_FUSE_NOTHING }
201 };
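/* The table above is built with the "X macro" pattern: each
   AARCH64_FUSION_PAIR entry in aarch64-fusion-pairs.def expands through
   the #define above into a name/flag pair.  As an illustration (assumed
   entry form), AARCH64_FUSION_PAIR ("mov+movk", MOV_MOVK) would expand to
   { "mov+movk", AARCH64_FUSE_MOV_MOVK }, giving both the string a user can
   pass on the command line and the internal flag it maps to.  */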
202
203 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
204 { name, AARCH64_EXTRA_TUNE_##internal_name },
205 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
206 {
207 { "none", AARCH64_EXTRA_TUNE_NONE },
208 #include "aarch64-tuning-flags.def"
209 { "all", AARCH64_EXTRA_TUNE_ALL },
210 { NULL, AARCH64_EXTRA_TUNE_NONE }
211 };
212
213 /* Tuning parameters. */
214
215 static const struct cpu_addrcost_table generic_addrcost_table =
216 {
217 {
218 1, /* hi */
219 0, /* si */
220 0, /* di */
221 1, /* ti */
222 },
223 0, /* pre_modify */
224 0, /* post_modify */
225 0, /* register_offset */
226 0, /* register_sextend */
227 0, /* register_zextend */
228 0 /* imm_offset */
229 };
230
231 static const struct cpu_addrcost_table exynosm1_addrcost_table =
232 {
233 {
234 0, /* hi */
235 0, /* si */
236 0, /* di */
237 2, /* ti */
238 },
239 0, /* pre_modify */
240 0, /* post_modify */
241 1, /* register_offset */
242 1, /* register_sextend */
243 2, /* register_zextend */
244 0, /* imm_offset */
245 };
246
247 static const struct cpu_addrcost_table xgene1_addrcost_table =
248 {
249 {
250 1, /* hi */
251 0, /* si */
252 0, /* di */
253 1, /* ti */
254 },
255 1, /* pre_modify */
256 0, /* post_modify */
257 0, /* register_offset */
258 1, /* register_sextend */
259 1, /* register_zextend */
260 0, /* imm_offset */
261 };
262
263 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
264 {
265 {
266 1, /* hi */
267 1, /* si */
268 1, /* di */
269 2, /* ti */
270 },
271 0, /* pre_modify */
272 0, /* post_modify */
273 2, /* register_offset */
274 3, /* register_sextend */
275 3, /* register_zextend */
276 0, /* imm_offset */
277 };
278
279 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
280 {
281 {
282 1, /* hi */
283 1, /* si */
284 1, /* di */
285 2, /* ti */
286 },
287 1, /* pre_modify */
288 1, /* post_modify */
289 3, /* register_offset */
290 3, /* register_sextend */
291 3, /* register_zextend */
292 2, /* imm_offset */
293 };
294
295 static const struct cpu_regmove_cost generic_regmove_cost =
296 {
297 1, /* GP2GP */
298 /* Avoid the use of slow int<->fp moves for spilling by setting
299 their cost higher than memmov_cost. */
300 5, /* GP2FP */
301 5, /* FP2GP */
302 2 /* FP2FP */
303 };
304
305 static const struct cpu_regmove_cost cortexa57_regmove_cost =
306 {
307 1, /* GP2GP */
308 /* Avoid the use of slow int<->fp moves for spilling by setting
309 their cost higher than memmov_cost. */
310 5, /* GP2FP */
311 5, /* FP2GP */
312 2 /* FP2FP */
313 };
314
315 static const struct cpu_regmove_cost cortexa53_regmove_cost =
316 {
317 1, /* GP2GP */
318 /* Avoid the use of slow int<->fp moves for spilling by setting
319 their cost higher than memmov_cost. */
320 5, /* GP2FP */
321 5, /* FP2GP */
322 2 /* FP2FP */
323 };
324
325 static const struct cpu_regmove_cost exynosm1_regmove_cost =
326 {
327 1, /* GP2GP */
328 /* Avoid the use of slow int<->fp moves for spilling by setting
329      their cost higher than memmov_cost (actual values: 4 and 9). */
330 9, /* GP2FP */
331 9, /* FP2GP */
332 1 /* FP2FP */
333 };
334
335 static const struct cpu_regmove_cost thunderx_regmove_cost =
336 {
337 2, /* GP2GP */
338 2, /* GP2FP */
339 6, /* FP2GP */
340 4 /* FP2FP */
341 };
342
343 static const struct cpu_regmove_cost xgene1_regmove_cost =
344 {
345 1, /* GP2GP */
346 /* Avoid the use of slow int<->fp moves for spilling by setting
347 their cost higher than memmov_cost. */
348 8, /* GP2FP */
349 8, /* FP2GP */
350 2 /* FP2FP */
351 };
352
353 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
354 {
355 2, /* GP2GP */
356 /* Avoid the use of int<->fp moves for spilling. */
357 6, /* GP2FP */
358 6, /* FP2GP */
359 4 /* FP2FP */
360 };
361
362 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
363 {
364 1, /* GP2GP */
365 /* Avoid the use of int<->fp moves for spilling. */
366 8, /* GP2FP */
367 8, /* FP2GP */
368 4 /* FP2FP */
369 };
370
371 /* Generic costs for vector insn classes. */
372 static const struct cpu_vector_cost generic_vector_cost =
373 {
374 1, /* scalar_int_stmt_cost */
375 1, /* scalar_fp_stmt_cost */
376 1, /* scalar_load_cost */
377 1, /* scalar_store_cost */
378 1, /* vec_int_stmt_cost */
379 1, /* vec_fp_stmt_cost */
380 2, /* vec_permute_cost */
381 1, /* vec_to_scalar_cost */
382 1, /* scalar_to_vec_cost */
383 1, /* vec_align_load_cost */
384 1, /* vec_unalign_load_cost */
385 1, /* vec_unalign_store_cost */
386 1, /* vec_store_cost */
387 3, /* cond_taken_branch_cost */
388 1 /* cond_not_taken_branch_cost */
389 };
390
391 /* QDF24XX costs for vector insn classes. */
392 static const struct cpu_vector_cost qdf24xx_vector_cost =
393 {
394 1, /* scalar_int_stmt_cost */
395 1, /* scalar_fp_stmt_cost */
396 1, /* scalar_load_cost */
397 1, /* scalar_store_cost */
398 1, /* vec_int_stmt_cost */
399 3, /* vec_fp_stmt_cost */
400 2, /* vec_permute_cost */
401 1, /* vec_to_scalar_cost */
402 1, /* scalar_to_vec_cost */
403 1, /* vec_align_load_cost */
404 1, /* vec_unalign_load_cost */
405 1, /* vec_unalign_store_cost */
406 1, /* vec_store_cost */
407 3, /* cond_taken_branch_cost */
408 1 /* cond_not_taken_branch_cost */
409 };
410
411 /* ThunderX costs for vector insn classes. */
412 static const struct cpu_vector_cost thunderx_vector_cost =
413 {
414 1, /* scalar_int_stmt_cost */
415 1, /* scalar_fp_stmt_cost */
416 3, /* scalar_load_cost */
417 1, /* scalar_store_cost */
418 4, /* vec_int_stmt_cost */
419 1, /* vec_fp_stmt_cost */
420 4, /* vec_permute_cost */
421 2, /* vec_to_scalar_cost */
422 2, /* scalar_to_vec_cost */
423 3, /* vec_align_load_cost */
424 5, /* vec_unalign_load_cost */
425 5, /* vec_unalign_store_cost */
426 1, /* vec_store_cost */
427 3, /* cond_taken_branch_cost */
428 3 /* cond_not_taken_branch_cost */
429 };
430
431 /* Cortex-A57 costs for vector insn classes. */
432 static const struct cpu_vector_cost cortexa57_vector_cost =
433 {
434 1, /* scalar_int_stmt_cost */
435 1, /* scalar_fp_stmt_cost */
436 4, /* scalar_load_cost */
437 1, /* scalar_store_cost */
438 2, /* vec_int_stmt_cost */
439 2, /* vec_fp_stmt_cost */
440 3, /* vec_permute_cost */
441 8, /* vec_to_scalar_cost */
442 8, /* scalar_to_vec_cost */
443 4, /* vec_align_load_cost */
444 4, /* vec_unalign_load_cost */
445 1, /* vec_unalign_store_cost */
446 1, /* vec_store_cost */
447 1, /* cond_taken_branch_cost */
448 1 /* cond_not_taken_branch_cost */
449 };
450
451 static const struct cpu_vector_cost exynosm1_vector_cost =
452 {
453 1, /* scalar_int_stmt_cost */
454 1, /* scalar_fp_stmt_cost */
455 5, /* scalar_load_cost */
456 1, /* scalar_store_cost */
457 3, /* vec_int_stmt_cost */
458 3, /* vec_fp_stmt_cost */
459 3, /* vec_permute_cost */
460 3, /* vec_to_scalar_cost */
461 3, /* scalar_to_vec_cost */
462 5, /* vec_align_load_cost */
463 5, /* vec_unalign_load_cost */
464 1, /* vec_unalign_store_cost */
465 1, /* vec_store_cost */
466 1, /* cond_taken_branch_cost */
467 1 /* cond_not_taken_branch_cost */
468 };
469
470 /* X-Gene 1 costs for vector insn classes. */
471 static const struct cpu_vector_cost xgene1_vector_cost =
472 {
473 1, /* scalar_int_stmt_cost */
474 1, /* scalar_fp_stmt_cost */
475 5, /* scalar_load_cost */
476 1, /* scalar_store_cost */
477 2, /* vec_int_stmt_cost */
478 2, /* vec_fp_stmt_cost */
479 2, /* vec_permute_cost */
480 4, /* vec_to_scalar_cost */
481 4, /* scalar_to_vec_cost */
482 10, /* vec_align_load_cost */
483 10, /* vec_unalign_load_cost */
484 2, /* vec_unalign_store_cost */
485 2, /* vec_store_cost */
486 2, /* cond_taken_branch_cost */
487 1 /* cond_not_taken_branch_cost */
488 };
489
490 /* Costs for vector insn classes for Vulcan. */
491 static const struct cpu_vector_cost thunderx2t99_vector_cost =
492 {
493 1, /* scalar_int_stmt_cost */
494 6, /* scalar_fp_stmt_cost */
495 4, /* scalar_load_cost */
496 1, /* scalar_store_cost */
497 5, /* vec_int_stmt_cost */
498 6, /* vec_fp_stmt_cost */
499 3, /* vec_permute_cost */
500 6, /* vec_to_scalar_cost */
501 5, /* scalar_to_vec_cost */
502 8, /* vec_align_load_cost */
503 8, /* vec_unalign_load_cost */
504 4, /* vec_unalign_store_cost */
505 4, /* vec_store_cost */
506 2, /* cond_taken_branch_cost */
507 1 /* cond_not_taken_branch_cost */
508 };
509
510 /* Generic costs for branch instructions. */
511 static const struct cpu_branch_cost generic_branch_cost =
512 {
513 1, /* Predictable. */
514 3 /* Unpredictable. */
515 };
516
517 /* Generic approximation modes. */
518 static const cpu_approx_modes generic_approx_modes =
519 {
520 AARCH64_APPROX_NONE, /* division */
521 AARCH64_APPROX_NONE, /* sqrt */
522 AARCH64_APPROX_NONE /* recip_sqrt */
523 };
524
525 /* Approximation modes for Exynos M1. */
526 static const cpu_approx_modes exynosm1_approx_modes =
527 {
528 AARCH64_APPROX_NONE, /* division */
529 AARCH64_APPROX_ALL, /* sqrt */
530 AARCH64_APPROX_ALL /* recip_sqrt */
531 };
532
533 /* Approximation modes for X-Gene 1. */
534 static const cpu_approx_modes xgene1_approx_modes =
535 {
536 AARCH64_APPROX_NONE, /* division */
537 AARCH64_APPROX_NONE, /* sqrt */
538 AARCH64_APPROX_ALL /* recip_sqrt */
539 };
540
541 /* Generic prefetch settings (which disable prefetch). */
542 static const cpu_prefetch_tune generic_prefetch_tune =
543 {
544 0, /* num_slots */
545 -1, /* l1_cache_size */
546 -1, /* l1_cache_line_size */
547 -1, /* l2_cache_size */
548 true, /* prefetch_dynamic_strides */
549 -1, /* minimum_stride */
550 -1 /* default_opt_level */
551 };
552
553 static const cpu_prefetch_tune exynosm1_prefetch_tune =
554 {
555 0, /* num_slots */
556 -1, /* l1_cache_size */
557 64, /* l1_cache_line_size */
558 -1, /* l2_cache_size */
559 true, /* prefetch_dynamic_strides */
560 -1, /* minimum_stride */
561 -1 /* default_opt_level */
562 };
563
564 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
565 {
566 4, /* num_slots */
567 32, /* l1_cache_size */
568 64, /* l1_cache_line_size */
569 512, /* l2_cache_size */
570 false, /* prefetch_dynamic_strides */
571 2048, /* minimum_stride */
572 3 /* default_opt_level */
573 };
574
575 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
576 {
577 8, /* num_slots */
578 32, /* l1_cache_size */
579 128, /* l1_cache_line_size */
580 16*1024, /* l2_cache_size */
581 true, /* prefetch_dynamic_strides */
582 -1, /* minimum_stride */
583 3 /* default_opt_level */
584 };
585
586 static const cpu_prefetch_tune thunderx_prefetch_tune =
587 {
588 8, /* num_slots */
589 32, /* l1_cache_size */
590 128, /* l1_cache_line_size */
591 -1, /* l2_cache_size */
592 true, /* prefetch_dynamic_strides */
593 -1, /* minimum_stride */
594 -1 /* default_opt_level */
595 };
596
597 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
598 {
599 8, /* num_slots */
600 32, /* l1_cache_size */
601 64, /* l1_cache_line_size */
602 256, /* l2_cache_size */
603 true, /* prefetch_dynamic_strides */
604 -1, /* minimum_stride */
605 -1 /* default_opt_level */
606 };
607
608 static const struct tune_params generic_tunings =
609 {
610 &cortexa57_extra_costs,
611 &generic_addrcost_table,
612 &generic_regmove_cost,
613 &generic_vector_cost,
614 &generic_branch_cost,
615 &generic_approx_modes,
616 4, /* memmov_cost */
617 2, /* issue_rate */
618 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
619 "8", /* function_align. */
620 "4", /* jump_align. */
621 "8", /* loop_align. */
622 2, /* int_reassoc_width. */
623 4, /* fp_reassoc_width. */
624 1, /* vec_reassoc_width. */
625 2, /* min_div_recip_mul_sf. */
626 2, /* min_div_recip_mul_df. */
627 0, /* max_case_values. */
628 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
629 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
630 &generic_prefetch_tune
631 };
632
633 static const struct tune_params cortexa35_tunings =
634 {
635 &cortexa53_extra_costs,
636 &generic_addrcost_table,
637 &cortexa53_regmove_cost,
638 &generic_vector_cost,
639 &generic_branch_cost,
640 &generic_approx_modes,
641 4, /* memmov_cost */
642 1, /* issue_rate */
643 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
644 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
645 "16", /* function_align. */
646 "4", /* jump_align. */
647 "8", /* loop_align. */
648 2, /* int_reassoc_width. */
649 4, /* fp_reassoc_width. */
650 1, /* vec_reassoc_width. */
651 2, /* min_div_recip_mul_sf. */
652 2, /* min_div_recip_mul_df. */
653 0, /* max_case_values. */
654 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
655 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
656 &generic_prefetch_tune
657 };
658
659 static const struct tune_params cortexa53_tunings =
660 {
661 &cortexa53_extra_costs,
662 &generic_addrcost_table,
663 &cortexa53_regmove_cost,
664 &generic_vector_cost,
665 &generic_branch_cost,
666 &generic_approx_modes,
667 4, /* memmov_cost */
668 2, /* issue_rate */
669 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
670 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
671 "16", /* function_align. */
672 "4", /* jump_align. */
673 "8", /* loop_align. */
674 2, /* int_reassoc_width. */
675 4, /* fp_reassoc_width. */
676 1, /* vec_reassoc_width. */
677 2, /* min_div_recip_mul_sf. */
678 2, /* min_div_recip_mul_df. */
679 0, /* max_case_values. */
680 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
681 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
682 &generic_prefetch_tune
683 };
684
685 static const struct tune_params cortexa57_tunings =
686 {
687 &cortexa57_extra_costs,
688 &generic_addrcost_table,
689 &cortexa57_regmove_cost,
690 &cortexa57_vector_cost,
691 &generic_branch_cost,
692 &generic_approx_modes,
693 4, /* memmov_cost */
694 3, /* issue_rate */
695 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
696 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
697 "16", /* function_align. */
698 "4", /* jump_align. */
699 "8", /* loop_align. */
700 2, /* int_reassoc_width. */
701 4, /* fp_reassoc_width. */
702 1, /* vec_reassoc_width. */
703 2, /* min_div_recip_mul_sf. */
704 2, /* min_div_recip_mul_df. */
705 0, /* max_case_values. */
706 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
707 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
708 &generic_prefetch_tune
709 };
710
711 static const struct tune_params cortexa72_tunings =
712 {
713 &cortexa57_extra_costs,
714 &generic_addrcost_table,
715 &cortexa57_regmove_cost,
716 &cortexa57_vector_cost,
717 &generic_branch_cost,
718 &generic_approx_modes,
719 4, /* memmov_cost */
720 3, /* issue_rate */
721 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
722 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
723 "16", /* function_align. */
724 "4", /* jump_align. */
725 "8", /* loop_align. */
726 2, /* int_reassoc_width. */
727 4, /* fp_reassoc_width. */
728 1, /* vec_reassoc_width. */
729 2, /* min_div_recip_mul_sf. */
730 2, /* min_div_recip_mul_df. */
731 0, /* max_case_values. */
732 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
733 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
734 &generic_prefetch_tune
735 };
736
737 static const struct tune_params cortexa73_tunings =
738 {
739 &cortexa57_extra_costs,
740 &generic_addrcost_table,
741 &cortexa57_regmove_cost,
742 &cortexa57_vector_cost,
743 &generic_branch_cost,
744 &generic_approx_modes,
745 4, /* memmov_cost. */
746 2, /* issue_rate. */
747 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
748 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
749 "16", /* function_align. */
750 "4", /* jump_align. */
751 "8", /* loop_align. */
752 2, /* int_reassoc_width. */
753 4, /* fp_reassoc_width. */
754 1, /* vec_reassoc_width. */
755 2, /* min_div_recip_mul_sf. */
756 2, /* min_div_recip_mul_df. */
757 0, /* max_case_values. */
758 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
759 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
760 &generic_prefetch_tune
761 };
762
763
764
765 static const struct tune_params exynosm1_tunings =
766 {
767 &exynosm1_extra_costs,
768 &exynosm1_addrcost_table,
769 &exynosm1_regmove_cost,
770 &exynosm1_vector_cost,
771 &generic_branch_cost,
772 &exynosm1_approx_modes,
773 4, /* memmov_cost */
774 3, /* issue_rate */
775 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
776 "4", /* function_align. */
777 "4", /* jump_align. */
778 "4", /* loop_align. */
779 2, /* int_reassoc_width. */
780 4, /* fp_reassoc_width. */
781 1, /* vec_reassoc_width. */
782 2, /* min_div_recip_mul_sf. */
783 2, /* min_div_recip_mul_df. */
784 48, /* max_case_values. */
785 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
786 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
787 &exynosm1_prefetch_tune
788 };
789
790 static const struct tune_params thunderxt88_tunings =
791 {
792 &thunderx_extra_costs,
793 &generic_addrcost_table,
794 &thunderx_regmove_cost,
795 &thunderx_vector_cost,
796 &generic_branch_cost,
797 &generic_approx_modes,
798 6, /* memmov_cost */
799 2, /* issue_rate */
800 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
801 "8", /* function_align. */
802 "8", /* jump_align. */
803 "8", /* loop_align. */
804 2, /* int_reassoc_width. */
805 4, /* fp_reassoc_width. */
806 1, /* vec_reassoc_width. */
807 2, /* min_div_recip_mul_sf. */
808 2, /* min_div_recip_mul_df. */
809 0, /* max_case_values. */
810 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
811 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
812 &thunderxt88_prefetch_tune
813 };
814
815 static const struct tune_params thunderx_tunings =
816 {
817 &thunderx_extra_costs,
818 &generic_addrcost_table,
819 &thunderx_regmove_cost,
820 &thunderx_vector_cost,
821 &generic_branch_cost,
822 &generic_approx_modes,
823 6, /* memmov_cost */
824 2, /* issue_rate */
825 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
826 "8", /* function_align. */
827 "8", /* jump_align. */
828 "8", /* loop_align. */
829 2, /* int_reassoc_width. */
830 4, /* fp_reassoc_width. */
831 1, /* vec_reassoc_width. */
832 2, /* min_div_recip_mul_sf. */
833 2, /* min_div_recip_mul_df. */
834 0, /* max_case_values. */
835 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
836 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
837 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
838 &thunderx_prefetch_tune
839 };
840
841 static const struct tune_params xgene1_tunings =
842 {
843 &xgene1_extra_costs,
844 &xgene1_addrcost_table,
845 &xgene1_regmove_cost,
846 &xgene1_vector_cost,
847 &generic_branch_cost,
848 &xgene1_approx_modes,
849 6, /* memmov_cost */
850 4, /* issue_rate */
851 AARCH64_FUSE_NOTHING, /* fusible_ops */
852 "16", /* function_align. */
853 "8", /* jump_align. */
854 "16", /* loop_align. */
855 2, /* int_reassoc_width. */
856 4, /* fp_reassoc_width. */
857 1, /* vec_reassoc_width. */
858 2, /* min_div_recip_mul_sf. */
859 2, /* min_div_recip_mul_df. */
860 0, /* max_case_values. */
861 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
862 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
863 &generic_prefetch_tune
864 };
865
866 static const struct tune_params qdf24xx_tunings =
867 {
868 &qdf24xx_extra_costs,
869 &qdf24xx_addrcost_table,
870 &qdf24xx_regmove_cost,
871 &qdf24xx_vector_cost,
872 &generic_branch_cost,
873 &generic_approx_modes,
874 4, /* memmov_cost */
875 4, /* issue_rate */
876 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
877    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
878 "16", /* function_align. */
879 "8", /* jump_align. */
880 "16", /* loop_align. */
881 2, /* int_reassoc_width. */
882 4, /* fp_reassoc_width. */
883 1, /* vec_reassoc_width. */
884 2, /* min_div_recip_mul_sf. */
885 2, /* min_div_recip_mul_df. */
886 0, /* max_case_values. */
887 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
888 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
889 &qdf24xx_prefetch_tune
890 };
891
892 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
893 for now. */
894 static const struct tune_params saphira_tunings =
895 {
896 &generic_extra_costs,
897 &generic_addrcost_table,
898 &generic_regmove_cost,
899 &generic_vector_cost,
900 &generic_branch_cost,
901 &generic_approx_modes,
902 4, /* memmov_cost */
903 4, /* issue_rate */
904 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
905    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
906 "16", /* function_align. */
907 "8", /* jump_align. */
908 "16", /* loop_align. */
909 2, /* int_reassoc_width. */
910 4, /* fp_reassoc_width. */
911 1, /* vec_reassoc_width. */
912 2, /* min_div_recip_mul_sf. */
913 2, /* min_div_recip_mul_df. */
914 0, /* max_case_values. */
915 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
916 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
917 &generic_prefetch_tune
918 };
919
920 static const struct tune_params thunderx2t99_tunings =
921 {
922 &thunderx2t99_extra_costs,
923 &thunderx2t99_addrcost_table,
924 &thunderx2t99_regmove_cost,
925 &thunderx2t99_vector_cost,
926 &generic_branch_cost,
927 &generic_approx_modes,
928 4, /* memmov_cost. */
929 4, /* issue_rate. */
930 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
931 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
932 "16", /* function_align. */
933 "8", /* jump_align. */
934 "16", /* loop_align. */
935 3, /* int_reassoc_width. */
936 2, /* fp_reassoc_width. */
937 2, /* vec_reassoc_width. */
938 2, /* min_div_recip_mul_sf. */
939 2, /* min_div_recip_mul_df. */
940 0, /* max_case_values. */
941 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
942 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
943 &thunderx2t99_prefetch_tune
944 };
945
946 /* Support for fine-grained override of the tuning structures. */
947 struct aarch64_tuning_override_function
948 {
949 const char* name;
950 void (*parse_override)(const char*, struct tune_params*);
951 };
952
953 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
954 static void aarch64_parse_tune_string (const char*, struct tune_params*);
955
956 static const struct aarch64_tuning_override_function
957 aarch64_tuning_override_functions[] =
958 {
959 { "fuse", aarch64_parse_fuse_string },
960 { "tune", aarch64_parse_tune_string },
961 { NULL, NULL }
962 };
963
964 /* A processor implementing AArch64. */
965 struct processor
966 {
967 const char *const name;
968 enum aarch64_processor ident;
969 enum aarch64_processor sched_core;
970 enum aarch64_arch arch;
971 unsigned architecture_version;
972 const unsigned long flags;
973 const struct tune_params *const tune;
974 };
975
976 /* Architectures implementing AArch64. */
977 static const struct processor all_architectures[] =
978 {
979 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
980 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
981 #include "aarch64-arches.def"
982 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
983 };
984
985 /* Processor cores implementing AArch64. */
986 static const struct processor all_cores[] =
987 {
988 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
989 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
990 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
991 FLAGS, &COSTS##_tunings},
992 #include "aarch64-cores.def"
993 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
994 AARCH64_FL_FOR_ARCH8, &generic_tunings},
995 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
996 };
997
998
999 /* Target specification. These are populated by the -march, -mtune, -mcpu
1000 handling code or by target attributes. */
1001 static const struct processor *selected_arch;
1002 static const struct processor *selected_cpu;
1003 static const struct processor *selected_tune;
1004
1005 /* The current tuning set. */
1006 struct tune_params aarch64_tune_params = generic_tunings;
1007
1008 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1009
1010 /* An ISA extension in the co-processor and main instruction set space. */
1011 struct aarch64_option_extension
1012 {
1013 const char *const name;
1014 const unsigned long flags_on;
1015 const unsigned long flags_off;
1016 };
1017
1018 typedef enum aarch64_cond_code
1019 {
1020 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1021 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1022 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1023 }
1024 aarch64_cc;
1025
1026 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1027
1028 /* The condition codes of the processor, and the inverse function. */
1029 static const char * const aarch64_condition_codes[] =
1030 {
1031 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1032 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1033 };
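/* Example of the inverse-condition trick used above: the codes are laid
   out in complementary pairs that differ only in bit 0, so flipping the
   low bit with AARCH64_INVERSE_CONDITION_CODE maps AARCH64_EQ (0) to
   AARCH64_NE (1), AARCH64_GE (10) to AARCH64_LT (11), and so on.  */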
1034
1035 /* Generate code to enable conditional branches in functions over 1 MiB. */
1036 const char *
1037 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1038 const char * branch_format)
1039 {
1040 rtx_code_label * tmp_label = gen_label_rtx ();
1041 char label_buf[256];
1042 char buffer[128];
1043 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1044 CODE_LABEL_NUMBER (tmp_label));
1045 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1046 rtx dest_label = operands[pos_label];
1047 operands[pos_label] = tmp_label;
1048
1049 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1050 output_asm_insn (buffer, operands);
1051
1052 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1053 operands[pos_label] = dest_label;
1054 output_asm_insn (buffer, operands);
1055 return "";
1056 }
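/* A rough sketch of the output produced above, assuming BRANCH_FORMAT is
   the already-inverted short-range conditional branch supplied by the
   caller (the label name below is illustrative):

       <inverted cond branch>  .Lfar_N      // skip the hop if the branch
                                            // should not be taken
       b       <original destination>       // unconditional B, +/-128 MiB
   .Lfar_N:

   so only the unconditional B needs to reach the far target.  */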
1057
1058 void
1059 aarch64_err_no_fpadvsimd (machine_mode mode)
1060 {
1061 if (TARGET_GENERAL_REGS_ONLY)
1062 if (FLOAT_MODE_P (mode))
1063 error ("%qs is incompatible with the use of floating-point types",
1064 "-mgeneral-regs-only");
1065 else
1066 error ("%qs is incompatible with the use of vector types",
1067 "-mgeneral-regs-only");
1068 else
1069 if (FLOAT_MODE_P (mode))
1070 error ("%qs feature modifier is incompatible with the use of"
1071 " floating-point types", "+nofp");
1072 else
1073 error ("%qs feature modifier is incompatible with the use of"
1074 " vector types", "+nofp");
1075 }
1076
1077 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1078 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1079 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1080 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1081 and GENERAL_REGS is lower than the memory cost (in this case the best class
1082    is the lowest cost one).  Using POINTER_AND_FP_REGS irrespective of its
1083 cost results in bad allocations with many redundant int<->FP moves which
1084 are expensive on various cores.
1085 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1086 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1087 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1088 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1089 The result of this is that it is no longer inefficient to have a higher
1090 memory move cost than the register move cost.
1091 */
1092
1093 static reg_class_t
1094 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1095 reg_class_t best_class)
1096 {
1097 machine_mode mode;
1098
1099 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1100 || !reg_class_subset_p (FP_REGS, allocno_class))
1101 return allocno_class;
1102
1103 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1104 || !reg_class_subset_p (FP_REGS, best_class))
1105 return best_class;
1106
1107 mode = PSEUDO_REGNO_MODE (regno);
1108 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1109 }
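/* For example (illustrative): a DFmode pseudo whose allocno class and best
   class are both POINTER_AND_FP_REGS is forced into FP_REGS by the mode
   check above, while a DImode pseudo in the same situation is forced into
   GENERAL_REGS.  */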
1110
1111 static unsigned int
1112 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1113 {
1114 if (GET_MODE_UNIT_SIZE (mode) == 4)
1115 return aarch64_tune_params.min_div_recip_mul_sf;
1116 return aarch64_tune_params.min_div_recip_mul_df;
1117 }
1118
1119 /* Return the reassociation width of treeop OPC with mode MODE. */
1120 static int
1121 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1122 {
1123 if (VECTOR_MODE_P (mode))
1124 return aarch64_tune_params.vec_reassoc_width;
1125 if (INTEGRAL_MODE_P (mode))
1126 return aarch64_tune_params.int_reassoc_width;
1127 /* Avoid reassociating floating point addition so we emit more FMAs. */
1128 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1129 return aarch64_tune_params.fp_reassoc_width;
1130 return 1;
1131 }
1132
1133 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1134 unsigned
1135 aarch64_dbx_register_number (unsigned regno)
1136 {
1137 if (GP_REGNUM_P (regno))
1138 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1139 else if (regno == SP_REGNUM)
1140 return AARCH64_DWARF_SP;
1141 else if (FP_REGNUM_P (regno))
1142 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1143 else if (PR_REGNUM_P (regno))
1144 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1145 else if (regno == VG_REGNUM)
1146 return AARCH64_DWARF_VG;
1147
1148 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1149 equivalent DWARF register. */
1150 return DWARF_FRAME_REGISTERS;
1151 }
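/* For example, under the numbering used here x0...x30 map to DWARF
   registers AARCH64_DWARF_R0 + 0...30, the stack pointer maps to
   AARCH64_DWARF_SP, and v0...v31 map to AARCH64_DWARF_V0 + 0...31;
   anything else (e.g. the condition flags) gets a value >=
   DWARF_FRAME_REGISTERS, meaning "no DWARF equivalent".  */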
1152
1153 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1154 static bool
1155 aarch64_advsimd_struct_mode_p (machine_mode mode)
1156 {
1157 return (TARGET_SIMD
1158 && (mode == OImode || mode == CImode || mode == XImode));
1159 }
1160
1161 /* Return true if MODE is an SVE predicate mode. */
1162 static bool
1163 aarch64_sve_pred_mode_p (machine_mode mode)
1164 {
1165 return (TARGET_SVE
1166 && (mode == VNx16BImode
1167 || mode == VNx8BImode
1168 || mode == VNx4BImode
1169 || mode == VNx2BImode));
1170 }
1171
1172 /* Three mutually-exclusive flags describing a vector or predicate type. */
1173 const unsigned int VEC_ADVSIMD = 1;
1174 const unsigned int VEC_SVE_DATA = 2;
1175 const unsigned int VEC_SVE_PRED = 4;
1176 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1177 a structure of 2, 3 or 4 vectors. */
1178 const unsigned int VEC_STRUCT = 8;
1179 /* Useful combinations of the above. */
1180 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1181 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1182
1183 /* Return a set of flags describing the vector properties of mode MODE.
1184 Ignore modes that are not supported by the current target. */
1185 static unsigned int
1186 aarch64_classify_vector_mode (machine_mode mode)
1187 {
1188 if (aarch64_advsimd_struct_mode_p (mode))
1189 return VEC_ADVSIMD | VEC_STRUCT;
1190
1191 if (aarch64_sve_pred_mode_p (mode))
1192 return VEC_SVE_PRED;
1193
1194 scalar_mode inner = GET_MODE_INNER (mode);
1195 if (VECTOR_MODE_P (mode)
1196 && (inner == QImode
1197 || inner == HImode
1198 || inner == HFmode
1199 || inner == SImode
1200 || inner == SFmode
1201 || inner == DImode
1202 || inner == DFmode))
1203 {
1204 if (TARGET_SVE)
1205 {
1206 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1207 return VEC_SVE_DATA;
1208 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1209 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1210 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1211 return VEC_SVE_DATA | VEC_STRUCT;
1212 }
1213
1214 /* This includes V1DF but not V1DI (which doesn't exist). */
1215 if (TARGET_SIMD
1216 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1217 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1218 return VEC_ADVSIMD;
1219 }
1220
1221 return 0;
1222 }
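/* Some illustrative classifications (assuming the relevant target features
   are enabled): V4SImode is a 128-bit Advanced SIMD vector and yields
   VEC_ADVSIMD; OImode (a pair of Q registers) yields
   VEC_ADVSIMD | VEC_STRUCT; VNx4SImode yields VEC_SVE_DATA; and
   VNx4BImode yields VEC_SVE_PRED.  Unsupported or scalar modes return 0.  */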
1223
1224 /* Return true if MODE is any of the data vector modes, including
1225 structure modes. */
1226 static bool
1227 aarch64_vector_data_mode_p (machine_mode mode)
1228 {
1229 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1230 }
1231
1232 /* Return true if MODE is an SVE data vector mode; either a single vector
1233 or a structure of vectors. */
1234 static bool
1235 aarch64_sve_data_mode_p (machine_mode mode)
1236 {
1237 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1238 }
1239
1240 /* Implement target hook TARGET_ARRAY_MODE. */
1241 static opt_machine_mode
1242 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1243 {
1244 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1245 && IN_RANGE (nelems, 2, 4))
1246 return mode_for_vector (GET_MODE_INNER (mode),
1247 GET_MODE_NUNITS (mode) * nelems);
1248
1249 return opt_machine_mode ();
1250 }
1251
1252 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1253 static bool
1254 aarch64_array_mode_supported_p (machine_mode mode,
1255 unsigned HOST_WIDE_INT nelems)
1256 {
1257 if (TARGET_SIMD
1258 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1259 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1260 && (nelems >= 2 && nelems <= 4))
1261 return true;
1262
1263 return false;
1264 }
1265
1266 /* Return the SVE predicate mode to use for elements that have
1267 ELEM_NBYTES bytes, if such a mode exists. */
1268
1269 opt_machine_mode
1270 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1271 {
1272 if (TARGET_SVE)
1273 {
1274 if (elem_nbytes == 1)
1275 return VNx16BImode;
1276 if (elem_nbytes == 2)
1277 return VNx8BImode;
1278 if (elem_nbytes == 4)
1279 return VNx4BImode;
1280 if (elem_nbytes == 8)
1281 return VNx2BImode;
1282 }
1283 return opt_machine_mode ();
1284 }
1285
1286 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1287
1288 static opt_machine_mode
1289 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1290 {
1291 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1292 {
1293 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1294 machine_mode pred_mode;
1295 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1296 return pred_mode;
1297 }
1298
1299 return default_get_mask_mode (nunits, nbytes);
1300 }
1301
1302 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1303 prefer to use the first arithmetic operand as the else value if
1304 the else value doesn't matter, since that exactly matches the SVE
1305 destructive merging form. For ternary operations we could either
1306 pick the first operand and use FMAD-like instructions or the last
1307 operand and use FMLA-like instructions; the latter seems more
1308 natural. */
1309
1310 static tree
1311 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1312 {
1313 return nops == 3 ? ops[2] : ops[0];
1314 }
1315
1316 /* Implement TARGET_HARD_REGNO_NREGS. */
1317
1318 static unsigned int
1319 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1320 {
1321 /* ??? Logically we should only need to provide a value when
1322 HARD_REGNO_MODE_OK says that the combination is valid,
1323 but at the moment we need to handle all modes. Just ignore
1324 any runtime parts for registers that can't store them. */
1325 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1326 switch (aarch64_regno_regclass (regno))
1327 {
1328 case FP_REGS:
1329 case FP_LO_REGS:
1330 if (aarch64_sve_data_mode_p (mode))
1331 return exact_div (GET_MODE_SIZE (mode),
1332 BYTES_PER_SVE_VECTOR).to_constant ();
1333 return CEIL (lowest_size, UNITS_PER_VREG);
1334 case PR_REGS:
1335 case PR_LO_REGS:
1336 case PR_HI_REGS:
1337 return 1;
1338 default:
1339 return CEIL (lowest_size, UNITS_PER_WORD);
1340 }
1341 gcc_unreachable ();
1342 }
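/* For example, a TImode value (16 bytes) occupies CEIL (16, UNITS_PER_WORD)
   = 2 general registers but only CEIL (16, UNITS_PER_VREG) = 1 FP/vector
   register, while any predicate mode always occupies a single P register.  */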
1343
1344 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1345
1346 static bool
1347 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1348 {
1349 if (GET_MODE_CLASS (mode) == MODE_CC)
1350 return regno == CC_REGNUM;
1351
1352 if (regno == VG_REGNUM)
1353 /* This must have the same size as _Unwind_Word. */
1354 return mode == DImode;
1355
1356 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1357 if (vec_flags & VEC_SVE_PRED)
1358 return PR_REGNUM_P (regno);
1359
1360 if (PR_REGNUM_P (regno))
1361 return 0;
1362
1363 if (regno == SP_REGNUM)
1364 /* The purpose of comparing with ptr_mode is to support the
1365 global register variable associated with the stack pointer
1366 register via the syntax of asm ("wsp") in ILP32. */
1367 return mode == Pmode || mode == ptr_mode;
1368
1369 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1370 return mode == Pmode;
1371
1372 if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
1373 return true;
1374
1375 if (FP_REGNUM_P (regno))
1376 {
1377 if (vec_flags & VEC_STRUCT)
1378 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1379 else
1380 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1381 }
1382
1383 return false;
1384 }
1385
1386 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1387 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1388 clobbers the top 64 bits when restoring the bottom 64 bits. */
1389
1390 static bool
1391 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1392 {
1393 return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
1394 }
1395
1396 /* Implement REGMODE_NATURAL_SIZE. */
1397 poly_uint64
1398 aarch64_regmode_natural_size (machine_mode mode)
1399 {
1400 /* The natural size for SVE data modes is one SVE data vector,
1401 and similarly for predicates. We can't independently modify
1402 anything smaller than that. */
1403 /* ??? For now, only do this for variable-width SVE registers.
1404 Doing it for constant-sized registers breaks lower-subreg.c. */
1405 /* ??? And once that's fixed, we should probably have similar
1406 code for Advanced SIMD. */
1407 if (!aarch64_sve_vg.is_constant ())
1408 {
1409 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1410 if (vec_flags & VEC_SVE_PRED)
1411 return BYTES_PER_SVE_PRED;
1412 if (vec_flags & VEC_SVE_DATA)
1413 return BYTES_PER_SVE_VECTOR;
1414 }
1415 return UNITS_PER_WORD;
1416 }
1417
1418 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1419 machine_mode
1420 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1421 machine_mode mode)
1422 {
1423 /* The predicate mode determines which bits are significant and
1424 which are "don't care". Decreasing the number of lanes would
1425 lose data while increasing the number of lanes would make bits
1426 unnecessarily significant. */
1427 if (PR_REGNUM_P (regno))
1428 return mode;
1429 if (known_ge (GET_MODE_SIZE (mode), 4))
1430 return mode;
1431 else
1432 return SImode;
1433 }
1434
1435 /* Return true if I's bits are consecutive ones from the MSB. */
1436 bool
1437 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1438 {
1439 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1440 }
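/* Worked example: if I is 0xffffffffffff0000 then -I is 0x10000, whose
   exact_log2 is 16, so the function returns true.  For I == 0, or for a
   value with a hole in its high bits, -I is not a power of two, exact_log2
   returns -1 and the function returns false.  */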
1441
1442 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1443 that strcpy from constants will be faster. */
1444
1445 static HOST_WIDE_INT
1446 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1447 {
1448 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1449 return MAX (align, BITS_PER_WORD);
1450 return align;
1451 }
1452
1453 /* Return true if calls to DECL should be treated as
1454    long-calls (i.e. called via a register). */
1455 static bool
1456 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1457 {
1458 return false;
1459 }
1460
1461 /* Return true if calls to symbol-ref SYM should be treated as
1462    long-calls (i.e. called via a register). */
1463 bool
1464 aarch64_is_long_call_p (rtx sym)
1465 {
1466 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1467 }
1468
1469 /* Return true if calls to symbol-ref SYM should not go through
1470 plt stubs. */
1471
1472 bool
1473 aarch64_is_noplt_call_p (rtx sym)
1474 {
1475 const_tree decl = SYMBOL_REF_DECL (sym);
1476
1477 if (flag_pic
1478 && decl
1479 && (!flag_plt
1480 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1481 && !targetm.binds_local_p (decl))
1482 return true;
1483
1484 return false;
1485 }
1486
1487 /* Return true if the offsets to a zero/sign-extract operation
1488 represent an expression that matches an extend operation. The
1489    operands represent the parameters from
1490
1491 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1492 bool
1493 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1494 rtx extract_imm)
1495 {
1496 HOST_WIDE_INT mult_val, extract_val;
1497
1498 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1499 return false;
1500
1501 mult_val = INTVAL (mult_imm);
1502 extract_val = INTVAL (extract_imm);
1503
1504 if (extract_val > 8
1505 && extract_val < GET_MODE_BITSIZE (mode)
1506 && exact_log2 (extract_val & ~7) > 0
1507 && (extract_val & 7) <= 4
1508 && mult_val == (1 << (extract_val & 7)))
1509 return true;
1510
1511 return false;
1512 }
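/* Worked example (illustrative values): with MODE == DImode,
   EXTRACT_IMM == 36 and MULT_IMM == 16 we have extract_val & ~7 == 32
   (a power-of-two number of bits), a shift of extract_val & 7 == 4, and
   mult_val == 1 << 4, so the function returns true: for a zero_extract,
   taking the low 36 bits of (reg * 16) is equivalent to
   (zero_extend:DI (reg:SI)) << 4.  */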
1513
1514 /* Emit an insn that's a simple single-set. Both the operands must be
1515 known to be valid. */
1516 inline static rtx_insn *
1517 emit_set_insn (rtx x, rtx y)
1518 {
1519 return emit_insn (gen_rtx_SET (x, y));
1520 }
1521
1522 /* X and Y are two things to compare using CODE. Emit the compare insn and
1523 return the rtx for register 0 in the proper mode. */
1524 rtx
1525 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1526 {
1527 machine_mode mode = SELECT_CC_MODE (code, x, y);
1528 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1529
1530 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1531 return cc_reg;
1532 }
1533
1534 /* Build the SYMBOL_REF for __tls_get_addr. */
1535
1536 static GTY(()) rtx tls_get_addr_libfunc;
1537
1538 rtx
1539 aarch64_tls_get_addr (void)
1540 {
1541 if (!tls_get_addr_libfunc)
1542 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1543 return tls_get_addr_libfunc;
1544 }
1545
1546 /* Return the TLS model to use for ADDR. */
1547
1548 static enum tls_model
1549 tls_symbolic_operand_type (rtx addr)
1550 {
1551 enum tls_model tls_kind = TLS_MODEL_NONE;
1552 if (GET_CODE (addr) == CONST)
1553 {
1554 poly_int64 addend;
1555 rtx sym = strip_offset (addr, &addend);
1556 if (GET_CODE (sym) == SYMBOL_REF)
1557 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1558 }
1559 else if (GET_CODE (addr) == SYMBOL_REF)
1560 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1561
1562 return tls_kind;
1563 }
1564
1565 /* We'll allow lo_sum's in our legitimate addresses so that combine
1566    can take care of combining addresses where necessary, but for
1567    generation purposes, we'll generate the address
1568    as:
1569 RTL Absolute
1570 tmp = hi (symbol_ref); adrp x1, foo
1571 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1572 nop
1573
1574 PIC TLS
1575 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1576 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1577 bl __tls_get_addr
1578 nop
1579
1580 Load TLS symbol, depending on TLS mechanism and TLS access model.
1581
1582 Global Dynamic - Traditional TLS:
1583 adrp tmp, :tlsgd:imm
1584 add dest, tmp, #:tlsgd_lo12:imm
1585 bl __tls_get_addr
1586
1587 Global Dynamic - TLS Descriptors:
1588 adrp dest, :tlsdesc:imm
1589 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1590 add dest, dest, #:tlsdesc_lo12:imm
1591 blr tmp
1592 mrs tp, tpidr_el0
1593 add dest, dest, tp
1594
1595 Initial Exec:
1596 mrs tp, tpidr_el0
1597 adrp tmp, :gottprel:imm
1598 ldr dest, [tmp, #:gottprel_lo12:imm]
1599 add dest, dest, tp
1600
1601 Local Exec:
1602 mrs tp, tpidr_el0
1603 add t0, tp, #:tprel_hi12:imm, lsl #12
1604 add t0, t0, #:tprel_lo12_nc:imm
1605 */
1606
1607 static void
1608 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1609 enum aarch64_symbol_type type)
1610 {
1611 switch (type)
1612 {
1613 case SYMBOL_SMALL_ABSOLUTE:
1614 {
1615 /* In ILP32, the mode of dest can be either SImode or DImode. */
1616 rtx tmp_reg = dest;
1617 machine_mode mode = GET_MODE (dest);
1618
1619 gcc_assert (mode == Pmode || mode == ptr_mode);
1620
1621 if (can_create_pseudo_p ())
1622 tmp_reg = gen_reg_rtx (mode);
1623
1624 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1625 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1626 return;
1627 }
1628
1629 case SYMBOL_TINY_ABSOLUTE:
1630 emit_insn (gen_rtx_SET (dest, imm));
1631 return;
1632
1633 case SYMBOL_SMALL_GOT_28K:
1634 {
1635 machine_mode mode = GET_MODE (dest);
1636 rtx gp_rtx = pic_offset_table_rtx;
1637 rtx insn;
1638 rtx mem;
1639
1640 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1641    here before RTL expansion.  Tree IVOPTs will generate RTL patterns to
1642    decide rtx costs, in which case pic_offset_table_rtx is not
1643    initialized.  In that case there is no need to generate the first adrp
1644    instruction, as the final cost of a global variable access is
1645    one instruction. */
1646 if (gp_rtx != NULL)
1647 {
1648 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1649    use the page base as the GOT base, the first page may be wasted;
1650    in the worst case only 28K of GOT space remains).
1651
1652    The generated instruction sequence for accessing a global variable
1653    is:
1654
1655 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1656
1657    Only one instruction is needed.  But we must initialize
1658    pic_offset_table_rtx properly.  We generate an initialization insn for
1659    every global access, and let CSE remove the redundant ones.
1660
1661    The final instruction sequence for multiple global variable
1662    accesses will look like the following.
1663
1664 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1665
1666 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1667 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1668 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1669 ... */
1670
1671 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1672 crtl->uses_pic_offset_table = 1;
1673 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1674
1675 if (mode != GET_MODE (gp_rtx))
1676 gp_rtx = gen_lowpart (mode, gp_rtx);
1677
1678 }
1679
1680 if (mode == ptr_mode)
1681 {
1682 if (mode == DImode)
1683 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1684 else
1685 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1686
1687 mem = XVECEXP (SET_SRC (insn), 0, 0);
1688 }
1689 else
1690 {
1691 gcc_assert (mode == Pmode);
1692
1693 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1694 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1695 }
1696
1697 /* The operand is expected to be a MEM.  Whenever the related insn
1698    pattern changes, the code above that computes MEM should be
1699    updated. */
1700 gcc_assert (GET_CODE (mem) == MEM);
1701 MEM_READONLY_P (mem) = 1;
1702 MEM_NOTRAP_P (mem) = 1;
1703 emit_insn (insn);
1704 return;
1705 }
1706
1707 case SYMBOL_SMALL_GOT_4G:
1708 {
1709 /* In ILP32, the mode of dest can be either SImode or DImode,
1710 while the got entry is always of SImode size. The mode of
1711 dest depends on how dest is used: if dest is assigned to a
1712 pointer (e.g. in the memory), it has SImode; it may have
1713    DImode if dest is dereferenced to access the memory.
1714 This is why we have to handle three different ldr_got_small
1715 patterns here (two patterns for ILP32). */
1716
1717 rtx insn;
1718 rtx mem;
1719 rtx tmp_reg = dest;
1720 machine_mode mode = GET_MODE (dest);
1721
1722 if (can_create_pseudo_p ())
1723 tmp_reg = gen_reg_rtx (mode);
1724
1725 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1726 if (mode == ptr_mode)
1727 {
1728 if (mode == DImode)
1729 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1730 else
1731 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1732
1733 mem = XVECEXP (SET_SRC (insn), 0, 0);
1734 }
1735 else
1736 {
1737 gcc_assert (mode == Pmode);
1738
1739 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1740 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1741 }
1742
1743 gcc_assert (GET_CODE (mem) == MEM);
1744 MEM_READONLY_P (mem) = 1;
1745 MEM_NOTRAP_P (mem) = 1;
1746 emit_insn (insn);
1747 return;
1748 }
1749
1750 case SYMBOL_SMALL_TLSGD:
1751 {
1752 rtx_insn *insns;
1753 machine_mode mode = GET_MODE (dest);
1754 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1755
1756 start_sequence ();
1757 if (TARGET_ILP32)
1758 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1759 else
1760 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1761 insns = get_insns ();
1762 end_sequence ();
1763
1764 RTL_CONST_CALL_P (insns) = 1;
1765 emit_libcall_block (insns, dest, result, imm);
1766 return;
1767 }
1768
1769 case SYMBOL_SMALL_TLSDESC:
1770 {
1771 machine_mode mode = GET_MODE (dest);
1772 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1773 rtx tp;
1774
1775 gcc_assert (mode == Pmode || mode == ptr_mode);
1776
1777 /* In ILP32, the got entry is always of SImode size. Unlike
1778 small GOT, the dest is fixed at reg 0. */
1779 if (TARGET_ILP32)
1780 emit_insn (gen_tlsdesc_small_si (imm));
1781 else
1782 emit_insn (gen_tlsdesc_small_di (imm));
1783 tp = aarch64_load_tp (NULL);
1784
1785 if (mode != Pmode)
1786 tp = gen_lowpart (mode, tp);
1787
1788 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1789 if (REG_P (dest))
1790 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1791 return;
1792 }
1793
1794 case SYMBOL_SMALL_TLSIE:
1795 {
1796 /* In ILP32, the mode of dest can be either SImode or DImode,
1797 while the got entry is always of SImode size. The mode of
1798 dest depends on how dest is used: if dest is assigned to a
1799 pointer (e.g. in the memory), it has SImode; it may have
1800    DImode if dest is dereferenced to access the memory.
1801 This is why we have to handle three different tlsie_small
1802 patterns here (two patterns for ILP32). */
1803 machine_mode mode = GET_MODE (dest);
1804 rtx tmp_reg = gen_reg_rtx (mode);
1805 rtx tp = aarch64_load_tp (NULL);
1806
1807 if (mode == ptr_mode)
1808 {
1809 if (mode == DImode)
1810 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1811 else
1812 {
1813 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1814 tp = gen_lowpart (mode, tp);
1815 }
1816 }
1817 else
1818 {
1819 gcc_assert (mode == Pmode);
1820 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1821 }
1822
1823 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1824 if (REG_P (dest))
1825 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1826 return;
1827 }
1828
1829 case SYMBOL_TLSLE12:
1830 case SYMBOL_TLSLE24:
1831 case SYMBOL_TLSLE32:
1832 case SYMBOL_TLSLE48:
1833 {
1834 machine_mode mode = GET_MODE (dest);
1835 rtx tp = aarch64_load_tp (NULL);
1836
1837 if (mode != Pmode)
1838 tp = gen_lowpart (mode, tp);
1839
1840 switch (type)
1841 {
1842 case SYMBOL_TLSLE12:
1843 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1844 (dest, tp, imm));
1845 break;
1846 case SYMBOL_TLSLE24:
1847 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1848 (dest, tp, imm));
1849 break;
1850 case SYMBOL_TLSLE32:
1851 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1852 (dest, imm));
1853 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1854 (dest, dest, tp));
1855 break;
1856 case SYMBOL_TLSLE48:
1857 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1858 (dest, imm));
1859 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1860 (dest, dest, tp));
1861 break;
1862 default:
1863 gcc_unreachable ();
1864 }
1865
1866 if (REG_P (dest))
1867 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1868 return;
1869 }
1870
1871 case SYMBOL_TINY_GOT:
1872 emit_insn (gen_ldr_got_tiny (dest, imm));
1873 return;
1874
1875 case SYMBOL_TINY_TLSIE:
1876 {
1877 machine_mode mode = GET_MODE (dest);
1878 rtx tp = aarch64_load_tp (NULL);
1879
1880 if (mode == ptr_mode)
1881 {
1882 if (mode == DImode)
1883 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1884 else
1885 {
1886 tp = gen_lowpart (mode, tp);
1887 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1888 }
1889 }
1890 else
1891 {
1892 gcc_assert (mode == Pmode);
1893 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1894 }
1895
1896 if (REG_P (dest))
1897 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1898 return;
1899 }
1900
1901 default:
1902 gcc_unreachable ();
1903 }
1904 }
1905
1906 /* Emit a move from SRC to DEST. Assume that the move expanders can
1907 handle all moves if !can_create_pseudo_p (). The distinction is
1908 important because, unlike emit_move_insn, the move expanders know
1909 how to force Pmode objects into the constant pool even when the
1910 constant pool address is not itself legitimate. */
1911 static rtx
1912 aarch64_emit_move (rtx dest, rtx src)
1913 {
1914 return (can_create_pseudo_p ()
1915 ? emit_move_insn (dest, src)
1916 : emit_move_insn_1 (dest, src));
1917 }
1918
1919 /* Apply UNOPTAB to OP and store the result in DEST. */
1920
1921 static void
1922 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
1923 {
1924 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
1925 if (dest != tmp)
1926 emit_move_insn (dest, tmp);
1927 }
1928
1929 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
1930
1931 static void
1932 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
1933 {
1934 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
1935 OPTAB_DIRECT);
1936 if (dest != tmp)
1937 emit_move_insn (dest, tmp);
1938 }
1939
1940 /* Split a 128-bit move operation into two 64-bit move operations,
1941 taking care to handle partial overlap of register to register
1942 copies. Special cases are needed when moving between GP regs and
1943 FP regs. SRC can be a register, constant or memory; DST a register
1944 or memory. If either operand is memory it must not have any side
1945 effects. */
1946 void
1947 aarch64_split_128bit_move (rtx dst, rtx src)
1948 {
1949 rtx dst_lo, dst_hi;
1950 rtx src_lo, src_hi;
1951
1952 machine_mode mode = GET_MODE (dst);
1953
1954 gcc_assert (mode == TImode || mode == TFmode);
1955 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1956 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1957
1958 if (REG_P (dst) && REG_P (src))
1959 {
1960 int src_regno = REGNO (src);
1961 int dst_regno = REGNO (dst);
1962
1963 /* Handle FP <-> GP regs. */
1964 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1965 {
1966 src_lo = gen_lowpart (word_mode, src);
1967 src_hi = gen_highpart (word_mode, src);
1968
1969 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
1970 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
1971 return;
1972 }
1973 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1974 {
1975 dst_lo = gen_lowpart (word_mode, dst);
1976 dst_hi = gen_highpart (word_mode, dst);
1977
1978 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
1979 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
1980 return;
1981 }
1982 }
1983
1984 dst_lo = gen_lowpart (word_mode, dst);
1985 dst_hi = gen_highpart (word_mode, dst);
1986 src_lo = gen_lowpart (word_mode, src);
1987 src_hi = gen_highpart_mode (word_mode, mode, src);
1988
1989 /* At most one pairing may overlap. */
1990 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1991 {
1992 aarch64_emit_move (dst_hi, src_hi);
1993 aarch64_emit_move (dst_lo, src_lo);
1994 }
1995 else
1996 {
1997 aarch64_emit_move (dst_lo, src_lo);
1998 aarch64_emit_move (dst_hi, src_hi);
1999 }
2000 }
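
/* Illustrative example (register numbers are arbitrary, not taken from
   real compiler output): for a TImode copy whose destination pair is
   x1:x2 and whose source pair is x0:x1, dst_lo (x1) overlaps src_hi (x1),
   so the high halves are moved first:

       mov  x2, x1    (dst_hi <- src_hi)
       mov  x1, x0    (dst_lo <- src_lo)

   Moving the low halves first would clobber x1 before its old value had
   been copied into x2.  */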
2001
2002 bool
2003 aarch64_split_128bit_move_p (rtx dst, rtx src)
2004 {
2005 return (! REG_P (src)
2006 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2007 }
2008
2009 /* Split a complex SIMD combine. */
2010
2011 void
2012 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2013 {
2014 machine_mode src_mode = GET_MODE (src1);
2015 machine_mode dst_mode = GET_MODE (dst);
2016
2017 gcc_assert (VECTOR_MODE_P (dst_mode));
2018 gcc_assert (register_operand (dst, dst_mode)
2019 && register_operand (src1, src_mode)
2020 && register_operand (src2, src_mode));
2021
2022 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2023 return;
2024 }
2025
2026 /* Split a complex SIMD move. */
2027
2028 void
2029 aarch64_split_simd_move (rtx dst, rtx src)
2030 {
2031 machine_mode src_mode = GET_MODE (src);
2032 machine_mode dst_mode = GET_MODE (dst);
2033
2034 gcc_assert (VECTOR_MODE_P (dst_mode));
2035
2036 if (REG_P (dst) && REG_P (src))
2037 {
2038 gcc_assert (VECTOR_MODE_P (src_mode));
2039 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2040 }
2041 }
2042
2043 bool
2044 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2045 machine_mode ymode, rtx y)
2046 {
2047 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2048 gcc_assert (r != NULL);
2049 return rtx_equal_p (x, r);
2050 }
2051
2052
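/* Return a register that holds VALUE.  Create a new pseudo where that is
   possible; otherwise move VALUE into the caller-supplied register X and
   return X.  */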
2053 static rtx
2054 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2055 {
2056 if (can_create_pseudo_p ())
2057 return force_reg (mode, value);
2058 else
2059 {
2060 gcc_assert (x);
2061 aarch64_emit_move (x, value);
2062 return x;
2063 }
2064 }
2065
2066 /* Return true if we can move VALUE into a register using a single
2067 CNT[BHWD] instruction. */
2068
2069 static bool
2070 aarch64_sve_cnt_immediate_p (poly_int64 value)
2071 {
2072 HOST_WIDE_INT factor = value.coeffs[0];
2073 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2074 return (value.coeffs[1] == factor
2075 && IN_RANGE (factor, 2, 16 * 16)
2076 && (factor & 1) == 0
2077 && factor <= 16 * (factor & -factor));
2078 }
2079
2080 /* Likewise for rtx X. */
2081
2082 bool
2083 aarch64_sve_cnt_immediate_p (rtx x)
2084 {
2085 poly_int64 value;
2086 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2087 }
2088
2089 /* Return the asm string for an instruction with a CNT-like vector size
2090 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2091 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2092 first part of the operands template (the part that comes before the
2093 vector size itself). FACTOR is the count per 128-bit quadword.
2094 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2095 If it is zero, we can use any element size. */
2096
2097 static char *
2098 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2099 unsigned int factor,
2100 unsigned int nelts_per_vq)
2101 {
2102 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2103
2104 if (nelts_per_vq == 0)
2105 /* There is some overlap in the ranges of the four CNT instructions.
2106 Here we always use the smallest possible element size, so that the
2107 multiplier is 1 wherever possible. */
2108 nelts_per_vq = factor & -factor;
2109 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2110 gcc_assert (IN_RANGE (shift, 1, 4));
2111 char suffix = "dwhb"[shift - 1];
2112
2113 factor >>= shift;
2114 unsigned int written;
2115 if (factor == 1)
2116 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2117 prefix, suffix, operands);
2118 else
2119 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2120 prefix, suffix, operands, factor);
2121 gcc_assert (written < sizeof (buffer));
2122 return buffer;
2123 }
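
/* Worked example (illustrative, assuming PREFIX is "cnt" and OPERANDS is
   "%x0"): for FACTOR == 32 and NELTS_PER_VQ == 0, the smallest element
   size is chosen, so NELTS_PER_VQ becomes 32 & -32 == 32, SHIFT is
   min (exact_log2 (32), 4) == 4, the suffix is 'b' and the residual
   multiplier is 32 >> 4 == 2, giving:

       cntb\t%x0, all, mul #2

   i.e. a count of 32 per 128-bit quadword, matching the original FACTOR.  */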
2124
2125 /* Return the asm string for an instruction with a CNT-like vector size
2126 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2127 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2128 first part of the operands template (the part that comes before the
2129 vector size itself). X is the value of the vector size operand,
2130 as a polynomial integer rtx. */
2131
2132 char *
2133 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2134 rtx x)
2135 {
2136 poly_int64 value = rtx_to_poly_int64 (x);
2137 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2138 return aarch64_output_sve_cnt_immediate (prefix, operands,
2139 value.coeffs[1], 0);
2140 }
2141
2142 /* Return true if we can add VALUE to a register using a single ADDVL
2143 or ADDPL instruction. */
2144
2145 static bool
2146 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2147 {
2148 HOST_WIDE_INT factor = value.coeffs[0];
2149 if (factor == 0 || value.coeffs[1] != factor)
2150 return false;
2151 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2152 and a value of 16 is one vector width. */
2153 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2154 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2155 }
2156
2157 /* Likewise for rtx X. */
2158
2159 bool
2160 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2161 {
2162 poly_int64 value;
2163 return (poly_int_rtx_p (x, &value)
2164 && aarch64_sve_addvl_addpl_immediate_p (value));
2165 }
2166
2167 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2168 and storing the result in operand 0. */
2169
2170 char *
2171 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2172 {
2173 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2174 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2175 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2176
2177 /* Use INC or DEC if possible. */
2178 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2179 {
2180 if (aarch64_sve_cnt_immediate_p (offset_value))
2181 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2182 offset_value.coeffs[1], 0);
2183 if (aarch64_sve_cnt_immediate_p (-offset_value))
2184 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2185 -offset_value.coeffs[1], 0);
2186 }
2187
2188 int factor = offset_value.coeffs[1];
2189 if ((factor & 15) == 0)
2190 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2191 else
2192 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2193 return buffer;
2194 }
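
/* Worked example (illustrative, assuming DEST and BASE are different
   registers, so that the INC/DEC shortcut above does not apply): an
   OFFSET of poly_int64 (48, 48) has FACTOR == 48, a multiple of 16,
   and so produces:

       addvl\t%x0, %x1, #3

   (three vector lengths), whereas poly_int64 (6, 6) has FACTOR == 6,
   which is even but not a multiple of 16, and produces:

       addpl\t%x0, %x1, #3

   (three predicate lengths).  */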
2195
2196 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2197 instruction. If it is, store the number of elements in each vector
2198 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2199 factor in *FACTOR_OUT (if nonnull). */
2200
2201 bool
2202 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2203 unsigned int *nelts_per_vq_out)
2204 {
2205 rtx elt;
2206 poly_int64 value;
2207
2208 if (!const_vec_duplicate_p (x, &elt)
2209 || !poly_int_rtx_p (elt, &value))
2210 return false;
2211
2212 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2213 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2214 /* There's no vector INCB. */
2215 return false;
2216
2217 HOST_WIDE_INT factor = value.coeffs[0];
2218 if (value.coeffs[1] != factor)
2219 return false;
2220
2221 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2222 if ((factor % nelts_per_vq) != 0
2223 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2224 return false;
2225
2226 if (factor_out)
2227 *factor_out = factor;
2228 if (nelts_per_vq_out)
2229 *nelts_per_vq_out = nelts_per_vq;
2230 return true;
2231 }
2232
2233 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2234 instruction. */
2235
2236 bool
2237 aarch64_sve_inc_dec_immediate_p (rtx x)
2238 {
2239 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2240 }
2241
2242 /* Return the asm template for an SVE vector INC or DEC instruction.
2243 OPERANDS gives the operands before the vector count and X is the
2244 value of the vector count operand itself. */
2245
2246 char *
2247 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2248 {
2249 int factor;
2250 unsigned int nelts_per_vq;
2251 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2252 gcc_unreachable ();
2253 if (factor < 0)
2254 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2255 nelts_per_vq);
2256 else
2257 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2258 nelts_per_vq);
2259 }
2260
2261 static int
2262 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2263 scalar_int_mode mode)
2264 {
2265 int i;
2266 unsigned HOST_WIDE_INT val, val2, mask;
2267 int one_match, zero_match;
2268 int num_insns;
2269
2270 val = INTVAL (imm);
2271
2272 if (aarch64_move_imm (val, mode))
2273 {
2274 if (generate)
2275 emit_insn (gen_rtx_SET (dest, imm));
2276 return 1;
2277 }
2278
2279 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2280 (with XXXX non-zero). In that case check to see if the move can be done in
2281 a smaller mode. */
2282 val2 = val & 0xffffffff;
2283 if (mode == DImode
2284 && aarch64_move_imm (val2, SImode)
2285 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2286 {
2287 if (generate)
2288 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2289
2290 /* Check whether we have to emit a second instruction by seeing if
2291 any of the upper 32 bits of the original DImode value are set. */
2292 if (val == val2)
2293 return 1;
2294
2295 i = (val >> 48) ? 48 : 32;
2296
2297 if (generate)
2298 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2299 GEN_INT ((val >> i) & 0xffff)));
2300
2301 return 2;
2302 }
2303
2304 if ((val >> 32) == 0 || mode == SImode)
2305 {
2306 if (generate)
2307 {
2308 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2309 if (mode == SImode)
2310 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2311 GEN_INT ((val >> 16) & 0xffff)));
2312 else
2313 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2314 GEN_INT ((val >> 16) & 0xffff)));
2315 }
2316 return 2;
2317 }
2318
2319 /* Remaining cases are all for DImode. */
2320
2321 mask = 0xffff;
2322 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2323 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2324 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2325 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2326
2327 if (zero_match != 2 && one_match != 2)
2328 {
2329 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2330 For a 64-bit bitmask try whether changing 16 bits to all ones or
2331 zeroes creates a valid bitmask. To check any repeated bitmask,
2332 try using 16 bits from the other 32-bit half of val. */
2333
2334 for (i = 0; i < 64; i += 16, mask <<= 16)
2335 {
2336 val2 = val & ~mask;
2337 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2338 break;
2339 val2 = val | mask;
2340 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2341 break;
2342 val2 = val2 & ~mask;
2343 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2344 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2345 break;
2346 }
2347 if (i != 64)
2348 {
2349 if (generate)
2350 {
2351 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2352 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2353 GEN_INT ((val >> i) & 0xffff)));
2354 }
2355 return 2;
2356 }
2357 }
2358
2359 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2360 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2361 otherwise skip zero bits. */
2362
2363 num_insns = 1;
2364 mask = 0xffff;
2365 val2 = one_match > zero_match ? ~val : val;
2366 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2367
2368 if (generate)
2369 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2370 ? (val | ~(mask << i))
2371 : (val & (mask << i)))));
2372 for (i += 16; i < 64; i += 16)
2373 {
2374 if ((val2 & (mask << i)) == 0)
2375 continue;
2376 if (generate)
2377 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2378 GEN_INT ((val >> i) & 0xffff)));
2379 num_insns ++;
2380 }
2381
2382 return num_insns;
2383 }
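
/* Worked examples (illustrative; the choice of x0 is arbitrary):

     0x0000000012345678: the upper 32 bits are zero, so two instructions
     suffice:
         mov  x0, #0x5678
         movk x0, #0x1234, lsl #16

     0x1234567890abcdef: no 16-bit chunk is all-zeros or all-ones and
     patching a single chunk cannot produce a valid bitmask immediate,
     so the generic four-instruction sequence is used:
         mov  x0, #0xcdef
         movk x0, #0x90ab, lsl #16
         movk x0, #0x5678, lsl #32
         movk x0, #0x1234, lsl #48  */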
2384
2385 /* Return whether imm is a 128-bit immediate which is simple enough to
2386 expand inline. */
2387 bool
2388 aarch64_mov128_immediate (rtx imm)
2389 {
2390 if (GET_CODE (imm) == CONST_INT)
2391 return true;
2392
2393 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2394
2395 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2396 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2397
2398 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2399 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2400 }
2401
2402
2403 /* Return the number of temporary registers that aarch64_add_offset_1
2404 would need to add OFFSET to a register. */
2405
2406 static unsigned int
2407 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2408 {
2409 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2410 }
2411
2412 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2413 a non-polynomial OFFSET. MODE is the mode of the addition.
2414 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2415 be set and CFA adjustments added to the generated instructions.
2416
2417 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2418 temporary if register allocation is already complete. This temporary
2419 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2420 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2421 the immediate again.
2422
2423 Since this function may be used to adjust the stack pointer, we must
2424 ensure that it cannot cause transient stack deallocation (for example
2425 by first incrementing SP and then decrementing when adjusting by a
2426 large immediate). */
2427
2428 static void
2429 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2430 rtx src, HOST_WIDE_INT offset, rtx temp1,
2431 bool frame_related_p, bool emit_move_imm)
2432 {
2433 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2434 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2435
2436 HOST_WIDE_INT moffset = abs_hwi (offset);
2437 rtx_insn *insn;
2438
2439 if (!moffset)
2440 {
2441 if (!rtx_equal_p (dest, src))
2442 {
2443 insn = emit_insn (gen_rtx_SET (dest, src));
2444 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2445 }
2446 return;
2447 }
2448
2449 /* Single instruction adjustment. */
2450 if (aarch64_uimm12_shift (moffset))
2451 {
2452 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2453 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2454 return;
2455 }
2456
2457 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2458 and either:
2459
2460 a) the offset cannot be loaded by a 16-bit move or
2461 b) there is no spare register into which we can move it. */
2462 if (moffset < 0x1000000
2463 && ((!temp1 && !can_create_pseudo_p ())
2464 || !aarch64_move_imm (moffset, mode)))
2465 {
2466 HOST_WIDE_INT low_off = moffset & 0xfff;
2467
2468 low_off = offset < 0 ? -low_off : low_off;
2469 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2470 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2471 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2472 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2473 return;
2474 }
2475
2476 /* Emit a move immediate if required and an addition/subtraction. */
2477 if (emit_move_imm)
2478 {
2479 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2480 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2481 }
2482 insn = emit_insn (offset < 0
2483 ? gen_sub3_insn (dest, src, temp1)
2484 : gen_add3_insn (dest, src, temp1));
2485 if (frame_related_p)
2486 {
2487 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2488 rtx adj = plus_constant (mode, src, offset);
2489 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2490 }
2491 }
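
/* Worked example (illustrative): adding OFFSET == 0x123456 to a register.
   The value is below 2^24 and cannot be loaded with a single MOV, so it
   is split into two immediate additions:

       add  dest, src, #0x456
       add  dest, dest, #0x123000

   Both steps adjust the register in the same direction, so when DEST is
   the stack pointer it never temporarily overshoots its final value.  */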
2492
2493 /* Return the number of temporary registers that aarch64_add_offset
2494 would need to move OFFSET into a register or add OFFSET to a register;
2495 ADD_P is true if we want the latter rather than the former. */
2496
2497 static unsigned int
2498 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2499 {
2500 /* This follows the same structure as aarch64_add_offset. */
2501 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2502 return 0;
2503
2504 unsigned int count = 0;
2505 HOST_WIDE_INT factor = offset.coeffs[1];
2506 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2507 poly_int64 poly_offset (factor, factor);
2508 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2509 /* Need one register for the ADDVL/ADDPL result. */
2510 count += 1;
2511 else if (factor != 0)
2512 {
2513 factor = abs (factor);
2514 if (factor > 16 * (factor & -factor))
2515 /* Need one register for the CNT result and one for the multiplication
2516 factor. If necessary, the second temporary can be reused for the
2517 constant part of the offset. */
2518 return 2;
2519 /* Need one register for the CNT result (which might then
2520 be shifted). */
2521 count += 1;
2522 }
2523 return count + aarch64_add_offset_1_temporaries (constant);
2524 }
2525
2526 /* If X can be represented as a poly_int64, return the number
2527 of temporaries that are required to add it to a register.
2528 Return -1 otherwise. */
2529
2530 int
2531 aarch64_add_offset_temporaries (rtx x)
2532 {
2533 poly_int64 offset;
2534 if (!poly_int_rtx_p (x, &offset))
2535 return -1;
2536 return aarch64_offset_temporaries (true, offset);
2537 }
2538
2539 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2540 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2541 be set and CFA adjustments added to the generated instructions.
2542
2543 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2544 temporary if register allocation is already complete. This temporary
2545 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2546 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2547 false to avoid emitting the immediate again.
2548
2549 TEMP2, if nonnull, is a second temporary register that doesn't
2550 overlap either DEST or SRC.
2551
2552 Since this function may be used to adjust the stack pointer, we must
2553 ensure that it cannot cause transient stack deallocation (for example
2554 by first incrementing SP and then decrementing when adjusting by a
2555 large immediate). */
2556
2557 static void
2558 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2559 poly_int64 offset, rtx temp1, rtx temp2,
2560 bool frame_related_p, bool emit_move_imm = true)
2561 {
2562 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2563 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2564 gcc_assert (temp1 == NULL_RTX
2565 || !frame_related_p
2566 || !reg_overlap_mentioned_p (temp1, dest));
2567 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2568
2569 /* Try using ADDVL or ADDPL to add the whole value. */
2570 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2571 {
2572 rtx offset_rtx = gen_int_mode (offset, mode);
2573 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2574 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2575 return;
2576 }
2577
2578 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2579 SVE vector register, over and above the minimum size of 128 bits.
2580 This is equivalent to half the value returned by CNTD with a
2581 vector shape of ALL. */
2582 HOST_WIDE_INT factor = offset.coeffs[1];
2583 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2584
2585 /* Try using ADDVL or ADDPL to add the VG-based part. */
2586 poly_int64 poly_offset (factor, factor);
2587 if (src != const0_rtx
2588 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2589 {
2590 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2591 if (frame_related_p)
2592 {
2593 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2594 RTX_FRAME_RELATED_P (insn) = true;
2595 src = dest;
2596 }
2597 else
2598 {
2599 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2600 src = aarch64_force_temporary (mode, temp1, addr);
2601 temp1 = temp2;
2602 temp2 = NULL_RTX;
2603 }
2604 }
2605 /* Otherwise use a CNT-based sequence. */
2606 else if (factor != 0)
2607 {
2608 /* Use a subtraction if we have a negative factor. */
2609 rtx_code code = PLUS;
2610 if (factor < 0)
2611 {
2612 factor = -factor;
2613 code = MINUS;
2614 }
2615
2616 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2617 into the multiplication. */
2618 rtx val;
2619 int shift = 0;
2620 if (factor & 1)
2621 /* Use a right shift by 1. */
2622 shift = -1;
2623 else
2624 factor /= 2;
2625 HOST_WIDE_INT low_bit = factor & -factor;
2626 if (factor <= 16 * low_bit)
2627 {
2628 if (factor > 16 * 8)
2629 {
2630 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2631 the value with the minimum multiplier and shift it into
2632 position. */
2633 int extra_shift = exact_log2 (low_bit);
2634 shift += extra_shift;
2635 factor >>= extra_shift;
2636 }
2637 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2638 }
2639 else
2640 {
2641 /* Use CNTD, then multiply it by FACTOR. */
2642 val = gen_int_mode (poly_int64 (2, 2), mode);
2643 val = aarch64_force_temporary (mode, temp1, val);
2644
2645 /* Go back to using a negative multiplication factor if we have
2646 no register from which to subtract. */
2647 if (code == MINUS && src == const0_rtx)
2648 {
2649 factor = -factor;
2650 code = PLUS;
2651 }
2652 rtx coeff1 = gen_int_mode (factor, mode);
2653 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2654 val = gen_rtx_MULT (mode, val, coeff1);
2655 }
2656
2657 if (shift > 0)
2658 {
2659 /* Multiply by 1 << SHIFT. */
2660 val = aarch64_force_temporary (mode, temp1, val);
2661 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2662 }
2663 else if (shift == -1)
2664 {
2665 /* Divide by 2. */
2666 val = aarch64_force_temporary (mode, temp1, val);
2667 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2668 }
2669
2670 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2671 if (src != const0_rtx)
2672 {
2673 val = aarch64_force_temporary (mode, temp1, val);
2674 val = gen_rtx_fmt_ee (code, mode, src, val);
2675 }
2676 else if (code == MINUS)
2677 {
2678 val = aarch64_force_temporary (mode, temp1, val);
2679 val = gen_rtx_NEG (mode, val);
2680 }
2681
2682 if (constant == 0 || frame_related_p)
2683 {
2684 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2685 if (frame_related_p)
2686 {
2687 RTX_FRAME_RELATED_P (insn) = true;
2688 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2689 gen_rtx_SET (dest, plus_constant (Pmode, src,
2690 poly_offset)));
2691 }
2692 src = dest;
2693 if (constant == 0)
2694 return;
2695 }
2696 else
2697 {
2698 src = aarch64_force_temporary (mode, temp1, val);
2699 temp1 = temp2;
2700 temp2 = NULL_RTX;
2701 }
2702
2703 emit_move_imm = true;
2704 }
2705
2706 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
2707 frame_related_p, emit_move_imm);
2708 }
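
/* Worked example (illustrative, assuming DEST is x0 and SRC is x1): an
   OFFSET of poly_int64 (24, 16), i.e. one SVE vector length in bytes
   plus 8, has FACTOR == 16 and CONSTANT == 24 - 16 == 8, so the sequence
   is roughly:

       addvl   x0, x1, #1
       add     x0, x0, #8

   (the exact temporaries used depend on FRAME_RELATED_P and the scratch
   registers available).  Offsets whose VG-based part is out of
   ADDVL/ADDPL range fall back to the CNT-based multiply sequence
   instead.  */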
2709
2710 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2711 than a poly_int64. */
2712
2713 void
2714 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2715 rtx offset_rtx, rtx temp1, rtx temp2)
2716 {
2717 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
2718 temp1, temp2, false);
2719 }
2720
2721 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2722 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
2723 if TEMP1 already contains abs (DELTA). */
2724
2725 static inline void
2726 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
2727 {
2728 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
2729 temp1, temp2, true, emit_move_imm);
2730 }
2731
2732 /* Subtract DELTA from the stack pointer, marking the instructions
2733 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
2734 if nonnull. */
2735
2736 static inline void
2737 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
2738 {
2739 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
2740 temp1, temp2, frame_related_p);
2741 }
2742
2743 /* Set DEST to (vec_series BASE STEP). */
2744
2745 static void
2746 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
2747 {
2748 machine_mode mode = GET_MODE (dest);
2749 scalar_mode inner = GET_MODE_INNER (mode);
2750
2751 /* Each operand can be a register or an immediate in the range [-16, 15]. */
2752 if (!aarch64_sve_index_immediate_p (base))
2753 base = force_reg (inner, base);
2754 if (!aarch64_sve_index_immediate_p (step))
2755 step = force_reg (inner, step);
2756
2757 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
2758 }
2759
2760 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2761 integer of mode SRC_MODE. Return true on success. */
2762
2763 static bool
2764 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
2765 rtx src)
2766 {
2767 /* If the constant is smaller than 128 bits, we can do the move
2768 using a vector of SRC_MODEs. */
2769 if (src_mode != TImode)
2770 {
2771 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
2772 GET_MODE_SIZE (src_mode));
2773 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
2774 emit_move_insn (gen_lowpart (dup_mode, dest),
2775 gen_const_vec_duplicate (dup_mode, src));
2776 return true;
2777 }
2778
2779 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
2780 src = force_const_mem (src_mode, src);
2781 if (!src)
2782 return false;
2783
2784 /* Make sure that the address is legitimate. */
2785 if (!aarch64_sve_ld1r_operand_p (src))
2786 {
2787 rtx addr = force_reg (Pmode, XEXP (src, 0));
2788 src = replace_equiv_address (src, addr);
2789 }
2790
2791 machine_mode mode = GET_MODE (dest);
2792 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
2793 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
2794 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
2795 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
2796 emit_insn (gen_rtx_SET (dest, src));
2797 return true;
2798 }
2799
2800 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2801 isn't a simple duplicate or series. */
2802
2803 static void
2804 aarch64_expand_sve_const_vector (rtx dest, rtx src)
2805 {
2806 machine_mode mode = GET_MODE (src);
2807 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
2808 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
2809 gcc_assert (npatterns > 1);
2810
2811 if (nelts_per_pattern == 1)
2812 {
2813 /* The constant is a repeating sequence of at least two elements,
2814 where the repeating elements occupy no more than 128 bits.
2815 Get an integer representation of the replicated value. */
2816 scalar_int_mode int_mode;
2817 if (BYTES_BIG_ENDIAN)
2818 /* For now, always use LD1RQ to load the value on big-endian
2819 targets, since the handling of smaller integers includes a
2820 subreg that is semantically an element reverse. */
2821 int_mode = TImode;
2822 else
2823 {
2824 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
2825 gcc_assert (int_bits <= 128);
2826 int_mode = int_mode_for_size (int_bits, 0).require ();
2827 }
2828 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
2829 if (int_value
2830 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
2831 return;
2832 }
2833
2834 /* Expand each pattern individually. */
2835 rtx_vector_builder builder;
2836 auto_vec<rtx, 16> vectors (npatterns);
2837 for (unsigned int i = 0; i < npatterns; ++i)
2838 {
2839 builder.new_vector (mode, 1, nelts_per_pattern);
2840 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
2841 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
2842 vectors.quick_push (force_reg (mode, builder.build ()));
2843 }
2844
2845 /* Use permutes to interleave the separate vectors. */
2846 while (npatterns > 1)
2847 {
2848 npatterns /= 2;
2849 for (unsigned int i = 0; i < npatterns; ++i)
2850 {
2851 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
2852 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
2853 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
2854 vectors[i] = tmp;
2855 }
2856 }
2857 gcc_assert (vectors[0] == dest);
2858 }
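
/* Illustrative example (register numbers are arbitrary): a VNx4SI
   constant with NPATTERNS == 2 and NELTS_PER_PATTERN == 3, such as
   { 1, 10, 2, 11, 3, 12, 4, 13, ... }, is built by expanding the two
   interleaved patterns { 1, 2, 3, 4, ... } and { 10, 11, 12, 13, ... }
   into separate registers and zipping them together:

       zip1    z0.s, z1.s, z2.s  */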
2859
2860 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
2861 is a pattern that can be used to set DEST to a replicated scalar
2862 element. */
2863
2864 void
2865 aarch64_expand_mov_immediate (rtx dest, rtx imm,
2866 rtx (*gen_vec_duplicate) (rtx, rtx))
2867 {
2868 machine_mode mode = GET_MODE (dest);
2869
2870 /* Check on what type of symbol it is. */
2871 scalar_int_mode int_mode;
2872 if ((GET_CODE (imm) == SYMBOL_REF
2873 || GET_CODE (imm) == LABEL_REF
2874 || GET_CODE (imm) == CONST
2875 || GET_CODE (imm) == CONST_POLY_INT)
2876 && is_a <scalar_int_mode> (mode, &int_mode))
2877 {
2878 rtx mem;
2879 poly_int64 offset;
2880 HOST_WIDE_INT const_offset;
2881 enum aarch64_symbol_type sty;
2882
2883 /* If we have (const (plus symbol offset)), separate out the offset
2884 before we start classifying the symbol. */
2885 rtx base = strip_offset (imm, &offset);
2886
2887 /* We must always add an offset involving VL separately, rather than
2888 folding it into the relocation. */
2889 if (!offset.is_constant (&const_offset))
2890 {
2891 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
2892 emit_insn (gen_rtx_SET (dest, imm));
2893 else
2894 {
2895 /* Do arithmetic on 32-bit values if the result is smaller
2896 than that. */
2897 if (partial_subreg_p (int_mode, SImode))
2898 {
2899 /* It is invalid to do symbol calculations in modes
2900 narrower than SImode. */
2901 gcc_assert (base == const0_rtx);
2902 dest = gen_lowpart (SImode, dest);
2903 int_mode = SImode;
2904 }
2905 if (base != const0_rtx)
2906 {
2907 base = aarch64_force_temporary (int_mode, dest, base);
2908 aarch64_add_offset (int_mode, dest, base, offset,
2909 NULL_RTX, NULL_RTX, false);
2910 }
2911 else
2912 aarch64_add_offset (int_mode, dest, base, offset,
2913 dest, NULL_RTX, false);
2914 }
2915 return;
2916 }
2917
2918 sty = aarch64_classify_symbol (base, const_offset);
2919 switch (sty)
2920 {
2921 case SYMBOL_FORCE_TO_MEM:
2922 if (const_offset != 0
2923 && targetm.cannot_force_const_mem (int_mode, imm))
2924 {
2925 gcc_assert (can_create_pseudo_p ());
2926 base = aarch64_force_temporary (int_mode, dest, base);
2927 aarch64_add_offset (int_mode, dest, base, const_offset,
2928 NULL_RTX, NULL_RTX, false);
2929 return;
2930 }
2931
2932 mem = force_const_mem (ptr_mode, imm);
2933 gcc_assert (mem);
2934
2935 /* If we aren't generating PC relative literals, then
2936 we need to expand the literal pool access carefully.
2937 This is something that needs to be done in a number
2938 of places, so could well live as a separate function. */
2939 if (!aarch64_pcrelative_literal_loads)
2940 {
2941 gcc_assert (can_create_pseudo_p ());
2942 base = gen_reg_rtx (ptr_mode);
2943 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
2944 if (ptr_mode != Pmode)
2945 base = convert_memory_address (Pmode, base);
2946 mem = gen_rtx_MEM (ptr_mode, base);
2947 }
2948
2949 if (int_mode != ptr_mode)
2950 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
2951
2952 emit_insn (gen_rtx_SET (dest, mem));
2953
2954 return;
2955
2956 case SYMBOL_SMALL_TLSGD:
2957 case SYMBOL_SMALL_TLSDESC:
2958 case SYMBOL_SMALL_TLSIE:
2959 case SYMBOL_SMALL_GOT_28K:
2960 case SYMBOL_SMALL_GOT_4G:
2961 case SYMBOL_TINY_GOT:
2962 case SYMBOL_TINY_TLSIE:
2963 if (const_offset != 0)
2964 {
2965 gcc_assert (can_create_pseudo_p ());
2966 base = aarch64_force_temporary (int_mode, dest, base);
2967 aarch64_add_offset (int_mode, dest, base, const_offset,
2968 NULL_RTX, NULL_RTX, false);
2969 return;
2970 }
2971 /* FALLTHRU */
2972
2973 case SYMBOL_SMALL_ABSOLUTE:
2974 case SYMBOL_TINY_ABSOLUTE:
2975 case SYMBOL_TLSLE12:
2976 case SYMBOL_TLSLE24:
2977 case SYMBOL_TLSLE32:
2978 case SYMBOL_TLSLE48:
2979 aarch64_load_symref_appropriately (dest, imm, sty);
2980 return;
2981
2982 default:
2983 gcc_unreachable ();
2984 }
2985 }
2986
2987 if (!CONST_INT_P (imm))
2988 {
2989 rtx base, step, value;
2990 if (GET_CODE (imm) == HIGH
2991 || aarch64_simd_valid_immediate (imm, NULL))
2992 emit_insn (gen_rtx_SET (dest, imm));
2993 else if (const_vec_series_p (imm, &base, &step))
2994 aarch64_expand_vec_series (dest, base, step);
2995 else if (const_vec_duplicate_p (imm, &value))
2996 {
2997 /* If the constant is out of range of an SVE vector move,
2998 load it from memory if we can, otherwise move it into
2999 a register and use a DUP. */
3000 scalar_mode inner_mode = GET_MODE_INNER (mode);
3001 rtx op = force_const_mem (inner_mode, value);
3002 if (!op)
3003 op = force_reg (inner_mode, value);
3004 else if (!aarch64_sve_ld1r_operand_p (op))
3005 {
3006 rtx addr = force_reg (Pmode, XEXP (op, 0));
3007 op = replace_equiv_address (op, addr);
3008 }
3009 emit_insn (gen_vec_duplicate (dest, op));
3010 }
3011 else if (GET_CODE (imm) == CONST_VECTOR
3012 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3013 aarch64_expand_sve_const_vector (dest, imm);
3014 else
3015 {
3016 rtx mem = force_const_mem (mode, imm);
3017 gcc_assert (mem);
3018 emit_move_insn (dest, mem);
3019 }
3020
3021 return;
3022 }
3023
3024 aarch64_internal_mov_immediate (dest, imm, true,
3025 as_a <scalar_int_mode> (mode));
3026 }
3027
3028 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3029 that is known to contain PTRUE. */
3030
3031 void
3032 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3033 {
3034 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3035 gen_rtvec (2, pred, src),
3036 UNSPEC_MERGE_PTRUE)));
3037 }
3038
3039 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3040 operand is in memory. In this case we need to use the predicated LD1
3041 and ST1 instead of LDR and STR, both for correctness on big-endian
3042 targets and because LD1 and ST1 support a wider range of addressing modes.
3043 PRED_MODE is the mode of the predicate.
3044
3045 See the comment at the head of aarch64-sve.md for details about the
3046 big-endian handling. */
3047
3048 void
3049 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3050 {
3051 machine_mode mode = GET_MODE (dest);
3052 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3053 if (!register_operand (src, mode)
3054 && !register_operand (dest, mode))
3055 {
3056 rtx tmp = gen_reg_rtx (mode);
3057 if (MEM_P (src))
3058 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3059 else
3060 emit_move_insn (tmp, src);
3061 src = tmp;
3062 }
3063 aarch64_emit_sve_pred_move (dest, ptrue, src);
3064 }
3065
3066 /* Called only on big-endian targets. See whether an SVE vector move
3067 from SRC to DEST is effectively a REV[BHW] instruction, because at
3068 least one operand is a subreg of an SVE vector that has wider or
3069 narrower elements. Return true and emit the instruction if so.
3070
3071 For example:
3072
3073 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3074
3075 represents a VIEW_CONVERT between the following vectors, viewed
3076 in memory order:
3077
3078 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3079 R1: { [0], [1], [2], [3], ... }
3080
3081 The high part of lane X in R2 should therefore correspond to lane X*2
3082 of R1, but the register representations are:
3083
3084 msb lsb
3085 R2: ...... [1].high [1].low [0].high [0].low
3086 R1: ...... [3] [2] [1] [0]
3087
3088 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3089 We therefore need a reverse operation to swap the high and low values
3090 around.
3091
3092 This is purely an optimization. Without it we would spill the
3093 subreg operand to the stack in one mode and reload it in the
3094 other mode, which has the same effect as the REV. */
3095
3096 bool
3097 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3098 {
3099 gcc_assert (BYTES_BIG_ENDIAN);
3100 if (GET_CODE (dest) == SUBREG)
3101 dest = SUBREG_REG (dest);
3102 if (GET_CODE (src) == SUBREG)
3103 src = SUBREG_REG (src);
3104
3105 /* The optimization handles two single SVE REGs with different element
3106 sizes. */
3107 if (!REG_P (dest)
3108 || !REG_P (src)
3109 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3110 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3111 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3112 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3113 return false;
3114
3115 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3116 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3117 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3118 UNSPEC_REV_SUBREG);
3119 emit_insn (gen_rtx_SET (dest, unspec));
3120 return true;
3121 }
3122
3123 /* Return a copy of X with mode MODE, without changing its other
3124 attributes. Unlike gen_lowpart, this doesn't care whether the
3125 mode change is valid. */
3126
3127 static rtx
3128 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3129 {
3130 if (GET_MODE (x) == mode)
3131 return x;
3132
3133 x = shallow_copy_rtx (x);
3134 set_mode_and_regno (x, mode, REGNO (x));
3135 return x;
3136 }
3137
3138 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3139 operands. */
3140
3141 void
3142 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3143 {
3144 /* Decide which REV operation we need. The mode with narrower elements
3145 determines the mode of the operands and the mode with the wider
3146 elements determines the reverse width. */
3147 machine_mode mode_with_wider_elts = GET_MODE (dest);
3148 machine_mode mode_with_narrower_elts = GET_MODE (src);
3149 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3150 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3151 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3152
3153 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3154 unsigned int unspec;
3155 if (wider_bytes == 8)
3156 unspec = UNSPEC_REV64;
3157 else if (wider_bytes == 4)
3158 unspec = UNSPEC_REV32;
3159 else if (wider_bytes == 2)
3160 unspec = UNSPEC_REV16;
3161 else
3162 gcc_unreachable ();
3163 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3164
3165 /* Emit:
3166
3167 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3168 UNSPEC_MERGE_PTRUE))
3169
3170 with the appropriate modes. */
3171 ptrue = gen_lowpart (pred_mode, ptrue);
3172 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3173 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3174 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3175 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3176 UNSPEC_MERGE_PTRUE);
3177 emit_insn (gen_rtx_SET (dest, src));
3178 }
3179
3180 static bool
3181 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3182 tree exp ATTRIBUTE_UNUSED)
3183 {
3184 /* Currently, always true. */
3185 return true;
3186 }
3187
3188 /* Implement TARGET_PASS_BY_REFERENCE. */
3189
3190 static bool
3191 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3192 machine_mode mode,
3193 const_tree type,
3194 bool named ATTRIBUTE_UNUSED)
3195 {
3196 HOST_WIDE_INT size;
3197 machine_mode dummymode;
3198 int nregs;
3199
3200 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3201 if (mode == BLKmode && type)
3202 size = int_size_in_bytes (type);
3203 else
3204 /* No frontends can create types with variable-sized modes, so we
3205 shouldn't be asked to pass or return them. */
3206 size = GET_MODE_SIZE (mode).to_constant ();
3207
3208 /* Aggregates are passed by reference based on their size. */
3209 if (type && AGGREGATE_TYPE_P (type))
3210 {
3211 size = int_size_in_bytes (type);
3212 }
3213
3214 /* Variable-sized arguments are always passed by reference. */
3215 if (size < 0)
3216 return true;
3217
3218 /* Can this be a candidate to be passed in fp/simd register(s)? */
3219 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3220 &dummymode, &nregs,
3221 NULL))
3222 return false;
3223
3224 /* Arguments that are variable-sized or larger than 2 registers are
3225 passed by reference unless they are a homogeneous floating-point
3226 aggregate. */
3227 return size > 2 * UNITS_PER_WORD;
3228 }
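
/* Illustrative examples of the rules above: a struct of four doubles
   (32 bytes) is a homogeneous floating-point aggregate and is therefore
   passed in SIMD/FP registers rather than by reference, whereas a struct
   of four long longs (also 32 bytes) is not an fp/simd candidate and,
   being larger than two GP registers, is passed by reference.  */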
3229
3230 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3231 static bool
3232 aarch64_return_in_msb (const_tree valtype)
3233 {
3234 machine_mode dummy_mode;
3235 int dummy_int;
3236
3237 /* Never happens in little-endian mode. */
3238 if (!BYTES_BIG_ENDIAN)
3239 return false;
3240
3241 /* Only composite types smaller than or equal to 16 bytes can
3242 be potentially returned in registers. */
3243 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3244 || int_size_in_bytes (valtype) <= 0
3245 || int_size_in_bytes (valtype) > 16)
3246 return false;
3247
3248 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3249 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3250 is always passed/returned in the least significant bits of fp/simd
3251 register(s). */
3252 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3253 &dummy_mode, &dummy_int, NULL))
3254 return false;
3255
3256 return true;
3257 }
3258
3259 /* Implement TARGET_FUNCTION_VALUE.
3260 Define how to find the value returned by a function. */
3261
3262 static rtx
3263 aarch64_function_value (const_tree type, const_tree func,
3264 bool outgoing ATTRIBUTE_UNUSED)
3265 {
3266 machine_mode mode;
3267 int unsignedp;
3268 int count;
3269 machine_mode ag_mode;
3270
3271 mode = TYPE_MODE (type);
3272 if (INTEGRAL_TYPE_P (type))
3273 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3274
3275 if (aarch64_return_in_msb (type))
3276 {
3277 HOST_WIDE_INT size = int_size_in_bytes (type);
3278
3279 if (size % UNITS_PER_WORD != 0)
3280 {
3281 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3282 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3283 }
3284 }
3285
3286 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3287 &ag_mode, &count, NULL))
3288 {
3289 if (!aarch64_composite_type_p (type, mode))
3290 {
3291 gcc_assert (count == 1 && mode == ag_mode);
3292 return gen_rtx_REG (mode, V0_REGNUM);
3293 }
3294 else
3295 {
3296 int i;
3297 rtx par;
3298
3299 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3300 for (i = 0; i < count; i++)
3301 {
3302 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3303 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3304 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3305 XVECEXP (par, 0, i) = tmp;
3306 }
3307 return par;
3308 }
3309 }
3310 else
3311 return gen_rtx_REG (mode, R0_REGNUM);
3312 }
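
/* Illustrative example: for an HFA such as struct { float x, y, z, w; },
   COUNT is 4 and AG_MODE is SFmode, so the result is a PARALLEL of
   (reg:SF s0) ... (reg:SF s3) with byte offsets 0, 4, 8 and 12.  A plain
   64-bit integer is simply returned in x0.  */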
3313
3314 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3315 Return true if REGNO is the number of a hard register in which the values
3316 of called function may come back. */
3317
3318 static bool
3319 aarch64_function_value_regno_p (const unsigned int regno)
3320 {
3321 /* A maximum of 16 bytes can be returned in the general registers. Examples
3322 of 16-byte return values are: 128-bit integers and 16-byte small
3323 structures (excluding homogeneous floating-point aggregates). */
3324 if (regno == R0_REGNUM || regno == R1_REGNUM)
3325 return true;
3326
3327 /* Up to four fp/simd registers can return a function value, e.g. a
3328 homogeneous floating-point aggregate having four members. */
3329 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3330 return TARGET_FLOAT;
3331
3332 return false;
3333 }
3334
3335 /* Implement TARGET_RETURN_IN_MEMORY.
3336
3337 If the type T of the result of a function is such that
3338 void func (T arg)
3339 would require that arg be passed as a value in a register (or set of
3340 registers) according to the parameter passing rules, then the result
3341 is returned in the same registers as would be used for such an
3342 argument. */
3343
3344 static bool
3345 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3346 {
3347 HOST_WIDE_INT size;
3348 machine_mode ag_mode;
3349 int count;
3350
3351 if (!AGGREGATE_TYPE_P (type)
3352 && TREE_CODE (type) != COMPLEX_TYPE
3353 && TREE_CODE (type) != VECTOR_TYPE)
3354 /* Simple scalar types are always returned in registers. */
3355 return false;
3356
3357 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3358 type,
3359 &ag_mode,
3360 &count,
3361 NULL))
3362 return false;
3363
3364 /* Types larger than 2 registers are returned in memory. */
3365 size = int_size_in_bytes (type);
3366 return (size < 0 || size > 2 * UNITS_PER_WORD);
3367 }
3368
3369 static bool
3370 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3371 const_tree type, int *nregs)
3372 {
3373 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3374 return aarch64_vfp_is_call_or_return_candidate (mode,
3375 type,
3376 &pcum->aapcs_vfp_rmode,
3377 nregs,
3378 NULL);
3379 }
3380
3381 /* Given MODE and TYPE of a function argument, return the alignment in
3382 bits. The idea is to suppress any stronger alignment requested by
3383 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3384 This is a helper function for local use only. */
3385
3386 static unsigned int
3387 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3388 {
3389 if (!type)
3390 return GET_MODE_ALIGNMENT (mode);
3391
3392 if (integer_zerop (TYPE_SIZE (type)))
3393 return 0;
3394
3395 gcc_assert (TYPE_MODE (type) == mode);
3396
3397 if (!AGGREGATE_TYPE_P (type))
3398 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3399
3400 if (TREE_CODE (type) == ARRAY_TYPE)
3401 return TYPE_ALIGN (TREE_TYPE (type));
3402
3403 unsigned int alignment = 0;
3404 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3405 if (TREE_CODE (field) == FIELD_DECL)
3406 alignment = std::max (alignment, DECL_ALIGN (field));
3407
3408 return alignment;
3409 }
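
/* Illustrative example: for

       struct S { long long x; char y; } __attribute__ ((aligned (16)));

   the 16-byte alignment requested on the struct itself is ignored and
   the result is the largest field alignment, 64 bits, which is what the
   AAPCS64 layout rules are based on.  */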
3410
3411 /* Layout a function argument according to the AAPCS64 rules. The rule
3412 numbers refer to the rule numbers in the AAPCS64. */
3413
3414 static void
3415 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3416 const_tree type,
3417 bool named ATTRIBUTE_UNUSED)
3418 {
3419 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3420 int ncrn, nvrn, nregs;
3421 bool allocate_ncrn, allocate_nvrn;
3422 HOST_WIDE_INT size;
3423
3424 /* We need to do this once per argument. */
3425 if (pcum->aapcs_arg_processed)
3426 return;
3427
3428 pcum->aapcs_arg_processed = true;
3429
3430 /* Size in bytes, rounded up to a multiple of 8 bytes. */
3431 if (type)
3432 size = int_size_in_bytes (type);
3433 else
3434 /* No frontends can create types with variable-sized modes, so we
3435 shouldn't be asked to pass or return them. */
3436 size = GET_MODE_SIZE (mode).to_constant ();
3437 size = ROUND_UP (size, UNITS_PER_WORD);
3438
3439 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3440 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3441 mode,
3442 type,
3443 &nregs);
3444
3445 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3446 The following code thus handles passing by SIMD/FP registers first. */
3447
3448 nvrn = pcum->aapcs_nvrn;
3449
3450 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3451 and homogeneous short-vector aggregates (HVA). */
3452 if (allocate_nvrn)
3453 {
3454 if (!TARGET_FLOAT)
3455 aarch64_err_no_fpadvsimd (mode);
3456
3457 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3458 {
3459 pcum->aapcs_nextnvrn = nvrn + nregs;
3460 if (!aarch64_composite_type_p (type, mode))
3461 {
3462 gcc_assert (nregs == 1);
3463 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3464 }
3465 else
3466 {
3467 rtx par;
3468 int i;
3469 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3470 for (i = 0; i < nregs; i++)
3471 {
3472 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3473 V0_REGNUM + nvrn + i);
3474 rtx offset = gen_int_mode
3475 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3476 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3477 XVECEXP (par, 0, i) = tmp;
3478 }
3479 pcum->aapcs_reg = par;
3480 }
3481 return;
3482 }
3483 else
3484 {
3485 /* C.3 NSRN is set to 8. */
3486 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3487 goto on_stack;
3488 }
3489 }
3490
3491 ncrn = pcum->aapcs_ncrn;
3492 nregs = size / UNITS_PER_WORD;
3493
3494 /* C6 - C9, though the sign and zero extension semantics are
3495 handled elsewhere. This is the case where the argument fits
3496 entirely in general registers. */
3497 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3498 {
3499
3500 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3501
3502 /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
3503 rounded up to the next even number. */
3504 if (nregs == 2
3505 && ncrn % 2
3506 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3507 comparison is there because for > 16 * BITS_PER_UNIT
3508 alignment nregs should be > 2 and therefore it should be
3509 passed by reference rather than value. */
3510 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3511 {
3512 ++ncrn;
3513 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3514 }
3515
3516 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3517 A reg is still generated for it, but the caller should be smart
3518 enough not to use it. */
3519 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3520 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3521 else
3522 {
3523 rtx par;
3524 int i;
3525
3526 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3527 for (i = 0; i < nregs; i++)
3528 {
3529 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3530 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3531 GEN_INT (i * UNITS_PER_WORD));
3532 XVECEXP (par, 0, i) = tmp;
3533 }
3534 pcum->aapcs_reg = par;
3535 }
3536
3537 pcum->aapcs_nextncrn = ncrn + nregs;
3538 return;
3539 }
3540
3541 /* C.11 */
3542 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3543
3544 /* The argument is passed on the stack; record the needed number of words for
3545 this argument and align the total size if necessary. */
3546 on_stack:
3547 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3548
3549 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3550 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3551 16 / UNITS_PER_WORD);
3552 return;
3553 }
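
/* Illustrative example of rule C.8 above: if x0 is already in use
   (NCRN == 1) and the next argument is an __int128, which needs two GP
   registers and has 16-byte alignment, NCRN is first rounded up to 2 so
   that the argument is passed in the aligned pair x2/x3 rather than
   straddling x1/x2.  */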
3554
3555 /* Implement TARGET_FUNCTION_ARG. */
3556
3557 static rtx
3558 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3559 const_tree type, bool named)
3560 {
3561 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3562 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3563
3564 if (mode == VOIDmode)
3565 return NULL_RTX;
3566
3567 aarch64_layout_arg (pcum_v, mode, type, named);
3568 return pcum->aapcs_reg;
3569 }
3570
3571 void
3572 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3573 const_tree fntype ATTRIBUTE_UNUSED,
3574 rtx libname ATTRIBUTE_UNUSED,
3575 const_tree fndecl ATTRIBUTE_UNUSED,
3576 unsigned n_named ATTRIBUTE_UNUSED)
3577 {
3578 pcum->aapcs_ncrn = 0;
3579 pcum->aapcs_nvrn = 0;
3580 pcum->aapcs_nextncrn = 0;
3581 pcum->aapcs_nextnvrn = 0;
3582 pcum->pcs_variant = ARM_PCS_AAPCS64;
3583 pcum->aapcs_reg = NULL_RTX;
3584 pcum->aapcs_arg_processed = false;
3585 pcum->aapcs_stack_words = 0;
3586 pcum->aapcs_stack_size = 0;
3587
3588 if (!TARGET_FLOAT
3589 && fndecl && TREE_PUBLIC (fndecl)
3590 && fntype && fntype != error_mark_node)
3591 {
3592 const_tree type = TREE_TYPE (fntype);
3593 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
3594 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
3595 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3596 &mode, &nregs, NULL))
3597 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
3598 }
3599 return;
3600 }
3601
3602 static void
3603 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3604 machine_mode mode,
3605 const_tree type,
3606 bool named)
3607 {
3608 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3609 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3610 {
3611 aarch64_layout_arg (pcum_v, mode, type, named);
3612 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3613 != (pcum->aapcs_stack_words != 0));
3614 pcum->aapcs_arg_processed = false;
3615 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3616 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3617 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3618 pcum->aapcs_stack_words = 0;
3619 pcum->aapcs_reg = NULL_RTX;
3620 }
3621 }
3622
3623 bool
3624 aarch64_function_arg_regno_p (unsigned regno)
3625 {
3626 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3627 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3628 }
3629
3630 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3631 PARM_BOUNDARY bits of alignment, but will be given anything up
3632 to STACK_BOUNDARY bits if the type requires it. This makes sure
3633 that both before and after the layout of each argument, the Next
3634 Stacked Argument Address (NSAA) will have a minimum alignment of
3635 8 bytes. */
3636
3637 static unsigned int
3638 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3639 {
3640 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3641 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
3642 }
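/* For example (illustrative, with the usual AArch64 values PARM_BOUNDARY == 64
and STACK_BOUNDARY == 128): a plain 'char' argument is given 64 bits of
alignment, a 16-byte-aligned structure is given 128 bits, and an over-aligned
type asking for 256 bits is clamped back down to 128. */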
3643
3644 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3645
3646 static fixed_size_mode
3647 aarch64_get_reg_raw_mode (int regno)
3648 {
3649 if (TARGET_SVE && FP_REGNUM_P (regno))
3650 /* Don't use the SVE part of the register for __builtin_apply and
3651 __builtin_return. The SVE registers aren't used by the normal PCS,
3652 so using them there would be a waste of time. The PCS extensions
3653 for SVE types are fundamentally incompatible with the
3654 __builtin_return/__builtin_apply interface. */
3655 return as_a <fixed_size_mode> (V16QImode);
3656 return default_get_reg_raw_mode (regno);
3657 }
3658
3659 /* Implement TARGET_FUNCTION_ARG_PADDING.
3660
3661 Small aggregate types are placed at the lowest memory address.
3662
3663 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3664
3665 static pad_direction
3666 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3667 {
3668 /* On little-endian targets, the least significant byte of every stack
3669 argument is passed at the lowest byte address of the stack slot. */
3670 if (!BYTES_BIG_ENDIAN)
3671 return PAD_UPWARD;
3672
3673 /* Otherwise, integral, floating-point and pointer types are padded downward:
3674 the least significant byte of a stack argument is passed at the highest
3675 byte address of the stack slot. */
3676 if (type
3677 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3678 || POINTER_TYPE_P (type))
3679 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3680 return PAD_DOWNWARD;
3681
3682 /* Everything else is padded upward, i.e. data in first byte of stack slot. */
3683 return PAD_UPWARD;
3684 }
3685
3686 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3687
3688 It specifies padding for the last (possibly the only)
3689 element of a block move between registers and memory.
3690 Assuming the block is in memory, padding upward means that
3691 the last element is padded after its most significant byte,
3692 while with downward padding the last element is padded at
3693 its least significant byte side.
3694
3695 Small aggregates and small complex types are always padded
3696 upwards.
3697
3698 We don't need to worry about homogeneous floating-point or
3699 short-vector aggregates; their move is not affected by the
3700 padding direction determined here. Regardless of endianness,
3701 each element of such an aggregate is put in the least
3702 significant bits of a fp/simd register.
3703
3704 Return !BYTES_BIG_ENDIAN if the least significant byte of the
3705 register has useful data, and return the opposite if the most
3706 significant byte does. */
3707
3708 bool
3709 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
3710 bool first ATTRIBUTE_UNUSED)
3711 {
3712
3713 /* Small composite types are always padded upward. */
3714 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
3715 {
3716 HOST_WIDE_INT size;
3717 if (type)
3718 size = int_size_in_bytes (type);
3719 else
3720 /* No frontends can create types with variable-sized modes, so we
3721 shouldn't be asked to pass or return them. */
3722 size = GET_MODE_SIZE (mode).to_constant ();
3723 if (size < 2 * UNITS_PER_WORD)
3724 return true;
3725 }
3726
3727 /* Otherwise, use the default padding. */
3728 return !BYTES_BIG_ENDIAN;
3729 }
3730
3731 static scalar_int_mode
3732 aarch64_libgcc_cmp_return_mode (void)
3733 {
3734 return SImode;
3735 }
3736
3737 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3738
3739 /* We use the 12-bit shifted immediate arithmetic instructions so values
3740 must be multiple of (1 << 12), i.e. 4096. */
3741 #define ARITH_FACTOR 4096
3742
3743 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3744 #error Cannot use simple address calculation for stack probing
3745 #endif
3746
3747 /* The pair of scratch registers used for stack probing. */
3748 #define PROBE_STACK_FIRST_REG 9
3749 #define PROBE_STACK_SECOND_REG 10
3750
3751 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3752 inclusive. These are offsets from the current stack pointer. */
3753
3754 static void
3755 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
3756 {
3757 HOST_WIDE_INT size;
3758 if (!poly_size.is_constant (&size))
3759 {
3760 sorry ("stack probes for SVE frames");
3761 return;
3762 }
3763
3764 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
3765
3766 /* See the same assertion on PROBE_INTERVAL above. */
3767 gcc_assert ((first % ARITH_FACTOR) == 0);
3768
3769 /* See if we have a constant small number of probes to generate. If so,
3770 that's the easy case. */
3771 if (size <= PROBE_INTERVAL)
3772 {
3773 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
3774
3775 emit_set_insn (reg1,
3776 plus_constant (Pmode,
3777 stack_pointer_rtx, -(first + base)));
3778 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
3779 }
3780
3781 /* The run-time loop is made up of 8 insns in the generic case while the
3782 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
3783 else if (size <= 4 * PROBE_INTERVAL)
3784 {
3785 HOST_WIDE_INT i, rem;
3786
3787 emit_set_insn (reg1,
3788 plus_constant (Pmode,
3789 stack_pointer_rtx,
3790 -(first + PROBE_INTERVAL)));
3791 emit_stack_probe (reg1);
3792
3793 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3794 it exceeds SIZE. If only two probes are needed, this will not
3795 generate any code. Then probe at FIRST + SIZE. */
3796 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
3797 {
3798 emit_set_insn (reg1,
3799 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
3800 emit_stack_probe (reg1);
3801 }
3802
3803 rem = size - (i - PROBE_INTERVAL);
3804 if (rem > 256)
3805 {
3806 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3807
3808 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
3809 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
3810 }
3811 else
3812 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
3813 }
3814
3815 /* Otherwise, do the same as above, but in a loop. Note that we must be
3816 extra careful with variables wrapping around because we might be at
3817 the very top (or the very bottom) of the address space and we have
3818 to be able to handle this case properly; in particular, we use an
3819 equality test for the loop condition. */
3820 else
3821 {
3822 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
3823
3824 /* Step 1: round SIZE to the previous multiple of the interval. */
3825
3826 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
3827
3828
3829 /* Step 2: compute initial and final value of the loop counter. */
3830
3831 /* TEST_ADDR = SP + FIRST. */
3832 emit_set_insn (reg1,
3833 plus_constant (Pmode, stack_pointer_rtx, -first));
3834
3835 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
3836 HOST_WIDE_INT adjustment = - (first + rounded_size);
3837 if (! aarch64_uimm12_shift (adjustment))
3838 {
3839 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
3840 true, Pmode);
3841 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
3842 }
3843 else
3844 emit_set_insn (reg2,
3845 plus_constant (Pmode, stack_pointer_rtx, adjustment));
3846
3847 /* Step 3: the loop
3848
3849 do
3850 {
3851 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3852 probe at TEST_ADDR
3853 }
3854 while (TEST_ADDR != LAST_ADDR)
3855
3856 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3857 until it is equal to ROUNDED_SIZE. */
3858
3859 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
3860
3861
3862 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3863 that SIZE is equal to ROUNDED_SIZE. */
3864
3865 if (size != rounded_size)
3866 {
3867 HOST_WIDE_INT rem = size - rounded_size;
3868
3869 if (rem > 256)
3870 {
3871 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3872
3873 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
3874 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
3875 }
3876 else
3877 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
3878 }
3879 }
3880
3881 /* Make sure nothing is scheduled before we are done. */
3882 emit_insn (gen_blockage ());
3883 }
3884
3885 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
3886 absolute addresses. */
3887
3888 const char *
3889 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
3890 {
3891 static int labelno = 0;
3892 char loop_lab[32];
3893 rtx xops[2];
3894
3895 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
3896
3897 /* Loop. */
3898 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
3899
3900 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
3901 xops[0] = reg1;
3902 xops[1] = GEN_INT (PROBE_INTERVAL);
3903 output_asm_insn ("sub\t%0, %0, %1", xops);
3904
3905 /* Probe at TEST_ADDR. */
3906 output_asm_insn ("str\txzr, [%0]", xops);
3907
3908 /* Test if TEST_ADDR == LAST_ADDR. */
3909 xops[1] = reg2;
3910 output_asm_insn ("cmp\t%0, %1", xops);
3911
3912 /* Branch. */
3913 fputs ("\tb.ne\t", asm_out_file);
3914 assemble_name_raw (asm_out_file, loop_lab);
3915 fputc ('\n', asm_out_file);
3916
3917 return "";
3918 }
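/* Illustrative output of the loop emitted above, assuming the scratch
registers x9/x10 (PROBE_STACK_FIRST_REG/PROBE_STACK_SECOND_REG) and a
4096-byte probe interval:

.LPSRL0:
	sub	x9, x9, 4096
	str	xzr, [x9]
	cmp	x9, x10
	b.ne	.LPSRL0  */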
3919
3920 /* Determine whether a frame chain needs to be generated. */
3921 static bool
3922 aarch64_needs_frame_chain (void)
3923 {
3924 /* Force a frame chain for EH returns so the return address is at FP+8. */
3925 if (frame_pointer_needed || crtl->calls_eh_return)
3926 return true;
3927
3928 /* A leaf function cannot have calls or write LR. */
3929 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
3930
3931 /* Don't use a frame chain in leaf functions if leaf frame pointers
3932 are disabled. */
3933 if (flag_omit_leaf_frame_pointer && is_leaf)
3934 return false;
3935
3936 return aarch64_use_frame_pointer;
3937 }
3938
3939 /* Mark the registers that need to be saved by the callee and calculate
3940 the size of the callee-saved registers area and frame record (both FP
3941 and LR may be omitted). */
3942 static void
3943 aarch64_layout_frame (void)
3944 {
3945 HOST_WIDE_INT offset = 0;
3946 int regno, last_fp_reg = INVALID_REGNUM;
3947
3948 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
3949
3950 #define SLOT_NOT_REQUIRED (-2)
3951 #define SLOT_REQUIRED (-1)
3952
3953 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
3954 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
3955
3956 /* First mark all the registers that really need to be saved... */
3957 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3958 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
3959
3960 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
3961 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
3962
3963 /* ... that includes the eh data registers (if needed)... */
3964 if (crtl->calls_eh_return)
3965 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
3966 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
3967 = SLOT_REQUIRED;
3968
3969 /* ... and any callee saved register that dataflow says is live. */
3970 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3971 if (df_regs_ever_live_p (regno)
3972 && (regno == R30_REGNUM
3973 || !call_used_regs[regno]))
3974 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
3975
3976 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
3977 if (df_regs_ever_live_p (regno)
3978 && !call_used_regs[regno])
3979 {
3980 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
3981 last_fp_reg = regno;
3982 }
3983
3984 if (cfun->machine->frame.emit_frame_chain)
3985 {
3986 /* FP and LR are placed in the linkage record. */
3987 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
3988 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
3989 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
3990 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
3991 offset = 2 * UNITS_PER_WORD;
3992 }
3993
3994 /* Now assign stack slots for them. */
3995 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3996 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
3997 {
3998 cfun->machine->frame.reg_offset[regno] = offset;
3999 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4000 cfun->machine->frame.wb_candidate1 = regno;
4001 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4002 cfun->machine->frame.wb_candidate2 = regno;
4003 offset += UNITS_PER_WORD;
4004 }
4005
4006 HOST_WIDE_INT max_int_offset = offset;
4007 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4008 bool has_align_gap = offset != max_int_offset;
4009
4010 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4011 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4012 {
4013 /* If there is an alignment gap between integer and fp callee-saves,
4014 allocate the last fp register to it if possible. */
4015 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
4016 {
4017 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4018 break;
4019 }
4020
4021 cfun->machine->frame.reg_offset[regno] = offset;
4022 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4023 cfun->machine->frame.wb_candidate1 = regno;
4024 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4025 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4026 cfun->machine->frame.wb_candidate2 = regno;
4027 offset += UNITS_PER_WORD;
4028 }
4029
4030 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4031
4032 cfun->machine->frame.saved_regs_size = offset;
4033
4034 HOST_WIDE_INT varargs_and_saved_regs_size
4035 = offset + cfun->machine->frame.saved_varargs_size;
4036
4037 cfun->machine->frame.hard_fp_offset
4038 = aligned_upper_bound (varargs_and_saved_regs_size
4039 + get_frame_size (),
4040 STACK_BOUNDARY / BITS_PER_UNIT);
4041
4042 /* Both these values are already aligned. */
4043 gcc_assert (multiple_p (crtl->outgoing_args_size,
4044 STACK_BOUNDARY / BITS_PER_UNIT));
4045 cfun->machine->frame.frame_size
4046 = (cfun->machine->frame.hard_fp_offset
4047 + crtl->outgoing_args_size);
4048
4049 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4050
4051 cfun->machine->frame.initial_adjust = 0;
4052 cfun->machine->frame.final_adjust = 0;
4053 cfun->machine->frame.callee_adjust = 0;
4054 cfun->machine->frame.callee_offset = 0;
4055
4056 HOST_WIDE_INT max_push_offset = 0;
4057 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4058 max_push_offset = 512;
4059 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4060 max_push_offset = 256;
4061
4062 HOST_WIDE_INT const_size, const_fp_offset;
4063 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4064 && const_size < max_push_offset
4065 && known_eq (crtl->outgoing_args_size, 0))
4066 {
4067 /* Simple, small frame with no outgoing arguments:
4068 stp reg1, reg2, [sp, -frame_size]!
4069 stp reg3, reg4, [sp, 16] */
4070 cfun->machine->frame.callee_adjust = const_size;
4071 }
4072 else if (known_lt (crtl->outgoing_args_size
4073 + cfun->machine->frame.saved_regs_size, 512)
4074 && !(cfun->calls_alloca
4075 && known_lt (cfun->machine->frame.hard_fp_offset,
4076 max_push_offset)))
4077 {
4078 /* Frame with small outgoing arguments:
4079 sub sp, sp, frame_size
4080 stp reg1, reg2, [sp, outgoing_args_size]
4081 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4082 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4083 cfun->machine->frame.callee_offset
4084 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4085 }
4086 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4087 && const_fp_offset < max_push_offset)
4088 {
4089 /* Frame with large outgoing arguments but a small local area:
4090 stp reg1, reg2, [sp, -hard_fp_offset]!
4091 stp reg3, reg4, [sp, 16]
4092 sub sp, sp, outgoing_args_size */
4093 cfun->machine->frame.callee_adjust = const_fp_offset;
4094 cfun->machine->frame.final_adjust
4095 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4096 }
4097 else
4098 {
4099 /* Frame with large local area and outgoing arguments using frame pointer:
4100 sub sp, sp, hard_fp_offset
4101 stp x29, x30, [sp, 0]
4102 add x29, sp, 0
4103 stp reg3, reg4, [sp, 16]
4104 sub sp, sp, outgoing_args_size */
4105 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4106 cfun->machine->frame.final_adjust
4107 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4108 }
4109
4110 cfun->machine->frame.laid_out = true;
4111 }
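/* Illustrative example of the first case above: a function whose only saved
registers are x29 and x30, with 16 bytes of local variables and no outgoing
arguments, ends up with frame_size == 32 and callee_adjust == 32, so the
whole allocation is done by a single "stp x29, x30, [sp, -32]!" with the
frame record at the bottom of the allocation and the locals above it. */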
4112
4113 /* Return true if the register REGNO is saved on entry to
4114 the current function. */
4115
4116 static bool
4117 aarch64_register_saved_on_entry (int regno)
4118 {
4119 return cfun->machine->frame.reg_offset[regno] >= 0;
4120 }
4121
4122 /* Return the next register, from REGNO up to LIMIT, that the callee
4123 needs to save. */
4124
4125 static unsigned
4126 aarch64_next_callee_save (unsigned regno, unsigned limit)
4127 {
4128 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4129 regno ++;
4130 return regno;
4131 }
4132
4133 /* Push the register number REGNO of mode MODE to the stack with write-back
4134 adjusting the stack by ADJUSTMENT. */
4135
4136 static void
4137 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4138 HOST_WIDE_INT adjustment)
4139 {
4140 rtx base_rtx = stack_pointer_rtx;
4141 rtx insn, reg, mem;
4142
4143 reg = gen_rtx_REG (mode, regno);
4144 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4145 plus_constant (Pmode, base_rtx, -adjustment));
4146 mem = gen_frame_mem (mode, mem);
4147
4148 insn = emit_move_insn (mem, reg);
4149 RTX_FRAME_RELATED_P (insn) = 1;
4150 }
4151
4152 /* Generate and return an instruction to store the pair of registers
4153 REG and REG2 of mode MODE to location BASE with write-back adjusting
4154 the stack location BASE by ADJUSTMENT. */
4155
4156 static rtx
4157 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4158 HOST_WIDE_INT adjustment)
4159 {
4160 switch (mode)
4161 {
4162 case E_DImode:
4163 return gen_storewb_pairdi_di (base, base, reg, reg2,
4164 GEN_INT (-adjustment),
4165 GEN_INT (UNITS_PER_WORD - adjustment));
4166 case E_DFmode:
4167 return gen_storewb_pairdf_di (base, base, reg, reg2,
4168 GEN_INT (-adjustment),
4169 GEN_INT (UNITS_PER_WORD - adjustment));
4170 default:
4171 gcc_unreachable ();
4172 }
4173 }
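/* Illustrative example: with MODE == E_DImode, BASE == the stack pointer,
REG == x19, REG2 == x20 and ADJUSTMENT == 32, the insn generated above
assembles to "stp x19, x20, [sp, -32]!", i.e. REG is stored at the updated
SP and REG2 at the updated SP plus UNITS_PER_WORD. */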
4174
4175 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4176 stack pointer by ADJUSTMENT. */
4177
4178 static void
4179 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4180 {
4181 rtx_insn *insn;
4182 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4183
4184 if (regno2 == INVALID_REGNUM)
4185 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4186
4187 rtx reg1 = gen_rtx_REG (mode, regno1);
4188 rtx reg2 = gen_rtx_REG (mode, regno2);
4189
4190 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4191 reg2, adjustment));
4192 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4193 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4194 RTX_FRAME_RELATED_P (insn) = 1;
4195 }
4196
4197 /* Load the pair of registers REG, REG2 of mode MODE from stack location
4198 BASE, adjusting it by ADJUSTMENT afterwards. */
4199
4200 static rtx
4201 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4202 HOST_WIDE_INT adjustment)
4203 {
4204 switch (mode)
4205 {
4206 case E_DImode:
4207 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4208 GEN_INT (UNITS_PER_WORD));
4209 case E_DFmode:
4210 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4211 GEN_INT (UNITS_PER_WORD));
4212 default:
4213 gcc_unreachable ();
4214 }
4215 }
4216
4217 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4218 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4219 into CFI_OPS. */
4220
4221 static void
4222 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4223 rtx *cfi_ops)
4224 {
4225 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4226 rtx reg1 = gen_rtx_REG (mode, regno1);
4227
4228 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4229
4230 if (regno2 == INVALID_REGNUM)
4231 {
4232 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4233 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4234 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4235 }
4236 else
4237 {
4238 rtx reg2 = gen_rtx_REG (mode, regno2);
4239 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4240 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4241 reg2, adjustment));
4242 }
4243 }
4244
4245 /* Generate and return a store pair instruction of mode MODE to store
4246 register REG1 to MEM1 and register REG2 to MEM2. */
4247
4248 static rtx
4249 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4250 rtx reg2)
4251 {
4252 switch (mode)
4253 {
4254 case E_DImode:
4255 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4256
4257 case E_DFmode:
4258 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4259
4260 default:
4261 gcc_unreachable ();
4262 }
4263 }
4264
4265 /* Generate and return a load pair instruction of mode MODE to load register
4266 REG1 from MEM1 and register REG2 from MEM2. */
4267
4268 static rtx
4269 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4270 rtx mem2)
4271 {
4272 switch (mode)
4273 {
4274 case E_DImode:
4275 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4276
4277 case E_DFmode:
4278 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4279
4280 default:
4281 gcc_unreachable ();
4282 }
4283 }
4284
4285 /* Return TRUE if return address signing should be enabled for the current
4286 function, otherwise return FALSE. */
4287
4288 bool
4289 aarch64_return_address_signing_enabled (void)
4290 {
4291 /* This function should only be called after the frame has been laid out. */
4292 gcc_assert (cfun->machine->frame.laid_out);
4293
4294 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4295 if its LR is pushed onto the stack. */
4296 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4297 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4298 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4299 }
4300
4301 /* Emit code to save the callee-saved registers from register number START
4302 to LIMIT to the stack at the location starting at offset START_OFFSET,
4303 skipping any write-back candidates if SKIP_WB is true. */
4304
4305 static void
4306 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4307 unsigned start, unsigned limit, bool skip_wb)
4308 {
4309 rtx_insn *insn;
4310 unsigned regno;
4311 unsigned regno2;
4312
4313 for (regno = aarch64_next_callee_save (start, limit);
4314 regno <= limit;
4315 regno = aarch64_next_callee_save (regno + 1, limit))
4316 {
4317 rtx reg, mem;
4318 poly_int64 offset;
4319
4320 if (skip_wb
4321 && (regno == cfun->machine->frame.wb_candidate1
4322 || regno == cfun->machine->frame.wb_candidate2))
4323 continue;
4324
4325 if (cfun->machine->reg_is_wrapped_separately[regno])
4326 continue;
4327
4328 reg = gen_rtx_REG (mode, regno);
4329 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4330 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4331 offset));
4332
4333 regno2 = aarch64_next_callee_save (regno + 1, limit);
4334
4335 if (regno2 <= limit
4336 && !cfun->machine->reg_is_wrapped_separately[regno2]
4337 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4338 == cfun->machine->frame.reg_offset[regno2]))
4339
4340 {
4341 rtx reg2 = gen_rtx_REG (mode, regno2);
4342 rtx mem2;
4343
4344 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4345 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4346 offset));
4347 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4348 reg2));
4349
4350 /* The first part of a frame-related parallel insn is
4351 always assumed to be relevant to the frame
4352 calculations; subsequent parts are only
4353 frame-related if explicitly marked. */
4354 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4355 regno = regno2;
4356 }
4357 else
4358 insn = emit_move_insn (mem, reg);
4359
4360 RTX_FRAME_RELATED_P (insn) = 1;
4361 }
4362 }
4363
4364 /* Emit code to restore the callee registers of mode MODE from register
4365 number START up to and including LIMIT. Restore from the stack offset
4366 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4367 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4368
4369 static void
4370 aarch64_restore_callee_saves (machine_mode mode,
4371 poly_int64 start_offset, unsigned start,
4372 unsigned limit, bool skip_wb, rtx *cfi_ops)
4373 {
4374 rtx base_rtx = stack_pointer_rtx;
4375 unsigned regno;
4376 unsigned regno2;
4377 poly_int64 offset;
4378
4379 for (regno = aarch64_next_callee_save (start, limit);
4380 regno <= limit;
4381 regno = aarch64_next_callee_save (regno + 1, limit))
4382 {
4383 if (cfun->machine->reg_is_wrapped_separately[regno])
4384 continue;
4385
4386 rtx reg, mem;
4387
4388 if (skip_wb
4389 && (regno == cfun->machine->frame.wb_candidate1
4390 || regno == cfun->machine->frame.wb_candidate2))
4391 continue;
4392
4393 reg = gen_rtx_REG (mode, regno);
4394 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4395 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4396
4397 regno2 = aarch64_next_callee_save (regno + 1, limit);
4398
4399 if (regno2 <= limit
4400 && !cfun->machine->reg_is_wrapped_separately[regno2]
4401 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4402 == cfun->machine->frame.reg_offset[regno2]))
4403 {
4404 rtx reg2 = gen_rtx_REG (mode, regno2);
4405 rtx mem2;
4406
4407 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4408 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4409 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4410
4411 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4412 regno = regno2;
4413 }
4414 else
4415 emit_move_insn (reg, mem);
4416 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4417 }
4418 }
4419
4420 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4421 of MODE. */
4422
4423 static inline bool
4424 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4425 {
4426 HOST_WIDE_INT multiple;
4427 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4428 && IN_RANGE (multiple, -8, 7));
4429 }
4430
4431 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
4432 of MODE. */
4433
4434 static inline bool
4435 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4436 {
4437 HOST_WIDE_INT multiple;
4438 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4439 && IN_RANGE (multiple, 0, 63));
4440 }
4441
4442 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4443 of MODE. */
4444
4445 bool
4446 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4447 {
4448 HOST_WIDE_INT multiple;
4449 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4450 && IN_RANGE (multiple, -64, 63));
4451 }
4452
4453 /* Return true if OFFSET is a signed 9-bit value. */
4454
4455 bool
4456 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4457 poly_int64 offset)
4458 {
4459 HOST_WIDE_INT const_offset;
4460 return (offset.is_constant (&const_offset)
4461 && IN_RANGE (const_offset, -256, 255));
4462 }
4463
4464 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4465 of MODE. */
4466
4467 static inline bool
4468 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4469 {
4470 HOST_WIDE_INT multiple;
4471 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4472 && IN_RANGE (multiple, -256, 255));
4473 }
4474
4475 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4476 of MODE. */
4477
4478 static inline bool
4479 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4480 {
4481 HOST_WIDE_INT multiple;
4482 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4483 && IN_RANGE (multiple, 0, 4095));
4484 }
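/* Taken together, and using DImode (8-byte) accesses as an illustrative
example: the signed 4-bit scaled range is [-64, 56], the unsigned 6-bit
scaled range is [0, 504], the signed 7-bit scaled range is [-512, 504],
the signed 9-bit unscaled range is [-256, 255] bytes, the signed 9-bit
scaled range is [-2048, 2040], and the unsigned 12-bit scaled range is
[0, 32760]. */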
4485
4486 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4487
4488 static sbitmap
4489 aarch64_get_separate_components (void)
4490 {
4491 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4492 bitmap_clear (components);
4493
4494 /* The registers that need to be saved to the frame. */
4495 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4496 if (aarch64_register_saved_on_entry (regno))
4497 {
4498 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4499 if (!frame_pointer_needed)
4500 offset += cfun->machine->frame.frame_size
4501 - cfun->machine->frame.hard_fp_offset;
4502 /* Check that we can access the stack slot of the register with one
4503 direct load with no adjustments needed. */
4504 if (offset_12bit_unsigned_scaled_p (DImode, offset))
4505 bitmap_set_bit (components, regno);
4506 }
4507
4508 /* Don't mess with the hard frame pointer. */
4509 if (frame_pointer_needed)
4510 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4511
4512 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4513 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4514 /* If registers have been chosen to be stored/restored with
4515 writeback don't interfere with them to avoid having to output explicit
4516 stack adjustment instructions. */
4517 if (reg2 != INVALID_REGNUM)
4518 bitmap_clear_bit (components, reg2);
4519 if (reg1 != INVALID_REGNUM)
4520 bitmap_clear_bit (components, reg1);
4521
4522 bitmap_clear_bit (components, LR_REGNUM);
4523 bitmap_clear_bit (components, SP_REGNUM);
4524
4525 return components;
4526 }
4527
4528 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4529
4530 static sbitmap
4531 aarch64_components_for_bb (basic_block bb)
4532 {
4533 bitmap in = DF_LIVE_IN (bb);
4534 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4535 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4536
4537 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4538 bitmap_clear (components);
4539
4540 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
4541 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4542 if ((!call_used_regs[regno])
4543 && (bitmap_bit_p (in, regno)
4544 || bitmap_bit_p (gen, regno)
4545 || bitmap_bit_p (kill, regno)))
4546 {
4547 unsigned regno2, offset, offset2;
4548 bitmap_set_bit (components, regno);
4549
4550 /* If there is a callee-save register at an adjacent offset, add it
4551 as well, to increase the use of LDP/STP. */
4552 offset = cfun->machine->frame.reg_offset[regno];
4553 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
4554
4555 if (regno2 <= LAST_SAVED_REGNUM)
4556 {
4557 offset2 = cfun->machine->frame.reg_offset[regno2];
4558 if ((offset & ~8) == (offset2 & ~8))
4559 bitmap_set_bit (components, regno2);
4560 }
4561 }
4562
4563 return components;
4564 }
4565
4566 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4567 Nothing to do for aarch64. */
4568
4569 static void
4570 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
4571 {
4572 }
4573
4574 /* Return the next set bit in BMP from START onwards. Return the total number
4575 of bits in BMP if no set bit is found at or after START. */
4576
4577 static unsigned int
4578 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
4579 {
4580 unsigned int nbits = SBITMAP_SIZE (bmp);
4581 if (start == nbits)
4582 return start;
4583
4584 gcc_assert (start < nbits);
4585 for (unsigned int i = start; i < nbits; i++)
4586 if (bitmap_bit_p (bmp, i))
4587 return i;
4588
4589 return nbits;
4590 }
4591
4592 /* Do the work for aarch64_emit_prologue_components and
4593 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
4594 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4595 for these components or the epilogue sequence. That is, it determines
4596 whether we should emit stores or loads and what kind of CFA notes to attach
4597 to the insns. Otherwise the logic for the two sequences is very
4598 similar. */
4599
4600 static void
4601 aarch64_process_components (sbitmap components, bool prologue_p)
4602 {
4603 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
4604 ? HARD_FRAME_POINTER_REGNUM
4605 : STACK_POINTER_REGNUM);
4606
4607 unsigned last_regno = SBITMAP_SIZE (components);
4608 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
4609 rtx_insn *insn = NULL;
4610
4611 while (regno != last_regno)
4612 {
4613 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved,
4614 so DFmode is enough for the vector registers. */
4615 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
4616 rtx reg = gen_rtx_REG (mode, regno);
4617 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4618 if (!frame_pointer_needed)
4619 offset += cfun->machine->frame.frame_size
4620 - cfun->machine->frame.hard_fp_offset;
4621 rtx addr = plus_constant (Pmode, ptr_reg, offset);
4622 rtx mem = gen_frame_mem (mode, addr);
4623
4624 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
4625 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
4626 /* No more registers to handle after REGNO.
4627 Emit a single save/restore and exit. */
4628 if (regno2 == last_regno)
4629 {
4630 insn = emit_insn (set);
4631 RTX_FRAME_RELATED_P (insn) = 1;
4632 if (prologue_p)
4633 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4634 else
4635 add_reg_note (insn, REG_CFA_RESTORE, reg);
4636 break;
4637 }
4638
4639 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
4640 /* The next register is not of the same class or its offset is not
4641 mergeable with the current one into a pair. */
4642 if (!satisfies_constraint_Ump (mem)
4643 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
4644 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
4645 GET_MODE_SIZE (mode)))
4646 {
4647 insn = emit_insn (set);
4648 RTX_FRAME_RELATED_P (insn) = 1;
4649 if (prologue_p)
4650 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4651 else
4652 add_reg_note (insn, REG_CFA_RESTORE, reg);
4653
4654 regno = regno2;
4655 continue;
4656 }
4657
4658 /* REGNO2 can be saved/restored in a pair with REGNO. */
4659 rtx reg2 = gen_rtx_REG (mode, regno2);
4660 if (!frame_pointer_needed)
4661 offset2 += cfun->machine->frame.frame_size
4662 - cfun->machine->frame.hard_fp_offset;
4663 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
4664 rtx mem2 = gen_frame_mem (mode, addr2);
4665 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
4666 : gen_rtx_SET (reg2, mem2);
4667
4668 if (prologue_p)
4669 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
4670 else
4671 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4672
4673 RTX_FRAME_RELATED_P (insn) = 1;
4674 if (prologue_p)
4675 {
4676 add_reg_note (insn, REG_CFA_OFFSET, set);
4677 add_reg_note (insn, REG_CFA_OFFSET, set2);
4678 }
4679 else
4680 {
4681 add_reg_note (insn, REG_CFA_RESTORE, reg);
4682 add_reg_note (insn, REG_CFA_RESTORE, reg2);
4683 }
4684
4685 regno = aarch64_get_next_set_bit (components, regno2 + 1);
4686 }
4687 }
4688
4689 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
4690
4691 static void
4692 aarch64_emit_prologue_components (sbitmap components)
4693 {
4694 aarch64_process_components (components, true);
4695 }
4696
4697 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
4698
4699 static void
4700 aarch64_emit_epilogue_components (sbitmap components)
4701 {
4702 aarch64_process_components (components, false);
4703 }
4704
4705 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
4706
4707 static void
4708 aarch64_set_handled_components (sbitmap components)
4709 {
4710 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4711 if (bitmap_bit_p (components, regno))
4712 cfun->machine->reg_is_wrapped_separately[regno] = true;
4713 }
4714
4715 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4716 is saved at BASE + OFFSET. */
4717
4718 static void
4719 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
4720 rtx base, poly_int64 offset)
4721 {
4722 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
4723 add_reg_note (insn, REG_CFA_EXPRESSION,
4724 gen_rtx_SET (mem, regno_reg_rtx[reg]));
4725 }
4726
4727 /* AArch64 stack frames generated by this compiler look like:
4728
4729 +-------------------------------+
4730 | |
4731 | incoming stack arguments |
4732 | |
4733 +-------------------------------+
4734 | | <-- incoming stack pointer (aligned)
4735 | callee-allocated save area |
4736 | for register varargs |
4737 | |
4738 +-------------------------------+
4739 | local variables | <-- frame_pointer_rtx
4740 | |
4741 +-------------------------------+
4742 | padding0 | \
4743 +-------------------------------+ |
4744 | callee-saved registers | | frame.saved_regs_size
4745 +-------------------------------+ |
4746 | LR' | |
4747 +-------------------------------+ |
4748 | FP' | / <- hard_frame_pointer_rtx (aligned)
4749 +-------------------------------+
4750 | dynamic allocation |
4751 +-------------------------------+
4752 | padding |
4753 +-------------------------------+
4754 | outgoing stack arguments | <-- arg_pointer
4755 | |
4756 +-------------------------------+
4757 | | <-- stack_pointer_rtx (aligned)
4758
4759 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4760 but leave frame_pointer_rtx and hard_frame_pointer_rtx
4761 unchanged. */
4762
4763 /* Generate the prologue instructions for entry into a function.
4764 Establish the stack frame by decreasing the stack pointer with a
4765 properly calculated size and, if necessary, create a frame record
4766 filled with the values of LR and previous frame pointer. The
4767 current FP is also set up if it is in use. */
4768
4769 void
4770 aarch64_expand_prologue (void)
4771 {
4772 poly_int64 frame_size = cfun->machine->frame.frame_size;
4773 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4774 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4775 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4776 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4777 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4778 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4779 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
4780 rtx_insn *insn;
4781
4782 /* Sign return address for functions. */
4783 if (aarch64_return_address_signing_enabled ())
4784 {
4785 insn = emit_insn (gen_pacisp ());
4786 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4787 RTX_FRAME_RELATED_P (insn) = 1;
4788 }
4789
4790 if (flag_stack_usage_info)
4791 current_function_static_stack_size = constant_lower_bound (frame_size);
4792
4793 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4794 {
4795 if (crtl->is_leaf && !cfun->calls_alloca)
4796 {
4797 if (maybe_gt (frame_size, PROBE_INTERVAL)
4798 && maybe_gt (frame_size, get_stack_check_protect ()))
4799 aarch64_emit_probe_stack_range (get_stack_check_protect (),
4800 (frame_size
4801 - get_stack_check_protect ()));
4802 }
4803 else if (maybe_gt (frame_size, 0))
4804 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
4805 }
4806
4807 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4808 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4809
4810 aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);
4811
4812 if (callee_adjust != 0)
4813 aarch64_push_regs (reg1, reg2, callee_adjust);
4814
4815 if (emit_frame_chain)
4816 {
4817 poly_int64 reg_offset = callee_adjust;
4818 if (callee_adjust == 0)
4819 {
4820 reg1 = R29_REGNUM;
4821 reg2 = R30_REGNUM;
4822 reg_offset = callee_offset;
4823 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
4824 }
4825 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
4826 stack_pointer_rtx, callee_offset,
4827 ip1_rtx, ip0_rtx, frame_pointer_needed);
4828 if (frame_pointer_needed && !frame_size.is_constant ())
4829 {
4830 /* Variable-sized frames need to describe the save slot
4831 address using DW_CFA_expression rather than DW_CFA_offset.
4832 This means that, without taking further action, the
4833 locations of the registers that we've already saved would
4834 remain based on the stack pointer even after we redefine
4835 the CFA based on the frame pointer. We therefore need new
4836 DW_CFA_expressions to re-express the save slots with addresses
4837 based on the frame pointer. */
4838 rtx_insn *insn = get_last_insn ();
4839 gcc_assert (RTX_FRAME_RELATED_P (insn));
4840
4841 /* Add an explicit CFA definition if this was previously
4842 implicit. */
4843 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
4844 {
4845 rtx src = plus_constant (Pmode, stack_pointer_rtx,
4846 callee_offset);
4847 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4848 gen_rtx_SET (hard_frame_pointer_rtx, src));
4849 }
4850
4851 /* Change the save slot expressions for the registers that
4852 we've already saved. */
4853 reg_offset -= callee_offset;
4854 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
4855 reg_offset + UNITS_PER_WORD);
4856 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
4857 reg_offset);
4858 }
4859 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
4860 }
4861
4862 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4863 callee_adjust != 0 || emit_frame_chain);
4864 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4865 callee_adjust != 0 || emit_frame_chain);
4866 aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
4867 }
4868
4869 /* Return TRUE if we can use a simple_return insn.
4870
4871 This function checks whether the callee-saved stack is empty, which
4872 means no restore actions are needed. The pro_and_epilogue pass uses
4873 this to check whether the shrink-wrapping optimization is feasible. */
4874
4875 bool
4876 aarch64_use_return_insn_p (void)
4877 {
4878 if (!reload_completed)
4879 return false;
4880
4881 if (crtl->profile)
4882 return false;
4883
4884 return known_eq (cfun->machine->frame.frame_size, 0);
4885 }
4886
4887 /* Generate the epilogue instructions for returning from a function.
4888 This is almost exactly the reverse of the prologue sequence, except
4889 that we need to insert barriers to avoid scheduling loads that read
4890 from a deallocated stack, and we optimize the unwind records by
4891 emitting them all together if possible. */
4892 void
4893 aarch64_expand_epilogue (bool for_sibcall)
4894 {
4895 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4896 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4897 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4898 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4899 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4900 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4901 rtx cfi_ops = NULL;
4902 rtx_insn *insn;
4903 /* A stack clash protection prologue may not have left IP0_REGNUM or
4904 IP1_REGNUM in a usable state. The same is true for allocations
4905 with an SVE component, since we then need both temporary registers
4906 for each allocation. */
4907 bool can_inherit_p = (initial_adjust.is_constant ()
4908 && final_adjust.is_constant ()
4909 && !flag_stack_clash_protection);
4910
4911 /* We need to add a memory barrier to prevent reads from deallocated stack. */
4912 bool need_barrier_p
4913 = maybe_ne (get_frame_size ()
4914 + cfun->machine->frame.saved_varargs_size, 0);
4915
4916 /* Emit a barrier to prevent loads from a deallocated stack. */
4917 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
4918 || cfun->calls_alloca
4919 || crtl->calls_eh_return)
4920 {
4921 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4922 need_barrier_p = false;
4923 }
4924
4925 /* Restore the stack pointer from the frame pointer if it may not
4926 be the same as the stack pointer. */
4927 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4928 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4929 if (frame_pointer_needed
4930 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
4931 /* If writeback is used when restoring callee-saves, the CFA
4932 is restored on the instruction doing the writeback. */
4933 aarch64_add_offset (Pmode, stack_pointer_rtx,
4934 hard_frame_pointer_rtx, -callee_offset,
4935 ip1_rtx, ip0_rtx, callee_adjust == 0);
4936 else
4937 aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
4938 !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));
4939
4940 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4941 callee_adjust != 0, &cfi_ops);
4942 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4943 callee_adjust != 0, &cfi_ops);
4944
4945 if (need_barrier_p)
4946 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4947
4948 if (callee_adjust != 0)
4949 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
4950
4951 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
4952 {
4953 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
4954 insn = get_last_insn ();
4955 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
4956 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
4957 RTX_FRAME_RELATED_P (insn) = 1;
4958 cfi_ops = NULL;
4959 }
4960
4961 aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
4962 !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
4963
4964 if (cfi_ops)
4965 {
4966 /* Emit delayed restores and reset the CFA to be SP. */
4967 insn = get_last_insn ();
4968 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
4969 REG_NOTES (insn) = cfi_ops;
4970 RTX_FRAME_RELATED_P (insn) = 1;
4971 }
4972
4973 /* We prefer to emit the combined return/authenticate instruction RETAA;
4974 however, there are three cases in which we must instead emit an explicit
4975 authentication instruction.
4976
4977 1) Sibcalls don't return in a normal way, so if we're about to call one
4978 we must authenticate.
4979
4980 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
4981 generating code for !TARGET_ARMV8_3 we can't use it and must
4982 explicitly authenticate.
4983
4984 3) On an eh_return path we make extra stack adjustments to update the
4985 canonical frame address to be the exception handler's CFA. We want
4986 to authenticate using the CFA of the function which calls eh_return.
4987 */
4988 if (aarch64_return_address_signing_enabled ()
4989 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
4990 {
4991 insn = emit_insn (gen_autisp ());
4992 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4993 RTX_FRAME_RELATED_P (insn) = 1;
4994 }
4995
4996 /* Stack adjustment for exception handler. */
4997 if (crtl->calls_eh_return)
4998 {
4999 /* We need to unwind the stack by the offset computed by
5000 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5001 to be SP; letting the CFA move during this adjustment
5002 is just as correct as retaining the CFA from the body
5003 of the function. Therefore, do nothing special. */
5004 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5005 }
5006
5007 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5008 if (!for_sibcall)
5009 emit_jump_insn (ret_rtx);
5010 }
5011
5012 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5013 normally or return to a previous frame after unwinding.
5014
5015 An EH return uses a single shared return sequence. The epilogue is
5016 exactly like a normal epilogue except that it has an extra input
5017 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5018 that must be applied after the frame has been destroyed. An extra label
5019 is inserted before the epilogue which initializes this register to zero,
5020 and this is the entry point for a normal return.
5021
5022 An actual EH return updates the return address, initializes the stack
5023 adjustment and jumps directly into the epilogue (bypassing the zeroing
5024 of the adjustment). Since the return address is typically saved on the
5025 stack when a function makes a call, the saved LR must be updated outside
5026 the epilogue.
5027
5028 This poses problems as the store is generated well before the epilogue,
5029 so the offset of LR is not known yet. Also optimizations will remove the
5030 store as it appears dead, even after the epilogue is generated (as the
5031 base or offset for loading LR is different in many cases).
5032
5033 To avoid these problems this implementation forces the frame pointer
5034 in eh_return functions so that the location of LR is fixed and known early.
5035 It also marks the store volatile, so no optimization is permitted to
5036 remove the store. */
5037 rtx
5038 aarch64_eh_return_handler_rtx (void)
5039 {
5040 rtx tmp = gen_frame_mem (Pmode,
5041 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5042
5043 /* Mark the store volatile, so no optimization is permitted to remove it. */
5044 MEM_VOLATILE_P (tmp) = true;
5045 return tmp;
5046 }
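/* In other words, __builtin_eh_return stores the handler's address into the
LR slot of the frame record, i.e. the word at the hard frame pointer plus
UNITS_PER_WORD ([x29, 8] at run time), which the shared epilogue then reloads
in the normal way so that the final return transfers to the handler. */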
5047
5048 /* Output code to add DELTA to the first argument, and then jump
5049 to FUNCTION. Used for C++ multiple inheritance. */
5050 static void
5051 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5052 HOST_WIDE_INT delta,
5053 HOST_WIDE_INT vcall_offset,
5054 tree function)
5055 {
5056 /* The this pointer is always in x0. Note that this differs from
5057 Arm where the this pointer may be bumped to r1 if r0 is required
5058 to return a pointer to an aggregate. On AArch64 a result value
5059 pointer will be in x8. */
5060 int this_regno = R0_REGNUM;
5061 rtx this_rtx, temp0, temp1, addr, funexp;
5062 rtx_insn *insn;
5063
5064 reload_completed = 1;
5065 emit_note (NOTE_INSN_PROLOGUE_END);
5066
5067 this_rtx = gen_rtx_REG (Pmode, this_regno);
5068 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
5069 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
5070
5071 if (vcall_offset == 0)
5072 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5073 else
5074 {
5075 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5076
5077 addr = this_rtx;
5078 if (delta != 0)
5079 {
5080 if (delta >= -256 && delta < 256)
5081 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5082 plus_constant (Pmode, this_rtx, delta));
5083 else
5084 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5085 temp1, temp0, false);
5086 }
5087
5088 if (Pmode == ptr_mode)
5089 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5090 else
5091 aarch64_emit_move (temp0,
5092 gen_rtx_ZERO_EXTEND (Pmode,
5093 gen_rtx_MEM (ptr_mode, addr)));
5094
5095 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5096 addr = plus_constant (Pmode, temp0, vcall_offset);
5097 else
5098 {
5099 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5100 Pmode);
5101 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5102 }
5103
5104 if (Pmode == ptr_mode)
5105 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
5106 else
5107 aarch64_emit_move (temp1,
5108 gen_rtx_SIGN_EXTEND (Pmode,
5109 gen_rtx_MEM (ptr_mode, addr)));
5110
5111 emit_insn (gen_add2_insn (this_rtx, temp1));
5112 }
5113
5114 /* Generate a tail call to the target function. */
5115 if (!TREE_USED (function))
5116 {
5117 assemble_external (function);
5118 TREE_USED (function) = 1;
5119 }
5120 funexp = XEXP (DECL_RTL (function), 0);
5121 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5122 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5123 SIBLING_CALL_P (insn) = 1;
5124
5125 insn = get_insns ();
5126 shorten_branches (insn);
5127 final_start_function (insn, file, 1);
5128 final (insn, file, 1);
5129 final_end_function ();
5130
5131 /* Stop pretending to be a post-reload pass. */
5132 reload_completed = 0;
5133 }
5134
5135 static bool
5136 aarch64_tls_referenced_p (rtx x)
5137 {
5138 if (!TARGET_HAVE_TLS)
5139 return false;
5140 subrtx_iterator::array_type array;
5141 FOR_EACH_SUBRTX (iter, array, x, ALL)
5142 {
5143 const_rtx x = *iter;
5144 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5145 return true;
5146 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5147 TLS offsets, not real symbol references. */
5148 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5149 iter.skip_subrtxes ();
5150 }
5151 return false;
5152 }
5153
5154
5155 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5156 a left shift of 0 or 12 bits. */
5157 bool
5158 aarch64_uimm12_shift (HOST_WIDE_INT val)
5159 {
5160 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5161 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
5162 );
5163 }
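/* For example, 0xabc (shift of 0) and 0xabc000 (shift of 12) both satisfy
this test, while 0xabc001 does not, because its set bits span both 12-bit
fields. */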
5164
5165
5166 /* Return true if val is an immediate that can be loaded into a
5167 register by a MOVZ instruction. */
5168 static bool
5169 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5170 {
5171 if (GET_MODE_SIZE (mode) > 4)
5172 {
5173 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5174 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5175 return 1;
5176 }
5177 else
5178 {
5179 /* Ignore sign extension. */
5180 val &= (HOST_WIDE_INT) 0xffffffff;
5181 }
5182 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5183 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
5184 }
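/* For example, in DImode 0x0000abcd00000000 is accepted (a single 16-bit
chunk which MOVZ can place with LSL #32), whereas 0x12345 is rejected because
its set bits straddle two 16-bit chunks. The caller, aarch64_move_imm,
additionally tries the complemented value to cover MOVN. */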
5185
5186 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
5187 64-bit (DImode) integer. */
5188
5189 static unsigned HOST_WIDE_INT
5190 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5191 {
5192 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5193 while (size < 64)
5194 {
5195 val &= (HOST_WIDE_INT_1U << size) - 1;
5196 val |= val << size;
5197 size *= 2;
5198 }
5199 return val;
5200 }
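/* For example, replicating the HImode value 0x00f0 yields
0x00f000f000f000f0. */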
5201
5202 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5203
5204 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5205 {
5206 0x0000000100000001ull,
5207 0x0001000100010001ull,
5208 0x0101010101010101ull,
5209 0x1111111111111111ull,
5210 0x5555555555555555ull,
5211 };
5212
5213
5214 /* Return true if val is a valid bitmask immediate. */
5215
5216 bool
5217 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
5218 {
5219 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
5220 int bits;
5221
5222 /* Check for a single sequence of one bits and return quickly if so.
5223 The special cases of all ones and all zeroes return false. */
5224 val = aarch64_replicate_bitmask_imm (val_in, mode);
5225 tmp = val + (val & -val);
5226
5227 if (tmp == (tmp & -tmp))
5228 return (val + 1) > 1;
5229
5230 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
5231 if (mode == SImode)
5232 val = (val << 32) | (val & 0xffffffff);
5233
5234 /* Invert if the immediate doesn't start with a zero bit - this means we
5235 only need to search for sequences of one bits. */
5236 if (val & 1)
5237 val = ~val;
5238
5239 /* Find the first set bit and set tmp to val with the first sequence of one
5240 bits removed. Return success if there is a single sequence of ones. */
5241 first_one = val & -val;
5242 tmp = val & (val + first_one);
5243
5244 if (tmp == 0)
5245 return true;
5246
5247 /* Find the next set bit and compute the difference in bit position. */
5248 next_one = tmp & -tmp;
5249 bits = clz_hwi (first_one) - clz_hwi (next_one);
5250 mask = val ^ tmp;
5251
5252 /* Check the bit position difference is a power of 2, and that the first
5253 sequence of one bits fits within 'bits' bits. */
5254 if ((mask >> bits) != 0 || bits != (bits & -bits))
5255 return false;
5256
5257 /* Check the sequence of one bits is repeated 64/bits times. */
5258 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
5259 }
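/* For illustration: 0x00ff00ff00ff00ff is a valid bitmask immediate (an
   8-bit run of ones repeated every 16 bits), whereas 0x00ff00000000ff00
   is not, because its runs of ones are not spaced at a power-of-2 repeat
   width.  The expression __builtin_clz (bits) - 26 maps the repeat
   widths 32, 16, 8, 4 and 2 to entries 0..4 of bitmask_imm_mul
   (assuming a 32-bit int).  */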
5260
5261 /* Create a mask of ones covering the range from the lowest to the highest
5262 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
5263
5264 unsigned HOST_WIDE_INT
5265 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5266 {
5267 int lowest_bit_set = ctz_hwi (val_in);
5268 int highest_bit_set = floor_log2 (val_in);
5269 gcc_assert (val_in != 0);
5270
5271 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5272 (HOST_WIDE_INT_1U << lowest_bit_set));
5273 }
5274
5275 /* Create a constant in which all bits outside the range from the lowest to
5276 the highest set bit of VAL_IN are set to 1. */
5277
5278 unsigned HOST_WIDE_INT
5279 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5280 {
5281 return val_in | ~aarch64_and_split_imm1 (val_in);
5282 }
5283
5284 /* Return true if VAL_IN is not a single-instruction immediate but can be split into two AND bitmask immediates. */
5285
5286 bool
5287 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5288 {
5289 scalar_int_mode int_mode;
5290 if (!is_a <scalar_int_mode> (mode, &int_mode))
5291 return false;
5292
5293 if (aarch64_bitmask_imm (val_in, int_mode))
5294 return false;
5295
5296 if (aarch64_move_imm (val_in, int_mode))
5297 return false;
5298
5299 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5300
5301 return aarch64_bitmask_imm (imm2, int_mode);
5302 }
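/* For illustration: VAL_IN == 0x00ff00000000ff00 is neither a bitmask
   nor a MOV immediate, but aarch64_and_split_imm1 gives
   0x00ffffffffffff00 (ones from bit 8 to bit 55) and
   aarch64_and_split_imm2 gives 0xffff00000000ffff; both are valid
   bitmask immediates and their intersection is VAL_IN, so the AND can
   be performed as two AND-immediate instructions.  */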
5303
5304 /* Return true if val is an immediate that can be loaded into a
5305 register in a single instruction. */
5306 bool
5307 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
5308 {
5309 scalar_int_mode int_mode;
5310 if (!is_a <scalar_int_mode> (mode, &int_mode))
5311 return false;
5312
5313 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
5314 return true;
5315 return aarch64_bitmask_imm (val, int_mode);
5316 }
5317
5318 static bool
5319 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
5320 {
5321 rtx base, offset;
5322
5323 if (GET_CODE (x) == HIGH)
5324 return true;
5325
5326 /* There's no way to calculate VL-based values using relocations. */
5327 subrtx_iterator::array_type array;
5328 FOR_EACH_SUBRTX (iter, array, x, ALL)
5329 if (GET_CODE (*iter) == CONST_POLY_INT)
5330 return true;
5331
5332 split_const (x, &base, &offset);
5333 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
5334 {
5335 if (aarch64_classify_symbol (base, INTVAL (offset))
5336 != SYMBOL_FORCE_TO_MEM)
5337 return true;
5338 else
5339 /* Avoid generating a 64-bit relocation in ILP32; leave it to
5340 aarch64_expand_mov_immediate to handle it properly. */
5341 return mode != ptr_mode;
5342 }
5343
5344 return aarch64_tls_referenced_p (x);
5345 }
5346
5347 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5348 The expansion of a table switch is quite expensive due to the number
5349 of instructions, the table lookup and the hard-to-predict indirect jump.
5350 When optimizing for speed with -O3 enabled, use the per-core tuning if
5351 set, otherwise use tables for more than 16 cases as a tradeoff between
5352 size and performance. When optimizing for size, use the default setting. */
5353
5354 static unsigned int
5355 aarch64_case_values_threshold (void)
5356 {
5357 /* Use the specified limit for the number of cases before using jump
5358 tables at higher optimization levels. */
5359 if (optimize > 2
5360 && selected_cpu->tune->max_case_values != 0)
5361 return selected_cpu->tune->max_case_values;
5362 else
5363 return optimize_size ? default_case_values_threshold () : 17;
5364 }
5365
5366 /* Return true if register REGNO is a valid index register.
5367 STRICT_P is true if REG_OK_STRICT is in effect. */
5368
5369 bool
5370 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
5371 {
5372 if (!HARD_REGISTER_NUM_P (regno))
5373 {
5374 if (!strict_p)
5375 return true;
5376
5377 if (!reg_renumber)
5378 return false;
5379
5380 regno = reg_renumber[regno];
5381 }
5382 return GP_REGNUM_P (regno);
5383 }
5384
5385 /* Return true if register REGNO is a valid base register.
5386 STRICT_P is true if REG_OK_STRICT is in effect. */
5387
5388 bool
5389 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
5390 {
5391 if (!HARD_REGISTER_NUM_P (regno))
5392 {
5393 if (!strict_p)
5394 return true;
5395
5396 if (!reg_renumber)
5397 return false;
5398
5399 regno = reg_renumber[regno];
5400 }
5401
5402 /* The fake registers will be eliminated to either the stack or
5403 hard frame pointer, both of which are usually valid base registers.
5404 Reload deals with the cases where the eliminated form isn't valid. */
5405 return (GP_REGNUM_P (regno)
5406 || regno == SP_REGNUM
5407 || regno == FRAME_POINTER_REGNUM
5408 || regno == ARG_POINTER_REGNUM);
5409 }
5410
5411 /* Return true if X is a valid base register.
5412 STRICT_P is true if REG_OK_STRICT is in effect. */
5413
5414 static bool
5415 aarch64_base_register_rtx_p (rtx x, bool strict_p)
5416 {
5417 if (!strict_p
5418 && GET_CODE (x) == SUBREG
5419 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
5420 x = SUBREG_REG (x);
5421
5422 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
5423 }
5424
5425 /* Return true if X is a valid address index for mode MODE. If it is, fill in INFO
5426 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
5427
5428 static bool
5429 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
5430 machine_mode mode, bool strict_p)
5431 {
5432 enum aarch64_address_type type;
5433 rtx index;
5434 int shift;
5435
5436 /* (reg:P) */
5437 if ((REG_P (x) || GET_CODE (x) == SUBREG)
5438 && GET_MODE (x) == Pmode)
5439 {
5440 type = ADDRESS_REG_REG;
5441 index = x;
5442 shift = 0;
5443 }
5444 /* (sign_extend:DI (reg:SI)) */
5445 else if ((GET_CODE (x) == SIGN_EXTEND
5446 || GET_CODE (x) == ZERO_EXTEND)
5447 && GET_MODE (x) == DImode
5448 && GET_MODE (XEXP (x, 0)) == SImode)
5449 {
5450 type = (GET_CODE (x) == SIGN_EXTEND)
5451 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5452 index = XEXP (x, 0);
5453 shift = 0;
5454 }
5455 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5456 else if (GET_CODE (x) == MULT
5457 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5458 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5459 && GET_MODE (XEXP (x, 0)) == DImode
5460 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5461 && CONST_INT_P (XEXP (x, 1)))
5462 {
5463 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5464 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5465 index = XEXP (XEXP (x, 0), 0);
5466 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5467 }
5468 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5469 else if (GET_CODE (x) == ASHIFT
5470 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5471 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5472 && GET_MODE (XEXP (x, 0)) == DImode
5473 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5474 && CONST_INT_P (XEXP (x, 1)))
5475 {
5476 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5477 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5478 index = XEXP (XEXP (x, 0), 0);
5479 shift = INTVAL (XEXP (x, 1));
5480 }
5481 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5482 else if ((GET_CODE (x) == SIGN_EXTRACT
5483 || GET_CODE (x) == ZERO_EXTRACT)
5484 && GET_MODE (x) == DImode
5485 && GET_CODE (XEXP (x, 0)) == MULT
5486 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5487 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5488 {
5489 type = (GET_CODE (x) == SIGN_EXTRACT)
5490 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5491 index = XEXP (XEXP (x, 0), 0);
5492 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5493 if (INTVAL (XEXP (x, 1)) != 32 + shift
5494 || INTVAL (XEXP (x, 2)) != 0)
5495 shift = -1;
5496 }
5497 /* (and:DI (mult:DI (reg:DI) (const_int scale))
5498 (const_int 0xffffffff<<shift)) */
5499 else if (GET_CODE (x) == AND
5500 && GET_MODE (x) == DImode
5501 && GET_CODE (XEXP (x, 0)) == MULT
5502 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5503 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5504 && CONST_INT_P (XEXP (x, 1)))
5505 {
5506 type = ADDRESS_REG_UXTW;
5507 index = XEXP (XEXP (x, 0), 0);
5508 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5509 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5510 shift = -1;
5511 }
5512 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5513 else if ((GET_CODE (x) == SIGN_EXTRACT
5514 || GET_CODE (x) == ZERO_EXTRACT)
5515 && GET_MODE (x) == DImode
5516 && GET_CODE (XEXP (x, 0)) == ASHIFT
5517 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5518 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5519 {
5520 type = (GET_CODE (x) == SIGN_EXTRACT)
5521 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5522 index = XEXP (XEXP (x, 0), 0);
5523 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5524 if (INTVAL (XEXP (x, 1)) != 32 + shift
5525 || INTVAL (XEXP (x, 2)) != 0)
5526 shift = -1;
5527 }
5528 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5529 (const_int 0xffffffff<<shift)) */
5530 else if (GET_CODE (x) == AND
5531 && GET_MODE (x) == DImode
5532 && GET_CODE (XEXP (x, 0)) == ASHIFT
5533 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5534 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5535 && CONST_INT_P (XEXP (x, 1)))
5536 {
5537 type = ADDRESS_REG_UXTW;
5538 index = XEXP (XEXP (x, 0), 0);
5539 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5540 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5541 shift = -1;
5542 }
5543 /* (mult:P (reg:P) (const_int scale)) */
5544 else if (GET_CODE (x) == MULT
5545 && GET_MODE (x) == Pmode
5546 && GET_MODE (XEXP (x, 0)) == Pmode
5547 && CONST_INT_P (XEXP (x, 1)))
5548 {
5549 type = ADDRESS_REG_REG;
5550 index = XEXP (x, 0);
5551 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5552 }
5553 /* (ashift:P (reg:P) (const_int shift)) */
5554 else if (GET_CODE (x) == ASHIFT
5555 && GET_MODE (x) == Pmode
5556 && GET_MODE (XEXP (x, 0)) == Pmode
5557 && CONST_INT_P (XEXP (x, 1)))
5558 {
5559 type = ADDRESS_REG_REG;
5560 index = XEXP (x, 0);
5561 shift = INTVAL (XEXP (x, 1));
5562 }
5563 else
5564 return false;
5565
5566 if (!strict_p
5567 && GET_CODE (index) == SUBREG
5568 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
5569 index = SUBREG_REG (index);
5570
5571 if (aarch64_sve_data_mode_p (mode))
5572 {
5573 if (type != ADDRESS_REG_REG
5574 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
5575 return false;
5576 }
5577 else
5578 {
5579 if (shift != 0
5580 && !(IN_RANGE (shift, 1, 3)
5581 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
5582 return false;
5583 }
5584
5585 if (REG_P (index)
5586 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
5587 {
5588 info->type = type;
5589 info->offset = index;
5590 info->shift = shift;
5591 return true;
5592 }
5593
5594 return false;
5595 }
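/* For illustration: for a DImode access, the index
   (ashift:DI (reg:DI Xm) (const_int 3)) is classified as ADDRESS_REG_REG
   with shift 3 (the [Xn, Xm, lsl #3] form), and
   (mult:DI (sign_extend:DI (reg:SI Wm)) (const_int 8)) is classified as
   ADDRESS_REG_SXTW with shift 3 (the [Xn, Wm, sxtw #3] form).  */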
5596
5597 /* Return true if MODE is one of the modes for which we
5598 support LDP/STP operations. */
5599
5600 static bool
5601 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
5602 {
5603 return mode == SImode || mode == DImode
5604 || mode == SFmode || mode == DFmode
5605 || (aarch64_vector_mode_supported_p (mode)
5606 && (known_eq (GET_MODE_SIZE (mode), 8)
5607 || (known_eq (GET_MODE_SIZE (mode), 16)
5608 && (aarch64_tune_params.extra_tuning_flags
5609 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
5610 }
5611
5612 /* Return true if REGNO is a virtual pointer register, or an eliminable
5613 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
5614 include stack_pointer or hard_frame_pointer. */
5615 static bool
5616 virt_or_elim_regno_p (unsigned regno)
5617 {
5618 return ((regno >= FIRST_VIRTUAL_REGISTER
5619 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
5620 || regno == FRAME_POINTER_REGNUM
5621 || regno == ARG_POINTER_REGNUM);
5622 }
5623
5624 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5625 If it is, fill in INFO appropriately. STRICT_P is true if
5626 REG_OK_STRICT is in effect. */
5627
5628 bool
5629 aarch64_classify_address (struct aarch64_address_info *info,
5630 rtx x, machine_mode mode, bool strict_p,
5631 aarch64_addr_query_type type)
5632 {
5633 enum rtx_code code = GET_CODE (x);
5634 rtx op0, op1;
5635 poly_int64 offset;
5636
5637 HOST_WIDE_INT const_size;
5638
5639 /* On BE, we use load/store pair for all large int mode load/stores.
5640 TI/TFmode may also use a load/store pair. */
5641 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5642 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
5643 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
5644 || type == ADDR_QUERY_LDP_STP_N
5645 || mode == TImode
5646 || mode == TFmode
5647 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
5648
5649 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode corresponds
5650 to the actual size of the memory being loaded/stored and the mode used to
5651 check the address is half of that size. */
5652 if (type == ADDR_QUERY_LDP_STP_N
5653 && known_eq (GET_MODE_SIZE (mode), 16))
5654 mode = DFmode;
5655
5656 bool allow_reg_index_p = (!load_store_pair_p
5657 && (known_lt (GET_MODE_SIZE (mode), 16)
5658 || vec_flags == VEC_ADVSIMD
5659 || vec_flags == VEC_SVE_DATA));
5660
5661 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5662 [Rn, #offset, MUL VL]. */
5663 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
5664 && (code != REG && code != PLUS))
5665 return false;
5666
5667 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5668 REG addressing. */
5669 if (advsimd_struct_p
5670 && !BYTES_BIG_ENDIAN
5671 && (code != POST_INC && code != REG))
5672 return false;
5673
5674 gcc_checking_assert (GET_MODE (x) == VOIDmode
5675 || SCALAR_INT_MODE_P (GET_MODE (x)));
5676
5677 switch (code)
5678 {
5679 case REG:
5680 case SUBREG:
5681 info->type = ADDRESS_REG_IMM;
5682 info->base = x;
5683 info->offset = const0_rtx;
5684 info->const_offset = 0;
5685 return aarch64_base_register_rtx_p (x, strict_p);
5686
5687 case PLUS:
5688 op0 = XEXP (x, 0);
5689 op1 = XEXP (x, 1);
5690
5691 if (! strict_p
5692 && REG_P (op0)
5693 && virt_or_elim_regno_p (REGNO (op0))
5694 && poly_int_rtx_p (op1, &offset))
5695 {
5696 info->type = ADDRESS_REG_IMM;
5697 info->base = op0;
5698 info->offset = op1;
5699 info->const_offset = offset;
5700
5701 return true;
5702 }
5703
5704 if (maybe_ne (GET_MODE_SIZE (mode), 0)
5705 && aarch64_base_register_rtx_p (op0, strict_p)
5706 && poly_int_rtx_p (op1, &offset))
5707 {
5708 info->type = ADDRESS_REG_IMM;
5709 info->base = op0;
5710 info->offset = op1;
5711 info->const_offset = offset;
5712
5713 /* TImode and TFmode values are allowed in both pairs of X
5714 registers and individual Q registers. The available
5715 address modes are:
5716 X,X: 7-bit signed scaled offset
5717 Q: 9-bit signed offset
5718 We conservatively require an offset representable in either mode.
5719 When performing the check for pairs of X registers i.e. LDP/STP
5720 pass down DImode since that is the natural size of the LDP/STP
5721 instruction memory accesses. */
5722 if (mode == TImode || mode == TFmode)
5723 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
5724 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
5725 || offset_12bit_unsigned_scaled_p (mode, offset)));
5726
5727 /* A 7-bit offset check because OImode will emit an ldp/stp
5728 instruction (only big endian will get here).
5729 For ldp/stp instructions, the offset is scaled for the size of a
5730 single element of the pair. */
5731 if (mode == OImode)
5732 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
5733
5734 /* Three 9/12-bit offset checks because CImode will emit three
5735 ldr/str instructions (only big endian will get here). */
5736 if (mode == CImode)
5737 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5738 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
5739 offset + 32)
5740 || offset_12bit_unsigned_scaled_p (V16QImode,
5741 offset + 32)));
5742
5743 /* Two 7-bit offset checks because XImode will emit two ldp/stp
5744 instructions (only big endian will get here). */
5745 if (mode == XImode)
5746 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5747 && aarch64_offset_7bit_signed_scaled_p (TImode,
5748 offset + 32));
5749
5750 /* Make "m" use the LD1 offset range for SVE data modes, so
5751 that pre-RTL optimizers like ivopts will optimize for that range
5752 instead of the wider LDR/STR range. */
5753 if (vec_flags == VEC_SVE_DATA)
5754 return (type == ADDR_QUERY_M
5755 ? offset_4bit_signed_scaled_p (mode, offset)
5756 : offset_9bit_signed_scaled_p (mode, offset));
5757
5758 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
5759 {
5760 poly_int64 end_offset = (offset
5761 + GET_MODE_SIZE (mode)
5762 - BYTES_PER_SVE_VECTOR);
5763 return (type == ADDR_QUERY_M
5764 ? offset_4bit_signed_scaled_p (mode, offset)
5765 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
5766 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
5767 end_offset)));
5768 }
5769
5770 if (vec_flags == VEC_SVE_PRED)
5771 return offset_9bit_signed_scaled_p (mode, offset);
5772
5773 if (load_store_pair_p)
5774 return ((known_eq (GET_MODE_SIZE (mode), 4)
5775 || known_eq (GET_MODE_SIZE (mode), 8)
5776 || known_eq (GET_MODE_SIZE (mode), 16))
5777 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5778 else
5779 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
5780 || offset_12bit_unsigned_scaled_p (mode, offset));
5781 }
5782
5783 if (allow_reg_index_p)
5784 {
5785 /* Look for base + (scaled/extended) index register. */
5786 if (aarch64_base_register_rtx_p (op0, strict_p)
5787 && aarch64_classify_index (info, op1, mode, strict_p))
5788 {
5789 info->base = op0;
5790 return true;
5791 }
5792 if (aarch64_base_register_rtx_p (op1, strict_p)
5793 && aarch64_classify_index (info, op0, mode, strict_p))
5794 {
5795 info->base = op1;
5796 return true;
5797 }
5798 }
5799
5800 return false;
5801
5802 case POST_INC:
5803 case POST_DEC:
5804 case PRE_INC:
5805 case PRE_DEC:
5806 info->type = ADDRESS_REG_WB;
5807 info->base = XEXP (x, 0);
5808 info->offset = NULL_RTX;
5809 return aarch64_base_register_rtx_p (info->base, strict_p);
5810
5811 case POST_MODIFY:
5812 case PRE_MODIFY:
5813 info->type = ADDRESS_REG_WB;
5814 info->base = XEXP (x, 0);
5815 if (GET_CODE (XEXP (x, 1)) == PLUS
5816 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
5817 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
5818 && aarch64_base_register_rtx_p (info->base, strict_p))
5819 {
5820 info->offset = XEXP (XEXP (x, 1), 1);
5821 info->const_offset = offset;
5822
5823 /* TImode and TFmode values are allowed in both pairs of X
5824 registers and individual Q registers. The available
5825 address modes are:
5826 X,X: 7-bit signed scaled offset
5827 Q: 9-bit signed offset
5828 We conservatively require an offset representable in either mode.
5829 */
5830 if (mode == TImode || mode == TFmode)
5831 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
5832 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
5833
5834 if (load_store_pair_p)
5835 return ((known_eq (GET_MODE_SIZE (mode), 4)
5836 || known_eq (GET_MODE_SIZE (mode), 8)
5837 || known_eq (GET_MODE_SIZE (mode), 16))
5838 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5839 else
5840 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
5841 }
5842 return false;
5843
5844 case CONST:
5845 case SYMBOL_REF:
5846 case LABEL_REF:
5847 /* Load literal: PC-relative constant pool entry. Only supported
5848 for SImode or larger. */
5849 info->type = ADDRESS_SYMBOLIC;
5850
5851 if (!load_store_pair_p
5852 && GET_MODE_SIZE (mode).is_constant (&const_size)
5853 && const_size >= 4)
5854 {
5855 rtx sym, addend;
5856
5857 split_const (x, &sym, &addend);
5858 return ((GET_CODE (sym) == LABEL_REF
5859 || (GET_CODE (sym) == SYMBOL_REF
5860 && CONSTANT_POOL_ADDRESS_P (sym)
5861 && aarch64_pcrelative_literal_loads)));
5862 }
5863 return false;
5864
5865 case LO_SUM:
5866 info->type = ADDRESS_LO_SUM;
5867 info->base = XEXP (x, 0);
5868 info->offset = XEXP (x, 1);
5869 if (allow_reg_index_p
5870 && aarch64_base_register_rtx_p (info->base, strict_p))
5871 {
5872 rtx sym, offs;
5873 split_const (info->offset, &sym, &offs);
5874 if (GET_CODE (sym) == SYMBOL_REF
5875 && (aarch64_classify_symbol (sym, INTVAL (offs))
5876 == SYMBOL_SMALL_ABSOLUTE))
5877 {
5878 /* The symbol and offset must be aligned to the access size. */
5879 unsigned int align;
5880
5881 if (CONSTANT_POOL_ADDRESS_P (sym))
5882 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
5883 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
5884 {
5885 tree exp = SYMBOL_REF_DECL (sym);
5886 align = TYPE_ALIGN (TREE_TYPE (exp));
5887 align = aarch64_constant_alignment (exp, align);
5888 }
5889 else if (SYMBOL_REF_DECL (sym))
5890 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
5891 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
5892 && SYMBOL_REF_BLOCK (sym) != NULL)
5893 align = SYMBOL_REF_BLOCK (sym)->alignment;
5894 else
5895 align = BITS_PER_UNIT;
5896
5897 poly_int64 ref_size = GET_MODE_SIZE (mode);
5898 if (known_eq (ref_size, 0))
5899 ref_size = GET_MODE_SIZE (DImode);
5900
5901 return (multiple_p (INTVAL (offs), ref_size)
5902 && multiple_p (align / BITS_PER_UNIT, ref_size));
5903 }
5904 }
5905 return false;
5906
5907 default:
5908 return false;
5909 }
5910 }
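/* For illustration, some DImode addresses and their classifications:
   [x0]              -> ADDRESS_REG_IMM, const_offset 0
   [x0, #264]        -> ADDRESS_REG_IMM, const_offset 264
   [x0, x1, lsl #3]  -> ADDRESS_REG_REG, shift 3
   [x0, w1, sxtw #3] -> ADDRESS_REG_SXTW, shift 3
   [x0], #8          -> ADDRESS_REG_WB (post-increment)
   and a LO_SUM of a base register and a small-absolute symbol gives
   ADDRESS_LO_SUM when the symbol is suitably aligned.  */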
5911
5912 /* Return true if the address X is valid for a PRFM instruction.
5913 STRICT_P is true if we should do strict checking with
5914 aarch64_classify_address. */
5915
5916 bool
5917 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
5918 {
5919 struct aarch64_address_info addr;
5920
5921 /* PRFM accepts the same addresses as DImode... */
5922 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
5923 if (!res)
5924 return false;
5925
5926 /* ... except writeback forms. */
5927 return addr.type != ADDRESS_REG_WB;
5928 }
5929
5930 bool
5931 aarch64_symbolic_address_p (rtx x)
5932 {
5933 rtx offset;
5934
5935 split_const (x, &x, &offset);
5936 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
5937 }
5938
5939 /* Classify the base of symbolic expression X. */
5940
5941 enum aarch64_symbol_type
5942 aarch64_classify_symbolic_expression (rtx x)
5943 {
5944 rtx offset;
5945
5946 split_const (x, &x, &offset);
5947 return aarch64_classify_symbol (x, INTVAL (offset));
5948 }
5949
5950
5951 /* Return TRUE if X is a legitimate address for accessing memory in
5952 mode MODE. */
5953 static bool
5954 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
5955 {
5956 struct aarch64_address_info addr;
5957
5958 return aarch64_classify_address (&addr, x, mode, strict_p);
5959 }
5960
5961 /* Return TRUE if X is a legitimate address of type TYPE for accessing
5962 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
5963 bool
5964 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
5965 aarch64_addr_query_type type)
5966 {
5967 struct aarch64_address_info addr;
5968
5969 return aarch64_classify_address (&addr, x, mode, strict_p, type);
5970 }
5971
5972 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
5973
5974 static bool
5975 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
5976 poly_int64 orig_offset,
5977 machine_mode mode)
5978 {
5979 HOST_WIDE_INT size;
5980 if (GET_MODE_SIZE (mode).is_constant (&size))
5981 {
5982 HOST_WIDE_INT const_offset, second_offset;
5983
5984 /* A general SVE offset is A * VQ + B. Remove the A component from
5985 coefficient 0 in order to get the constant B. */
5986 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
5987
5988 /* Split an out-of-range address displacement into a base and
5989 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
5990 range otherwise to increase opportunities for sharing the base
5991 address of different sizes. Unaligned accesses use the signed
5992 9-bit range, TImode/TFmode use the intersection of signed
5993 scaled 7-bit and signed 9-bit offset. */
5994 if (mode == TImode || mode == TFmode)
5995 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
5996 else if ((const_offset & (size - 1)) != 0)
5997 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
5998 else
5999 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6000
6001 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6002 return false;
6003
6004 /* Split the offset into second_offset and the rest. */
6005 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6006 *offset2 = gen_int_mode (second_offset, Pmode);
6007 return true;
6008 }
6009 else
6010 {
6011 /* Get the mode we should use as the basis of the range. For structure
6012 modes this is the mode of one vector. */
6013 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6014 machine_mode step_mode
6015 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6016
6017 /* Get the "mul vl" multiplier we'd like to use. */
6018 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6019 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6020 if (vec_flags & VEC_SVE_DATA)
6021 /* LDR supports a 9-bit range, but the move patterns for
6022 structure modes require all vectors to be in range of the
6023 same base. The simplest way of accommodating that while still
6024 promoting reuse of anchor points between different modes is
6025 to use an 8-bit range unconditionally. */
6026 vnum = ((vnum + 128) & 255) - 128;
6027 else
6028 /* Predicates are only handled singly, so we might as well use
6029 the full range. */
6030 vnum = ((vnum + 256) & 511) - 256;
6031 if (vnum == 0)
6032 return false;
6033
6034 /* Convert the "mul vl" multiplier into a byte offset. */
6035 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6036 if (known_eq (second_offset, orig_offset))
6037 return false;
6038
6039 /* Split the offset into second_offset and the rest. */
6040 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6041 *offset2 = gen_int_mode (second_offset, Pmode);
6042 return true;
6043 }
6044 }
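/* For illustration: for a DImode access at constant offset 0x10008 the
   offset is aligned, so second_offset = 0x10008 & 0x3ffc = 0x8; the
   split then returns 0x10000 in *OFFSET1 and 8 in *OFFSET2, allowing
   nearby accesses to share the anchor base + 0x10000.  */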
6045
6046 /* Return the binary representation of floating point constant VALUE in INTVAL.
6047 If the value cannot be converted, return false without setting INTVAL.
6048 The conversion is done using the mode of VALUE. */
6049 bool
6050 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6051 {
6052
6053 /* We make a general exception for 0. */
6054 if (aarch64_float_const_zero_rtx_p (value))
6055 {
6056 *intval = 0;
6057 return true;
6058 }
6059
6060 scalar_float_mode mode;
6061 if (GET_CODE (value) != CONST_DOUBLE
6062 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6063 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6064 /* Only support up to DF mode. */
6065 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6066 return false;
6067
6068 unsigned HOST_WIDE_INT ival = 0;
6069
6070 long res[2];
6071 real_to_target (res,
6072 CONST_DOUBLE_REAL_VALUE (value),
6073 REAL_MODE_FORMAT (mode));
6074
6075 if (mode == DFmode)
6076 {
6077 int order = BYTES_BIG_ENDIAN ? 1 : 0;
6078 ival = zext_hwi (res[order], 32);
6079 ival |= (zext_hwi (res[1 - order], 32) << 32);
6080 }
6081 else
6082 ival = zext_hwi (res[0], 32);
6083
6084 *intval = ival;
6085 return true;
6086 }
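/* For illustration: the DFmode constant 1.0 gives
   *INTVAL == 0x3ff0000000000000 and the SFmode constant 1.0 gives
   *INTVAL == 0x3f800000, i.e. the IEEE 754 bit patterns.  */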
6087
6088 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6089 single MOV(+MOVK) followed by an FMOV. */
6090 bool
6091 aarch64_float_const_rtx_p (rtx x)
6092 {
6093 machine_mode mode = GET_MODE (x);
6094 if (mode == VOIDmode)
6095 return false;
6096
6097 /* Determine whether it's cheaper to write float constants as
6098 mov/movk pairs rather than ldr/adrp pairs. */
6099 unsigned HOST_WIDE_INT ival;
6100
6101 if (GET_CODE (x) == CONST_DOUBLE
6102 && SCALAR_FLOAT_MODE_P (mode)
6103 && aarch64_reinterpret_float_as_int (x, &ival))
6104 {
6105 scalar_int_mode imode = (mode == HFmode
6106 ? SImode
6107 : int_mode_for_mode (mode).require ());
6108 int num_instr = aarch64_internal_mov_immediate
6109 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6110 return num_instr < 3;
6111 }
6112
6113 return false;
6114 }
6115
6116 /* Return TRUE if rtx X is the immediate constant 0.0. */
6117 bool
6118 aarch64_float_const_zero_rtx_p (rtx x)
6119 {
6120 if (GET_MODE (x) == VOIDmode)
6121 return false;
6122
6123 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
6124 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
6125 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6126 }
6127
6128 /* Return TRUE if rtx X is an immediate constant that fits in a single
6129 MOVI immediate operation. */
6130 bool
6131 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6132 {
6133 if (!TARGET_SIMD)
6134 return false;
6135
6136 machine_mode vmode;
6137 scalar_int_mode imode;
6138 unsigned HOST_WIDE_INT ival;
6139
6140 if (GET_CODE (x) == CONST_DOUBLE
6141 && SCALAR_FLOAT_MODE_P (mode))
6142 {
6143 if (!aarch64_reinterpret_float_as_int (x, &ival))
6144 return false;
6145
6146 /* We make a general exception for 0. */
6147 if (aarch64_float_const_zero_rtx_p (x))
6148 return true;
6149
6150 imode = int_mode_for_mode (mode).require ();
6151 }
6152 else if (GET_CODE (x) == CONST_INT
6153 && is_a <scalar_int_mode> (mode, &imode))
6154 ival = INTVAL (x);
6155 else
6156 return false;
6157
6158 /* Use a 64-bit vector mode for everything except DImode/DFmode, where we
6159 use a 128-bit vector mode. */
6160 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6161
6162 vmode = aarch64_simd_container_mode (imode, width);
6163 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
6164
6165 return aarch64_simd_valid_immediate (v_op, NULL);
6166 }
6167
6168
6169 /* Return the fixed registers used for condition codes. */
6170
6171 static bool
6172 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
6173 {
6174 *p1 = CC_REGNUM;
6175 *p2 = INVALID_REGNUM;
6176 return true;
6177 }
6178
6179 /* This function is used by the call expanders of the machine description.
6180 RESULT is the register in which the result is returned. It's NULL for
6181 "call" and "sibcall".
6182 MEM is the location of the function call.
6183 SIBCALL indicates whether this function call is a normal call or a sibling
6184 call; a different pattern is generated accordingly. */
6185
6186 void
6187 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
6188 {
6189 rtx call, callee, tmp;
6190 rtvec vec;
6191 machine_mode mode;
6192
6193 gcc_assert (MEM_P (mem));
6194 callee = XEXP (mem, 0);
6195 mode = GET_MODE (callee);
6196 gcc_assert (mode == Pmode);
6197
6198 /* Decide if we should generate indirect calls by loading the
6199 address of the callee into a register before performing
6200 the branch-and-link. */
6201 if (SYMBOL_REF_P (callee)
6202 ? (aarch64_is_long_call_p (callee)
6203 || aarch64_is_noplt_call_p (callee))
6204 : !REG_P (callee))
6205 XEXP (mem, 0) = force_reg (mode, callee);
6206
6207 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
6208
6209 if (result != NULL_RTX)
6210 call = gen_rtx_SET (result, call);
6211
6212 if (sibcall)
6213 tmp = ret_rtx;
6214 else
6215 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
6216
6217 vec = gen_rtvec (2, call, tmp);
6218 call = gen_rtx_PARALLEL (VOIDmode, vec);
6219
6220 aarch64_emit_call_insn (call);
6221 }
6222
6223 /* Emit call insn with PAT and do aarch64-specific handling. */
6224
6225 void
6226 aarch64_emit_call_insn (rtx pat)
6227 {
6228 rtx insn = emit_call_insn (pat);
6229
6230 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
6231 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
6232 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
6233 }
6234
6235 machine_mode
6236 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
6237 {
6238 /* Floating-point comparisons return CCFPmode for quiet comparisons (equality,
6239 ORDERED/UNORDERED and the UN* forms), and CCFPEmode for signalling ones. */
6240 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
6241 {
6242 switch (code)
6243 {
6244 case EQ:
6245 case NE:
6246 case UNORDERED:
6247 case ORDERED:
6248 case UNLT:
6249 case UNLE:
6250 case UNGT:
6251 case UNGE:
6252 case UNEQ:
6253 return CCFPmode;
6254
6255 case LT:
6256 case LE:
6257 case GT:
6258 case GE:
6259 case LTGT:
6260 return CCFPEmode;
6261
6262 default:
6263 gcc_unreachable ();
6264 }
6265 }
6266
6267 /* Equality comparisons of short modes against zero can be performed
6268 using the TST instruction with the appropriate bitmask. */
6269 if (y == const0_rtx && REG_P (x)
6270 && (code == EQ || code == NE)
6271 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
6272 return CC_NZmode;
6273
6274 /* Similarly, comparisons of zero_extends from shorter modes can
6275 be performed using an ANDS with an immediate mask. */
6276 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
6277 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6278 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
6279 && (code == EQ || code == NE))
6280 return CC_NZmode;
6281
6282 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6283 && y == const0_rtx
6284 && (code == EQ || code == NE || code == LT || code == GE)
6285 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
6286 || GET_CODE (x) == NEG
6287 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
6288 && CONST_INT_P (XEXP (x, 2)))))
6289 return CC_NZmode;
6290
6291 /* A compare with a shifted operand. Because of canonicalization,
6292 the comparison will have to be swapped when we emit the assembly
6293 code. */
6294 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6295 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
6296 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
6297 || GET_CODE (x) == LSHIFTRT
6298 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
6299 return CC_SWPmode;
6300
6301 /* Similarly for a negated operand, but we can only do this for
6302 equalities. */
6303 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6304 && (REG_P (y) || GET_CODE (y) == SUBREG)
6305 && (code == EQ || code == NE)
6306 && GET_CODE (x) == NEG)
6307 return CC_Zmode;
6308
6309 /* A test for unsigned overflow. */
6310 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6311 && code == NE
6312 && GET_CODE (x) == PLUS
6313 && GET_CODE (y) == ZERO_EXTEND)
6314 return CC_Cmode;
6315
6316 /* A test for signed overflow. */
6317 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6318 && code == NE
6319 && GET_CODE (x) == PLUS
6320 && GET_CODE (y) == SIGN_EXTEND)
6321 return CC_Vmode;
6322
6323 /* For everything else, return CCmode. */
6324 return CCmode;
6325 }
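/* For illustration: comparing (plus:DI x y) against zero for EQ selects
   CC_NZmode, so the flags can be produced by an ADDS instruction, while
   comparing (ashift:DI x (const_int 2)) against a register selects
   CC_SWPmode because the comparison must be emitted with swapped
   operands, e.g. as CMP Xn, Xm, lsl 2.  */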
6326
6327 static int
6328 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
6329
6330 int
6331 aarch64_get_condition_code (rtx x)
6332 {
6333 machine_mode mode = GET_MODE (XEXP (x, 0));
6334 enum rtx_code comp_code = GET_CODE (x);
6335
6336 if (GET_MODE_CLASS (mode) != MODE_CC)
6337 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
6338 return aarch64_get_condition_code_1 (mode, comp_code);
6339 }
6340
6341 static int
6342 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
6343 {
6344 switch (mode)
6345 {
6346 case E_CCFPmode:
6347 case E_CCFPEmode:
6348 switch (comp_code)
6349 {
6350 case GE: return AARCH64_GE;
6351 case GT: return AARCH64_GT;
6352 case LE: return AARCH64_LS;
6353 case LT: return AARCH64_MI;
6354 case NE: return AARCH64_NE;
6355 case EQ: return AARCH64_EQ;
6356 case ORDERED: return AARCH64_VC;
6357 case UNORDERED: return AARCH64_VS;
6358 case UNLT: return AARCH64_LT;
6359 case UNLE: return AARCH64_LE;
6360 case UNGT: return AARCH64_HI;
6361 case UNGE: return AARCH64_PL;
6362 default: return -1;
6363 }
6364 break;
6365
6366 case E_CCmode:
6367 switch (comp_code)
6368 {
6369 case NE: return AARCH64_NE;
6370 case EQ: return AARCH64_EQ;
6371 case GE: return AARCH64_GE;
6372 case GT: return AARCH64_GT;
6373 case LE: return AARCH64_LE;
6374 case LT: return AARCH64_LT;
6375 case GEU: return AARCH64_CS;
6376 case GTU: return AARCH64_HI;
6377 case LEU: return AARCH64_LS;
6378 case LTU: return AARCH64_CC;
6379 default: return -1;
6380 }
6381 break;
6382
6383 case E_CC_SWPmode:
6384 switch (comp_code)
6385 {
6386 case NE: return AARCH64_NE;
6387 case EQ: return AARCH64_EQ;
6388 case GE: return AARCH64_LE;
6389 case GT: return AARCH64_LT;
6390 case LE: return AARCH64_GE;
6391 case LT: return AARCH64_GT;
6392 case GEU: return AARCH64_LS;
6393 case GTU: return AARCH64_CC;
6394 case LEU: return AARCH64_CS;
6395 case LTU: return AARCH64_HI;
6396 default: return -1;
6397 }
6398 break;
6399
6400 case E_CC_NZmode:
6401 switch (comp_code)
6402 {
6403 case NE: return AARCH64_NE;
6404 case EQ: return AARCH64_EQ;
6405 case GE: return AARCH64_PL;
6406 case LT: return AARCH64_MI;
6407 default: return -1;
6408 }
6409 break;
6410
6411 case E_CC_Zmode:
6412 switch (comp_code)
6413 {
6414 case NE: return AARCH64_NE;
6415 case EQ: return AARCH64_EQ;
6416 default: return -1;
6417 }
6418 break;
6419
6420 case E_CC_Cmode:
6421 switch (comp_code)
6422 {
6423 case NE: return AARCH64_CS;
6424 case EQ: return AARCH64_CC;
6425 default: return -1;
6426 }
6427 break;
6428
6429 case E_CC_Vmode:
6430 switch (comp_code)
6431 {
6432 case NE: return AARCH64_VS;
6433 case EQ: return AARCH64_VC;
6434 default: return -1;
6435 }
6436 break;
6437
6438 default:
6439 return -1;
6440 }
6441
6442 return -1;
6443 }
6444
6445 bool
6446 aarch64_const_vec_all_same_in_range_p (rtx x,
6447 HOST_WIDE_INT minval,
6448 HOST_WIDE_INT maxval)
6449 {
6450 rtx elt;
6451 return (const_vec_duplicate_p (x, &elt)
6452 && CONST_INT_P (elt)
6453 && IN_RANGE (INTVAL (elt), minval, maxval));
6454 }
6455
6456 bool
6457 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
6458 {
6459 return aarch64_const_vec_all_same_in_range_p (x, val, val);
6460 }
6461
6462 /* Return true if VEC is a constant in which every element is in the range
6463 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
6464
6465 static bool
6466 aarch64_const_vec_all_in_range_p (rtx vec,
6467 HOST_WIDE_INT minval,
6468 HOST_WIDE_INT maxval)
6469 {
6470 if (GET_CODE (vec) != CONST_VECTOR
6471 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
6472 return false;
6473
6474 int nunits;
6475 if (!CONST_VECTOR_STEPPED_P (vec))
6476 nunits = const_vector_encoded_nelts (vec);
6477 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
6478 return false;
6479
6480 for (int i = 0; i < nunits; i++)
6481 {
6482 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
6483 if (!CONST_INT_P (vec_elem)
6484 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
6485 return false;
6486 }
6487 return true;
6488 }
6489
6490 /* N Z C V. */
6491 #define AARCH64_CC_V 1
6492 #define AARCH64_CC_C (1 << 1)
6493 #define AARCH64_CC_Z (1 << 2)
6494 #define AARCH64_CC_N (1 << 3)
6495
6496 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
6497 static const int aarch64_nzcv_codes[] =
6498 {
6499 0, /* EQ, Z == 1. */
6500 AARCH64_CC_Z, /* NE, Z == 0. */
6501 0, /* CS, C == 1. */
6502 AARCH64_CC_C, /* CC, C == 0. */
6503 0, /* MI, N == 1. */
6504 AARCH64_CC_N, /* PL, N == 0. */
6505 0, /* VS, V == 1. */
6506 AARCH64_CC_V, /* VC, V == 0. */
6507 0, /* HI, C == 1 && Z == 0. */
6508 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
6509 AARCH64_CC_V, /* GE, N == V. */
6510 0, /* LT, N != V. */
6511 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
6512 0, /* LE, !(Z == 0 && N == V). */
6513 0, /* AL, Any. */
6514 0 /* NV, Any. */
6515 };
6516
6517 /* Print floating-point vector immediate operand X to F, negating it
6518 first if NEGATE is true. Return true on success, false if it isn't
6519 a constant we can handle. */
6520
6521 static bool
6522 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
6523 {
6524 rtx elt;
6525
6526 if (!const_vec_duplicate_p (x, &elt))
6527 return false;
6528
6529 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
6530 if (negate)
6531 r = real_value_negate (&r);
6532
6533 /* We only handle the SVE single-bit immediates here. */
6534 if (real_equal (&r, &dconst0))
6535 asm_fprintf (f, "0.0");
6536 else if (real_equal (&r, &dconst1))
6537 asm_fprintf (f, "1.0");
6538 else if (real_equal (&r, &dconsthalf))
6539 asm_fprintf (f, "0.5");
6540 else
6541 return false;
6542
6543 return true;
6544 }
6545
6546 /* Return the equivalent letter for size. */
6547 static char
6548 sizetochar (int size)
6549 {
6550 switch (size)
6551 {
6552 case 64: return 'd';
6553 case 32: return 's';
6554 case 16: return 'h';
6555 case 8 : return 'b';
6556 default: gcc_unreachable ();
6557 }
6558 }
6559
6560 /* Print operand X to file F in a target specific manner according to CODE.
6561 The acceptable formatting commands given by CODE are:
6562 'c': An integer or symbol address without a preceding #
6563 sign.
6564 'C': Take the duplicated element in a vector constant
6565 and print it in hex.
6566 'D': Take the duplicated element in a vector constant
6567 and print it as an unsigned integer, in decimal.
6568 'e': Print the sign/zero-extend size as a character 8->b,
6569 16->h, 32->w.
6570 'p': Prints N such that 2^N == X (X must be a power of 2 and a
6571 const_int).
6572 'P': Print the number of non-zero bits in X (a const_int).
6573 'H': Print the higher numbered register of a pair (TImode)
6574 of regs.
6575 'm': Print a condition (eq, ne, etc).
6576 'M': Same as 'm', but invert condition.
6577 'N': Take the duplicated element in a vector constant
6578 and print the negative of it in decimal.
6579 'b/h/s/d/q': Print a scalar FP/SIMD register name.
6580 'S/T/U/V': Print a FP/SIMD register name for a register list.
6581 The register printed is the FP/SIMD register name
6582 of X + 0/1/2/3 for S/T/U/V.
6583 'R': Print a scalar FP/SIMD register name + 1.
6584 'X': Print bottom 16 bits of integer constant in hex.
6585 'w/x': Print a general register name or the zero register
6586 (32-bit or 64-bit).
6587 '0': Print a normal operand, if it's a general register,
6588 then we assume DImode.
6589 'k': Print NZCV for conditional compare instructions.
6590 'A': Output address constant representing the first
6591 argument of X, specifying a relocation offset
6592 if appropriate.
6593 'L': Output constant address specified by X
6594 with a relocation offset if appropriate.
6595 'G': Prints address of X, specifying a PC relative
6596 relocation mode if appropriate.
6597 'y': Output address of LDP or STP - this is used for
6598 some LDP/STPs which don't use a PARALLEL in their
6599 pattern (so the mode needs to be adjusted).
6600 'z': Output address of a typical LDP or STP. */
6601
6602 static void
6603 aarch64_print_operand (FILE *f, rtx x, int code)
6604 {
6605 rtx elt;
6606 switch (code)
6607 {
6608 case 'c':
6609 switch (GET_CODE (x))
6610 {
6611 case CONST_INT:
6612 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
6613 break;
6614
6615 case SYMBOL_REF:
6616 output_addr_const (f, x);
6617 break;
6618
6619 case CONST:
6620 if (GET_CODE (XEXP (x, 0)) == PLUS
6621 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
6622 {
6623 output_addr_const (f, x);
6624 break;
6625 }
6626 /* Fall through. */
6627
6628 default:
6629 output_operand_lossage ("unsupported operand for code '%c'", code);
6630 }
6631 break;
6632
6633 case 'e':
6634 {
6635 int n;
6636
6637 if (!CONST_INT_P (x)
6638 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
6639 {
6640 output_operand_lossage ("invalid operand for '%%%c'", code);
6641 return;
6642 }
6643
6644 switch (n)
6645 {
6646 case 3:
6647 fputc ('b', f);
6648 break;
6649 case 4:
6650 fputc ('h', f);
6651 break;
6652 case 5:
6653 fputc ('w', f);
6654 break;
6655 default:
6656 output_operand_lossage ("invalid operand for '%%%c'", code);
6657 return;
6658 }
6659 }
6660 break;
6661
6662 case 'p':
6663 {
6664 int n;
6665
6666 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
6667 {
6668 output_operand_lossage ("invalid operand for '%%%c'", code);
6669 return;
6670 }
6671
6672 asm_fprintf (f, "%d", n);
6673 }
6674 break;
6675
6676 case 'P':
6677 if (!CONST_INT_P (x))
6678 {
6679 output_operand_lossage ("invalid operand for '%%%c'", code);
6680 return;
6681 }
6682
6683 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
6684 break;
6685
6686 case 'H':
6687 if (x == const0_rtx)
6688 {
6689 asm_fprintf (f, "xzr");
6690 break;
6691 }
6692
6693 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
6694 {
6695 output_operand_lossage ("invalid operand for '%%%c'", code);
6696 return;
6697 }
6698
6699 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
6700 break;
6701
6702 case 'M':
6703 case 'm':
6704 {
6705 int cond_code;
6706 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
6707 if (x == const_true_rtx)
6708 {
6709 if (code == 'M')
6710 fputs ("nv", f);
6711 return;
6712 }
6713
6714 if (!COMPARISON_P (x))
6715 {
6716 output_operand_lossage ("invalid operand for '%%%c'", code);
6717 return;
6718 }
6719
6720 cond_code = aarch64_get_condition_code (x);
6721 gcc_assert (cond_code >= 0);
6722 if (code == 'M')
6723 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
6724 fputs (aarch64_condition_codes[cond_code], f);
6725 }
6726 break;
6727
6728 case 'N':
6729 if (!const_vec_duplicate_p (x, &elt))
6730 {
6731 output_operand_lossage ("invalid vector constant");
6732 return;
6733 }
6734
6735 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6736 asm_fprintf (f, "%wd", -INTVAL (elt));
6737 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6738 && aarch64_print_vector_float_operand (f, x, true))
6739 ;
6740 else
6741 {
6742 output_operand_lossage ("invalid vector constant");
6743 return;
6744 }
6745 break;
6746
6747 case 'b':
6748 case 'h':
6749 case 's':
6750 case 'd':
6751 case 'q':
6752 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6753 {
6754 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6755 return;
6756 }
6757 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
6758 break;
6759
6760 case 'S':
6761 case 'T':
6762 case 'U':
6763 case 'V':
6764 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6765 {
6766 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6767 return;
6768 }
6769 asm_fprintf (f, "%c%d",
6770 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
6771 REGNO (x) - V0_REGNUM + (code - 'S'));
6772 break;
6773
6774 case 'R':
6775 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6776 {
6777 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6778 return;
6779 }
6780 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
6781 break;
6782
6783 case 'X':
6784 if (!CONST_INT_P (x))
6785 {
6786 output_operand_lossage ("invalid operand for '%%%c'", code);
6787 return;
6788 }
6789 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
6790 break;
6791
6792 case 'C':
6793 {
6794 /* Print a replicated constant in hex. */
6795 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6796 {
6797 output_operand_lossage ("invalid operand for '%%%c'", code);
6798 return;
6799 }
6800 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6801 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6802 }
6803 break;
6804
6805 case 'D':
6806 {
6807 /* Print a replicated constant in decimal, treating it as
6808 unsigned. */
6809 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6810 {
6811 output_operand_lossage ("invalid operand for '%%%c'", code);
6812 return;
6813 }
6814 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6815 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6816 }
6817 break;
6818
6819 case 'w':
6820 case 'x':
6821 if (x == const0_rtx
6822 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
6823 {
6824 asm_fprintf (f, "%czr", code);
6825 break;
6826 }
6827
6828 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
6829 {
6830 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
6831 break;
6832 }
6833
6834 if (REG_P (x) && REGNO (x) == SP_REGNUM)
6835 {
6836 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
6837 break;
6838 }
6839
6840 /* Fall through */
6841
6842 case 0:
6843 if (x == NULL)
6844 {
6845 output_operand_lossage ("missing operand");
6846 return;
6847 }
6848
6849 switch (GET_CODE (x))
6850 {
6851 case REG:
6852 if (aarch64_sve_data_mode_p (GET_MODE (x)))
6853 {
6854 if (REG_NREGS (x) == 1)
6855 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
6856 else
6857 {
6858 char suffix
6859 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
6860 asm_fprintf (f, "{z%d.%c - z%d.%c}",
6861 REGNO (x) - V0_REGNUM, suffix,
6862 END_REGNO (x) - V0_REGNUM - 1, suffix);
6863 }
6864 }
6865 else
6866 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
6867 break;
6868
6869 case MEM:
6870 output_address (GET_MODE (x), XEXP (x, 0));
6871 break;
6872
6873 case LABEL_REF:
6874 case SYMBOL_REF:
6875 output_addr_const (asm_out_file, x);
6876 break;
6877
6878 case CONST_INT:
6879 asm_fprintf (f, "%wd", INTVAL (x));
6880 break;
6881
6882 case CONST:
6883 if (!VECTOR_MODE_P (GET_MODE (x)))
6884 {
6885 output_addr_const (asm_out_file, x);
6886 break;
6887 }
6888 /* fall through */
6889
6890 case CONST_VECTOR:
6891 if (!const_vec_duplicate_p (x, &elt))
6892 {
6893 output_operand_lossage ("invalid vector constant");
6894 return;
6895 }
6896
6897 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6898 asm_fprintf (f, "%wd", INTVAL (elt));
6899 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6900 && aarch64_print_vector_float_operand (f, x, false))
6901 ;
6902 else
6903 {
6904 output_operand_lossage ("invalid vector constant");
6905 return;
6906 }
6907 break;
6908
6909 case CONST_DOUBLE:
6910 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6911 be getting CONST_DOUBLEs holding integers. */
6912 gcc_assert (GET_MODE (x) != VOIDmode);
6913 if (aarch64_float_const_zero_rtx_p (x))
6914 {
6915 fputc ('0', f);
6916 break;
6917 }
6918 else if (aarch64_float_const_representable_p (x))
6919 {
6920 #define buf_size 20
6921 char float_buf[buf_size] = {'\0'};
6922 real_to_decimal_for_mode (float_buf,
6923 CONST_DOUBLE_REAL_VALUE (x),
6924 buf_size, buf_size,
6925 1, GET_MODE (x));
6926 asm_fprintf (asm_out_file, "%s", float_buf);
6927 break;
6928 #undef buf_size
6929 }
6930 output_operand_lossage ("invalid constant");
6931 return;
6932 default:
6933 output_operand_lossage ("invalid operand");
6934 return;
6935 }
6936 break;
6937
6938 case 'A':
6939 if (GET_CODE (x) == HIGH)
6940 x = XEXP (x, 0);
6941
6942 switch (aarch64_classify_symbolic_expression (x))
6943 {
6944 case SYMBOL_SMALL_GOT_4G:
6945 asm_fprintf (asm_out_file, ":got:");
6946 break;
6947
6948 case SYMBOL_SMALL_TLSGD:
6949 asm_fprintf (asm_out_file, ":tlsgd:");
6950 break;
6951
6952 case SYMBOL_SMALL_TLSDESC:
6953 asm_fprintf (asm_out_file, ":tlsdesc:");
6954 break;
6955
6956 case SYMBOL_SMALL_TLSIE:
6957 asm_fprintf (asm_out_file, ":gottprel:");
6958 break;
6959
6960 case SYMBOL_TLSLE24:
6961 asm_fprintf (asm_out_file, ":tprel:");
6962 break;
6963
6964 case SYMBOL_TINY_GOT:
6965 gcc_unreachable ();
6966 break;
6967
6968 default:
6969 break;
6970 }
6971 output_addr_const (asm_out_file, x);
6972 break;
6973
6974 case 'L':
6975 switch (aarch64_classify_symbolic_expression (x))
6976 {
6977 case SYMBOL_SMALL_GOT_4G:
6978 asm_fprintf (asm_out_file, ":lo12:");
6979 break;
6980
6981 case SYMBOL_SMALL_TLSGD:
6982 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
6983 break;
6984
6985 case SYMBOL_SMALL_TLSDESC:
6986 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
6987 break;
6988
6989 case SYMBOL_SMALL_TLSIE:
6990 asm_fprintf (asm_out_file, ":gottprel_lo12:");
6991 break;
6992
6993 case SYMBOL_TLSLE12:
6994 asm_fprintf (asm_out_file, ":tprel_lo12:");
6995 break;
6996
6997 case SYMBOL_TLSLE24:
6998 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
6999 break;
7000
7001 case SYMBOL_TINY_GOT:
7002 asm_fprintf (asm_out_file, ":got:");
7003 break;
7004
7005 case SYMBOL_TINY_TLSIE:
7006 asm_fprintf (asm_out_file, ":gottprel:");
7007 break;
7008
7009 default:
7010 break;
7011 }
7012 output_addr_const (asm_out_file, x);
7013 break;
7014
7015 case 'G':
7016 switch (aarch64_classify_symbolic_expression (x))
7017 {
7018 case SYMBOL_TLSLE24:
7019 asm_fprintf (asm_out_file, ":tprel_hi12:");
7020 break;
7021 default:
7022 break;
7023 }
7024 output_addr_const (asm_out_file, x);
7025 break;
7026
7027 case 'k':
7028 {
7029 HOST_WIDE_INT cond_code;
7030
7031 if (!CONST_INT_P (x))
7032 {
7033 output_operand_lossage ("invalid operand for '%%%c'", code);
7034 return;
7035 }
7036
7037 cond_code = INTVAL (x);
7038 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7039 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7040 }
7041 break;
7042
7043 case 'y':
7044 case 'z':
7045 {
7046 machine_mode mode = GET_MODE (x);
7047
7048 if (GET_CODE (x) != MEM
7049 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7050 {
7051 output_operand_lossage ("invalid operand for '%%%c'", code);
7052 return;
7053 }
7054
7055 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
7056 code == 'y'
7057 ? ADDR_QUERY_LDP_STP_N
7058 : ADDR_QUERY_LDP_STP))
7059 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7060 }
7061 break;
7062
7063 default:
7064 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7065 return;
7066 }
7067 }
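/* For illustration: with operand 0 being (reg:DI x5), "%x0" prints "x5"
   and "%w0" prints "w5"; with operand 1 being (const_int 8), "%p1"
   prints "3" and "%P1" prints "1".  */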
7068
7069 /* Print address 'x' of a memory access with mode 'mode'.
7070 TYPE is the aarch64_addr_query_type context passed to aarch64_classify_address:
7071 ADDR_QUERY_ANY for a normal memory access, or an LDP/STP query for load/store pairs. */
7072 static bool
7073 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7074 aarch64_addr_query_type type)
7075 {
7076 struct aarch64_address_info addr;
7077 unsigned int size;
7078
7079 /* Check that the address is Pmode; this also holds for ILP32. */
7080 if (GET_MODE (x) != Pmode)
7081 output_operand_lossage ("invalid address mode");
7082
7083 if (aarch64_classify_address (&addr, x, mode, true, type))
7084 switch (addr.type)
7085 {
7086 case ADDRESS_REG_IMM:
7087 if (known_eq (addr.const_offset, 0))
7088 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
7089 else if (aarch64_sve_data_mode_p (mode))
7090 {
7091 HOST_WIDE_INT vnum
7092 = exact_div (addr.const_offset,
7093 BYTES_PER_SVE_VECTOR).to_constant ();
7094 asm_fprintf (f, "[%s, #%wd, mul vl]",
7095 reg_names[REGNO (addr.base)], vnum);
7096 }
7097 else if (aarch64_sve_pred_mode_p (mode))
7098 {
7099 HOST_WIDE_INT vnum
7100 = exact_div (addr.const_offset,
7101 BYTES_PER_SVE_PRED).to_constant ();
7102 asm_fprintf (f, "[%s, #%wd, mul vl]",
7103 reg_names[REGNO (addr.base)], vnum);
7104 }
7105 else
7106 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
7107 INTVAL (addr.offset));
7108 return true;
7109
7110 case ADDRESS_REG_REG:
7111 if (addr.shift == 0)
7112 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
7113 reg_names [REGNO (addr.offset)]);
7114 else
7115 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
7116 reg_names [REGNO (addr.offset)], addr.shift);
7117 return true;
7118
7119 case ADDRESS_REG_UXTW:
7120 if (addr.shift == 0)
7121 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
7122 REGNO (addr.offset) - R0_REGNUM);
7123 else
7124 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
7125 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7126 return true;
7127
7128 case ADDRESS_REG_SXTW:
7129 if (addr.shift == 0)
7130 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
7131 REGNO (addr.offset) - R0_REGNUM);
7132 else
7133 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
7134 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7135 return true;
7136
7137 case ADDRESS_REG_WB:
7138 /* Writeback is only supported for fixed-width modes. */
7139 size = GET_MODE_SIZE (mode).to_constant ();
7140 switch (GET_CODE (x))
7141 {
7142 case PRE_INC:
7143 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
7144 return true;
7145 case POST_INC:
7146 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
7147 return true;
7148 case PRE_DEC:
7149 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7150 return true;
7151 case POST_DEC:
7152 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7153 return true;
7154 case PRE_MODIFY:
7155 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7156 INTVAL (addr.offset));
7157 return true;
7158 case POST_MODIFY:
7159 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
7160 INTVAL (addr.offset));
7161 return true;
7162 default:
7163 break;
7164 }
7165 break;
7166
7167 case ADDRESS_LO_SUM:
7168 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
7169 output_addr_const (f, addr.offset);
7170 asm_fprintf (f, "]");
7171 return true;
7172
7173 case ADDRESS_SYMBOLIC:
7174 output_addr_const (f, x);
7175 return true;
7176 }
7177
7178 return false;
7179 }
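
/* Illustrative sketch (not part of the original source): depending on the
   classified address type, the cases above print, for example:

     ADDRESS_REG_IMM            [x0]  or  [x0, 16]
     ADDRESS_REG_IMM (SVE)      [x0, #2, mul vl]
     ADDRESS_REG_REG            [x0, x1]  or  [x0, x1, lsl 3]
     ADDRESS_REG_UXTW / _SXTW   [x0, w1, uxtw 3]  /  [x0, w1, sxtw 3]
     ADDRESS_REG_WB (PRE_INC)   [x0, 8]!      (POST_INC)  [x0], 8
     ADDRESS_LO_SUM             [x0, #:lo12:sym]

   The exact spellings follow the asm_fprintf format strings above; the
   register numbers are placeholders.  */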
7180
7181 /* Print address 'x' of a memory access with mode 'mode'. */
7182 static void
7183 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
7184 {
7185 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
7186 output_addr_const (f, x);
7187 }
7188
7189 bool
7190 aarch64_label_mentioned_p (rtx x)
7191 {
7192 const char *fmt;
7193 int i;
7194
7195 if (GET_CODE (x) == LABEL_REF)
7196 return true;
7197
7198 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7199 referencing instruction, but they are constant offsets, not
7200 symbols. */
7201 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7202 return false;
7203
7204 fmt = GET_RTX_FORMAT (GET_CODE (x));
7205 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
7206 {
7207 if (fmt[i] == 'E')
7208 {
7209 int j;
7210
7211 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
7212 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
7213 return 1;
7214 }
7215 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
7216 return 1;
7217 }
7218
7219 return 0;
7220 }
7221
7222 /* Implement REGNO_REG_CLASS. */
7223
7224 enum reg_class
7225 aarch64_regno_regclass (unsigned regno)
7226 {
7227 if (GP_REGNUM_P (regno))
7228 return GENERAL_REGS;
7229
7230 if (regno == SP_REGNUM)
7231 return STACK_REG;
7232
7233 if (regno == FRAME_POINTER_REGNUM
7234 || regno == ARG_POINTER_REGNUM)
7235 return POINTER_REGS;
7236
7237 if (FP_REGNUM_P (regno))
7238 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
7239
7240 if (PR_REGNUM_P (regno))
7241 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
7242
7243 return NO_REGS;
7244 }
7245
7246 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
7247 If OFFSET is out of range, return an offset of an anchor point
7248 that is in range. Return 0 otherwise. */
7249
7250 static HOST_WIDE_INT
7251 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
7252 machine_mode mode)
7253 {
7254 /* Does it look like we'll need a 16-byte load/store-pair operation? */
7255 if (size > 16)
7256 return (offset + 0x400) & ~0x7f0;
7257
7258 /* For offsets that aren't a multiple of the access size, the limit is
7259 -256...255. */
7260 if (offset & (size - 1))
7261 {
7262 /* BLKmode typically uses LDP of X-registers. */
7263 if (mode == BLKmode)
7264 return (offset + 512) & ~0x3ff;
7265 return (offset + 0x100) & ~0x1ff;
7266 }
7267
7268 /* Small negative offsets are supported. */
7269 if (IN_RANGE (offset, -256, 0))
7270 return 0;
7271
7272 if (mode == TImode || mode == TFmode)
7273 return (offset + 0x100) & ~0x1ff;
7274
7275 /* Use a 12-bit offset, scaled by the access size. */
7276 return offset & (~0xfff * size);
7277 }
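
/* Worked example (illustrative, not in the original source): an SImode
   access at offset 0x6234 is aligned (0x6234 & 3 == 0), so the final case
   applies and the anchor is 0x6234 & ~0x3fff == 0x4000.  The caller
   (aarch64_legitimize_address, below) then rebases the address as
   (base + 0x4000) + 0x2234, where 0x2234 fits the scaled unsigned 12-bit
   LDR/STR immediate range (0 .. 0xfff * 4).  A misaligned HImode access
   at offset 0x101 instead gets the anchor (0x101 + 0x100) & ~0x1ff == 0x200,
   leaving a residual offset of -0xff, inside the unscaled -256..255 range.  */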
7278
7279 static rtx
7280 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
7281 {
7282 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7283 where mask is selected by alignment and size of the offset.
7284 We try to pick as large a range for the offset as possible to
7285 maximize the chance of a CSE. However, for aligned addresses
7286 we limit the range to 4k so that structures with different sized
7287 elements are likely to use the same base. We need to be careful
7288 not to split a CONST for some forms of address expression, otherwise
7289 it will generate sub-optimal code. */
7290
7291 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
7292 {
7293 rtx base = XEXP (x, 0);
7294 rtx offset_rtx = XEXP (x, 1);
7295 HOST_WIDE_INT offset = INTVAL (offset_rtx);
7296
7297 if (GET_CODE (base) == PLUS)
7298 {
7299 rtx op0 = XEXP (base, 0);
7300 rtx op1 = XEXP (base, 1);
7301
7302 /* Force any scaling into a temp for CSE. */
7303 op0 = force_reg (Pmode, op0);
7304 op1 = force_reg (Pmode, op1);
7305
7306 /* Let the pointer register be in op0. */
7307 if (REG_POINTER (op1))
7308 std::swap (op0, op1);
7309
7310 /* If the pointer is virtual or frame related, then we know that
7311 virtual register instantiation or register elimination is going
7312 to apply a second constant. We want the two constants folded
7313 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
7314 if (virt_or_elim_regno_p (REGNO (op0)))
7315 {
7316 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
7317 NULL_RTX, true, OPTAB_DIRECT);
7318 return gen_rtx_PLUS (Pmode, base, op1);
7319 }
7320
7321 /* Otherwise, in order to encourage CSE (and thence loop strength
7322 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
7323 base = expand_binop (Pmode, add_optab, op0, op1,
7324 NULL_RTX, true, OPTAB_DIRECT);
7325 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
7326 }
7327
7328 HOST_WIDE_INT size;
7329 if (GET_MODE_SIZE (mode).is_constant (&size))
7330 {
7331 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
7332 mode);
7333 if (base_offset != 0)
7334 {
7335 base = plus_constant (Pmode, base, base_offset);
7336 base = force_operand (base, NULL_RTX);
7337 return plus_constant (Pmode, base, offset - base_offset);
7338 }
7339 }
7340 }
7341
7342 return x;
7343 }
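
/* Illustrative example (an assumption about typical RTL, not taken from the
   original source): for (plus (plus (reg F) (reg I)) (const_int 0x6234)) in
   SImode, where F is a virtual or eliminable pointer, the code above emits
   F + 0x6234 into a temporary and returns (plus temp I), so that elimination
   can fold its own constant into the 0x6234.  For two ordinary registers it
   instead computes F + I first and then splits the constant via
   aarch64_anchor_offset, so the returned address is (plus temp2 0x2234) with
   temp2 holding (F + I) + 0x4000.  */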
7344
7345 static reg_class_t
7346 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
7347 reg_class_t rclass,
7348 machine_mode mode,
7349 secondary_reload_info *sri)
7350 {
7351 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7352 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
7353 comment at the head of aarch64-sve.md for more details about the
7354 big-endian handling. */
7355 if (BYTES_BIG_ENDIAN
7356 && reg_class_subset_p (rclass, FP_REGS)
7357 && !((REG_P (x) && HARD_REGISTER_P (x))
7358 || aarch64_simd_valid_immediate (x, NULL))
7359 && aarch64_sve_data_mode_p (mode))
7360 {
7361 sri->icode = CODE_FOR_aarch64_sve_reload_be;
7362 return NO_REGS;
7363 }
7364
7365 /* If we have to disable direct literal pool loads and stores because the
7366 function is too big, then we need a scratch register. */
7367 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
7368 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
7369 || targetm.vector_mode_supported_p (GET_MODE (x)))
7370 && !aarch64_pcrelative_literal_loads)
7371 {
7372 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
7373 return NO_REGS;
7374 }
7375
7376 /* Without the TARGET_SIMD instructions we cannot move a Q register
7377 to a Q register directly. We need a scratch. */
7378 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
7379 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
7380 && reg_class_subset_p (rclass, FP_REGS))
7381 {
7382 sri->icode = code_for_aarch64_reload_mov (mode);
7383 return NO_REGS;
7384 }
7385
7386 /* A TFmode or TImode memory access should be handled via an FP register
7387 because AArch64 has richer addressing modes for LDR/STR instructions
7388 than for LDP/STP instructions. */
7389 if (TARGET_FLOAT && rclass == GENERAL_REGS
7390 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
7391 return FP_REGS;
7392
7393 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
7394 return GENERAL_REGS;
7395
7396 return NO_REGS;
7397 }
7398
7399 static bool
7400 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
7401 {
7402 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
7403
7404 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7405 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
7406 if (frame_pointer_needed)
7407 return to == HARD_FRAME_POINTER_REGNUM;
7408 return true;
7409 }
7410
7411 poly_int64
7412 aarch64_initial_elimination_offset (unsigned from, unsigned to)
7413 {
7414 if (to == HARD_FRAME_POINTER_REGNUM)
7415 {
7416 if (from == ARG_POINTER_REGNUM)
7417 return cfun->machine->frame.hard_fp_offset;
7418
7419 if (from == FRAME_POINTER_REGNUM)
7420 return cfun->machine->frame.hard_fp_offset
7421 - cfun->machine->frame.locals_offset;
7422 }
7423
7424 if (to == STACK_POINTER_REGNUM)
7425 {
7426 if (from == FRAME_POINTER_REGNUM)
7427 return cfun->machine->frame.frame_size
7428 - cfun->machine->frame.locals_offset;
7429 }
7430
7431 return cfun->machine->frame.frame_size;
7432 }
7433
7434 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
7435 previous frame. */
7436
7437 rtx
7438 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
7439 {
7440 if (count != 0)
7441 return const0_rtx;
7442 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
7443 }
7444
7445
7446 static void
7447 aarch64_asm_trampoline_template (FILE *f)
7448 {
7449 if (TARGET_ILP32)
7450 {
7451 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
7452 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
7453 }
7454 else
7455 {
7456 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
7457 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
7458 }
7459 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
7460 assemble_aligned_integer (4, const0_rtx);
7461 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7462 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7463 }
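
/* Resulting trampoline layout for LP64 (a sketch inferred from the template
   above and from aarch64_trampoline_init below, not an excerpt of the
   original source); x17 (IP1_REGNUM) and x18 (STATIC_CHAIN_REGNUM) are
   assumptions about the register numbering in aarch64.h:

     offset  0:  ldr  x17, .+16        ; load the target function address
     offset  4:  ldr  x18, .+20        ; load the static chain value
     offset  8:  br   x17
     offset 12:  <4-byte pad>
     offset 16:  <function address>    ; written by aarch64_trampoline_init
     offset 24:  <static chain value>  ; written by aarch64_trampoline_init  */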
7464
7465 static void
7466 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
7467 {
7468 rtx fnaddr, mem, a_tramp;
7469 const int tramp_code_sz = 16;
7470
7471 /* Don't need to copy the trailing D-words; we fill those in below. */
7472 emit_block_move (m_tramp, assemble_trampoline_template (),
7473 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
7474 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
7475 fnaddr = XEXP (DECL_RTL (fndecl), 0);
7476 if (GET_MODE (fnaddr) != ptr_mode)
7477 fnaddr = convert_memory_address (ptr_mode, fnaddr);
7478 emit_move_insn (mem, fnaddr);
7479
7480 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
7481 emit_move_insn (mem, chain_value);
7482
7483 /* XXX We should really define a "clear_cache" pattern and use
7484 gen_clear_cache(). */
7485 a_tramp = XEXP (m_tramp, 0);
7486 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
7487 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
7488 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
7489 ptr_mode);
7490 }
7491
7492 static unsigned char
7493 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
7494 {
7495 /* ??? Logically we should only need to provide a value when
7496 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7497 can hold MODE, but at the moment we need to handle all modes.
7498 Just ignore any runtime parts for registers that can't store them. */
7499 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
7500 unsigned int nregs;
7501 switch (regclass)
7502 {
7503 case TAILCALL_ADDR_REGS:
7504 case POINTER_REGS:
7505 case GENERAL_REGS:
7506 case ALL_REGS:
7507 case POINTER_AND_FP_REGS:
7508 case FP_REGS:
7509 case FP_LO_REGS:
7510 if (aarch64_sve_data_mode_p (mode)
7511 && constant_multiple_p (GET_MODE_SIZE (mode),
7512 BYTES_PER_SVE_VECTOR, &nregs))
7513 return nregs;
7514 return (aarch64_vector_data_mode_p (mode)
7515 ? CEIL (lowest_size, UNITS_PER_VREG)
7516 : CEIL (lowest_size, UNITS_PER_WORD));
7517 case STACK_REG:
7518 case PR_REGS:
7519 case PR_LO_REGS:
7520 case PR_HI_REGS:
7521 return 1;
7522
7523 case NO_REGS:
7524 return 0;
7525
7526 default:
7527 break;
7528 }
7529 gcc_unreachable ();
7530 }
7531
7532 static reg_class_t
7533 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
7534 {
7535 if (regclass == POINTER_REGS)
7536 return GENERAL_REGS;
7537
7538 if (regclass == STACK_REG)
7539 {
7540 if (REG_P(x)
7541 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
7542 return regclass;
7543
7544 return NO_REGS;
7545 }
7546
7547 /* Register elimination can result in a request for
7548 SP+constant->FP_REGS. We cannot support such operations, which
7549 use SP as the source and an FP_REG as the destination, so reject
7550 them outright. */
7551 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
7552 {
7553 rtx lhs = XEXP (x, 0);
7554
7555 /* Look through a possible SUBREG introduced by ILP32. */
7556 if (GET_CODE (lhs) == SUBREG)
7557 lhs = SUBREG_REG (lhs);
7558
7559 gcc_assert (REG_P (lhs));
7560 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
7561 POINTER_REGS));
7562 return NO_REGS;
7563 }
7564
7565 return regclass;
7566 }
7567
7568 void
7569 aarch64_asm_output_labelref (FILE* f, const char *name)
7570 {
7571 asm_fprintf (f, "%U%s", name);
7572 }
7573
7574 static void
7575 aarch64_elf_asm_constructor (rtx symbol, int priority)
7576 {
7577 if (priority == DEFAULT_INIT_PRIORITY)
7578 default_ctor_section_asm_out_constructor (symbol, priority);
7579 else
7580 {
7581 section *s;
7582 /* While the priority is known to be in the range [0, 65535], and so
7583 18 bytes would be enough, the compiler might not know that. To avoid
7584 a -Wformat-truncation false positive, use a larger size. */
7585 char buf[23];
7586 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
7587 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7588 switch_to_section (s);
7589 assemble_align (POINTER_SIZE);
7590 assemble_aligned_integer (POINTER_BYTES, symbol);
7591 }
7592 }
7593
7594 static void
7595 aarch64_elf_asm_destructor (rtx symbol, int priority)
7596 {
7597 if (priority == DEFAULT_INIT_PRIORITY)
7598 default_dtor_section_asm_out_destructor (symbol, priority);
7599 else
7600 {
7601 section *s;
7602 /* While the priority is known to be in the range [0, 65535], and so
7603 18 bytes would be enough, the compiler might not know that. To avoid
7604 a -Wformat-truncation false positive, use a larger size. */
7605 char buf[23];
7606 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
7607 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7608 switch_to_section (s);
7609 assemble_align (POINTER_SIZE);
7610 assemble_aligned_integer (POINTER_BYTES, symbol);
7611 }
7612 }
7613
7614 const char*
7615 aarch64_output_casesi (rtx *operands)
7616 {
7617 char buf[100];
7618 char label[100];
7619 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
7620 int index;
7621 static const char *const patterns[4][2] =
7622 {
7623 {
7624 "ldrb\t%w3, [%0,%w1,uxtw]",
7625 "add\t%3, %4, %w3, sxtb #2"
7626 },
7627 {
7628 "ldrh\t%w3, [%0,%w1,uxtw #1]",
7629 "add\t%3, %4, %w3, sxth #2"
7630 },
7631 {
7632 "ldr\t%w3, [%0,%w1,uxtw #2]",
7633 "add\t%3, %4, %w3, sxtw #2"
7634 },
7635 /* We assume that DImode is only generated when not optimizing and
7636 that we don't really need 64-bit address offsets. That would
7637 imply an object file with 8GB of code in a single function! */
7638 {
7639 "ldr\t%w3, [%0,%w1,uxtw #2]",
7640 "add\t%3, %4, %w3, sxtw #2"
7641 }
7642 };
7643
7644 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
7645
7646 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
7647 index = exact_log2 (GET_MODE_SIZE (mode));
7648
7649 gcc_assert (index >= 0 && index <= 3);
7650
7651 /* Need to implement table size reduction, by changing the code below. */
7652 output_asm_insn (patterns[index][0], operands);
7653 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
7654 snprintf (buf, sizeof (buf),
7655 "adr\t%%4, %s", targetm.strip_name_encoding (label));
7656 output_asm_insn (buf, operands);
7657 output_asm_insn (patterns[index][1], operands);
7658 output_asm_insn ("br\t%3", operands);
7659 assemble_label (asm_out_file, label);
7660 return "";
7661 }
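
/* Illustrative emitted sequence for a HImode dispatch table (index 1 above);
   the register numbers are placeholders, not taken from the original source:

     ldrh  w3, [x0, w1, uxtw #1]   ; load the scaled table entry
     adr   x4, .Lrtx<N>            ; anchor label emitted below
     add   x3, x4, w3, sxth #2     ; entry holds (target - .Lrtx<N>) / 4
     br    x3
   .Lrtx<N>:

   The division by 4 in the table entries is an assumption based on the
   "sxth #2" scaling.  */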
7662
7663
7664 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7665 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
7666 operator. */
7667
7668 int
7669 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
7670 {
7671 if (shift >= 0 && shift <= 3)
7672 {
7673 int size;
7674 for (size = 8; size <= 32; size *= 2)
7675 {
7676 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
7677 if (mask == bits << shift)
7678 return size;
7679 }
7680 }
7681 return 0;
7682 }
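
/* Examples (derived from the loop above, not part of the original source):
   aarch64_uxt_size (1, 0x1fe) == 8, since 0x1fe == 0xff << 1, matching a
   UXTB-style operand; aarch64_uxt_size (2, 0x3fffc) == 16 (0xffff << 2,
   UXTH).  A mask that is not a full 8/16/32-bit field shifted by 0..3,
   e.g. aarch64_uxt_size (0, 0x7f), yields 0.  */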
7683
7684 /* Constant pools are per function only when PC relative
7685 literal loads are true or we are in the large memory
7686 model. */
7687
7688 static inline bool
7689 aarch64_can_use_per_function_literal_pools_p (void)
7690 {
7691 return (aarch64_pcrelative_literal_loads
7692 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
7693 }
7694
7695 static bool
7696 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
7697 {
7698 /* We can't use blocks for constants when we're using a per-function
7699 constant pool. */
7700 return !aarch64_can_use_per_function_literal_pools_p ();
7701 }
7702
7703 /* Select appropriate section for constants depending
7704 on where we place literal pools. */
7705
7706 static section *
7707 aarch64_select_rtx_section (machine_mode mode,
7708 rtx x,
7709 unsigned HOST_WIDE_INT align)
7710 {
7711 if (aarch64_can_use_per_function_literal_pools_p ())
7712 return function_section (current_function_decl);
7713
7714 return default_elf_select_rtx_section (mode, x, align);
7715 }
7716
7717 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
7718 void
7719 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
7720 HOST_WIDE_INT offset)
7721 {
7722 /* When using per-function literal pools, we must ensure that any code
7723 section is aligned to the minimal instruction length, lest we get
7724 errors from the assembler re "unaligned instructions". */
7725 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
7726 ASM_OUTPUT_ALIGN (f, 2);
7727 }
7728
7729 /* Costs. */
7730
7731 /* Helper function for rtx cost calculation. Strip a shift expression
7732 from X. Returns the inner operand if successful, or the original
7733 expression on failure. */
7734 static rtx
7735 aarch64_strip_shift (rtx x)
7736 {
7737 rtx op = x;
7738
7739 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7740 we can convert both to ROR during final output. */
7741 if ((GET_CODE (op) == ASHIFT
7742 || GET_CODE (op) == ASHIFTRT
7743 || GET_CODE (op) == LSHIFTRT
7744 || GET_CODE (op) == ROTATERT
7745 || GET_CODE (op) == ROTATE)
7746 && CONST_INT_P (XEXP (op, 1)))
7747 return XEXP (op, 0);
7748
7749 if (GET_CODE (op) == MULT
7750 && CONST_INT_P (XEXP (op, 1))
7751 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
7752 return XEXP (op, 0);
7753
7754 return x;
7755 }
7756
7757 /* Helper function for rtx cost calculation. Strip an extend
7758 expression from X. Returns the inner operand if successful, or the
7759 original expression on failure. We deal with a number of possible
7760 canonicalization variations here. If STRIP_SHIFT is true, then
7761 we can strip off a shift also. */
7762 static rtx
7763 aarch64_strip_extend (rtx x, bool strip_shift)
7764 {
7765 scalar_int_mode mode;
7766 rtx op = x;
7767
7768 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
7769 return op;
7770
7771 /* Zero and sign extraction of a widened value. */
7772 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
7773 && XEXP (op, 2) == const0_rtx
7774 && GET_CODE (XEXP (op, 0)) == MULT
7775 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
7776 XEXP (op, 1)))
7777 return XEXP (XEXP (op, 0), 0);
7778
7779 /* It can also be represented (for zero-extend) as an AND with an
7780 immediate. */
7781 if (GET_CODE (op) == AND
7782 && GET_CODE (XEXP (op, 0)) == MULT
7783 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
7784 && CONST_INT_P (XEXP (op, 1))
7785 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
7786 INTVAL (XEXP (op, 1))) != 0)
7787 return XEXP (XEXP (op, 0), 0);
7788
7789 /* Now handle extended register, as this may also have an optional
7790 left shift by 1..4. */
7791 if (strip_shift
7792 && GET_CODE (op) == ASHIFT
7793 && CONST_INT_P (XEXP (op, 1))
7794 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
7795 op = XEXP (op, 0);
7796
7797 if (GET_CODE (op) == ZERO_EXTEND
7798 || GET_CODE (op) == SIGN_EXTEND)
7799 op = XEXP (op, 0);
7800
7801 if (op != x)
7802 return op;
7803
7804 return x;
7805 }
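
/* Example (illustrative, not in the original source): with STRIP_SHIFT true,
   (ashift:DI (zero_extend:DI (reg:SI w1)) (const_int 2)) is stripped down to
   (reg:SI w1), mirroring operands that the hardware folds into an
   extended-register form such as "add x0, x2, w1, uxtw #2".  With
   STRIP_SHIFT false the same rtx is returned unchanged, since the shift is
   then costed separately.  */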
7806
7807 /* Return true iff CODE is a shift supported in combination
7808 with arithmetic instructions. */
7809
7810 static bool
7811 aarch64_shift_p (enum rtx_code code)
7812 {
7813 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
7814 }
7815
7816
7817 /* Return true iff X is a cheap shift without a sign extend. */
7818
7819 static bool
7820 aarch64_cheap_mult_shift_p (rtx x)
7821 {
7822 rtx op0, op1;
7823
7824 op0 = XEXP (x, 0);
7825 op1 = XEXP (x, 1);
7826
7827 if (!(aarch64_tune_params.extra_tuning_flags
7828 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
7829 return false;
7830
7831 if (GET_CODE (op0) == SIGN_EXTEND)
7832 return false;
7833
7834 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
7835 && UINTVAL (op1) <= 4)
7836 return true;
7837
7838 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
7839 return false;
7840
7841 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
7842
7843 if (l2 > 0 && l2 <= 4)
7844 return true;
7845
7846 return false;
7847 }
7848
7849 /* Helper function for rtx cost calculation. Calculate the cost of
7850 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7851 Return the calculated cost of the expression, recursing manually in to
7852 operands where needed. */
7853
7854 static int
7855 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
7856 {
7857 rtx op0, op1;
7858 const struct cpu_cost_table *extra_cost
7859 = aarch64_tune_params.insn_extra_cost;
7860 int cost = 0;
7861 bool compound_p = (outer == PLUS || outer == MINUS);
7862 machine_mode mode = GET_MODE (x);
7863
7864 gcc_checking_assert (code == MULT);
7865
7866 op0 = XEXP (x, 0);
7867 op1 = XEXP (x, 1);
7868
7869 if (VECTOR_MODE_P (mode))
7870 mode = GET_MODE_INNER (mode);
7871
7872 /* Integer multiply/fma. */
7873 if (GET_MODE_CLASS (mode) == MODE_INT)
7874 {
7875 /* The multiply will be canonicalized as a shift, cost it as such. */
7876 if (aarch64_shift_p (GET_CODE (x))
7877 || (CONST_INT_P (op1)
7878 && exact_log2 (INTVAL (op1)) > 0))
7879 {
7880 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
7881 || GET_CODE (op0) == SIGN_EXTEND;
7882 if (speed)
7883 {
7884 if (compound_p)
7885 {
7886 /* If the shift is considered cheap,
7887 then don't add any cost. */
7888 if (aarch64_cheap_mult_shift_p (x))
7889 ;
7890 else if (REG_P (op1))
7891 /* ARITH + shift-by-register. */
7892 cost += extra_cost->alu.arith_shift_reg;
7893 else if (is_extend)
7894 /* ARITH + extended register. We don't have a cost field
7895 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
7896 cost += extra_cost->alu.extend_arith;
7897 else
7898 /* ARITH + shift-by-immediate. */
7899 cost += extra_cost->alu.arith_shift;
7900 }
7901 else
7902 /* LSL (immediate). */
7903 cost += extra_cost->alu.shift;
7904
7905 }
7906 /* Strip extends as we will have costed them in the case above. */
7907 if (is_extend)
7908 op0 = aarch64_strip_extend (op0, true);
7909
7910 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
7911
7912 return cost;
7913 }
7914
7915 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
7916 compound and let the below cases handle it. After all, MNEG is a
7917 special-case alias of MSUB. */
7918 if (GET_CODE (op0) == NEG)
7919 {
7920 op0 = XEXP (op0, 0);
7921 compound_p = true;
7922 }
7923
7924 /* Integer multiplies or FMAs have zero/sign extending variants. */
7925 if ((GET_CODE (op0) == ZERO_EXTEND
7926 && GET_CODE (op1) == ZERO_EXTEND)
7927 || (GET_CODE (op0) == SIGN_EXTEND
7928 && GET_CODE (op1) == SIGN_EXTEND))
7929 {
7930 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
7931 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
7932
7933 if (speed)
7934 {
7935 if (compound_p)
7936 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
7937 cost += extra_cost->mult[0].extend_add;
7938 else
7939 /* MUL/SMULL/UMULL. */
7940 cost += extra_cost->mult[0].extend;
7941 }
7942
7943 return cost;
7944 }
7945
7946 /* This is either an integer multiply or a MADD. In both cases
7947 we want to recurse and cost the operands. */
7948 cost += rtx_cost (op0, mode, MULT, 0, speed);
7949 cost += rtx_cost (op1, mode, MULT, 1, speed);
7950
7951 if (speed)
7952 {
7953 if (compound_p)
7954 /* MADD/MSUB. */
7955 cost += extra_cost->mult[mode == DImode].add;
7956 else
7957 /* MUL. */
7958 cost += extra_cost->mult[mode == DImode].simple;
7959 }
7960
7961 return cost;
7962 }
7963 else
7964 {
7965 if (speed)
7966 {
7967 /* Floating-point FMA/FMUL can also support negations of the
7968 operands, unless the rounding mode is upward or downward, in
7969 which case FNMUL is different from FMUL with operand negation. */
7970 bool neg0 = GET_CODE (op0) == NEG;
7971 bool neg1 = GET_CODE (op1) == NEG;
7972 if (compound_p || !flag_rounding_math || (neg0 && neg1))
7973 {
7974 if (neg0)
7975 op0 = XEXP (op0, 0);
7976 if (neg1)
7977 op1 = XEXP (op1, 0);
7978 }
7979
7980 if (compound_p)
7981 /* FMADD/FNMADD/FNMSUB/FMSUB. */
7982 cost += extra_cost->fp[mode == DFmode].fma;
7983 else
7984 /* FMUL/FNMUL. */
7985 cost += extra_cost->fp[mode == DFmode].mult;
7986 }
7987
7988 cost += rtx_cost (op0, mode, MULT, 0, speed);
7989 cost += rtx_cost (op1, mode, MULT, 1, speed);
7990 return cost;
7991 }
7992 }
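
/* Cost sketch (illustrative, not in the original source): for
   (plus (mult:DI (reg x) (reg y)) (reg z)), the PLUS case below calls this
   function with OUTER == PLUS, so compound_p is true and, when optimizing
   for speed, the MADD path adds extra_cost->mult[1].add on top of the
   recursive operand costs.  A multiplication by a power of two such as
   (mult:DI (reg x) (const_int 8)) is instead costed as the canonical shift,
   i.e. extra_cost->alu.shift when it stands alone.  */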
7993
7994 static int
7995 aarch64_address_cost (rtx x,
7996 machine_mode mode,
7997 addr_space_t as ATTRIBUTE_UNUSED,
7998 bool speed)
7999 {
8000 enum rtx_code c = GET_CODE (x);
8001 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8002 struct aarch64_address_info info;
8003 int cost = 0;
8004 info.shift = 0;
8005
8006 if (!aarch64_classify_address (&info, x, mode, false))
8007 {
8008 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8009 {
8010 /* This is a CONST or SYMBOL ref which will be split
8011 in a different way depending on the code model in use.
8012 Cost it through the generic infrastructure. */
8013 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8014 /* Divide through by the cost of one instruction to
8015 bring it to the same units as the address costs. */
8016 cost_symbol_ref /= COSTS_N_INSNS (1);
8017 /* The cost is then the cost of preparing the address,
8018 followed by an immediate (possibly 0) offset. */
8019 return cost_symbol_ref + addr_cost->imm_offset;
8020 }
8021 else
8022 {
8023 /* This is most likely a jump table from a case
8024 statement. */
8025 return addr_cost->register_offset;
8026 }
8027 }
8028
8029 switch (info.type)
8030 {
8031 case ADDRESS_LO_SUM:
8032 case ADDRESS_SYMBOLIC:
8033 case ADDRESS_REG_IMM:
8034 cost += addr_cost->imm_offset;
8035 break;
8036
8037 case ADDRESS_REG_WB:
8038 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8039 cost += addr_cost->pre_modify;
8040 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8041 cost += addr_cost->post_modify;
8042 else
8043 gcc_unreachable ();
8044
8045 break;
8046
8047 case ADDRESS_REG_REG:
8048 cost += addr_cost->register_offset;
8049 break;
8050
8051 case ADDRESS_REG_SXTW:
8052 cost += addr_cost->register_sextend;
8053 break;
8054
8055 case ADDRESS_REG_UXTW:
8056 cost += addr_cost->register_zextend;
8057 break;
8058
8059 default:
8060 gcc_unreachable ();
8061 }
8062
8063
8064 if (info.shift > 0)
8065 {
8066 /* For the sake of calculating the cost of the shifted register
8067 component, we can treat same sized modes in the same way. */
8068 if (known_eq (GET_MODE_BITSIZE (mode), 16))
8069 cost += addr_cost->addr_scale_costs.hi;
8070 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
8071 cost += addr_cost->addr_scale_costs.si;
8072 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
8073 cost += addr_cost->addr_scale_costs.di;
8074 else
8075 /* We can't tell, or this is a 128-bit vector. */
8076 cost += addr_cost->addr_scale_costs.ti;
8077 }
8078
8079 return cost;
8080 }
8081
8082 /* Return the cost of a branch. If SPEED_P is true then the compiler is
8083 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
8084 to be taken. */
8085
8086 int
8087 aarch64_branch_cost (bool speed_p, bool predictable_p)
8088 {
8089 /* When optimizing for speed, use the cost of unpredictable branches. */
8090 const struct cpu_branch_cost *branch_costs =
8091 aarch64_tune_params.branch_costs;
8092
8093 if (!speed_p || predictable_p)
8094 return branch_costs->predictable;
8095 else
8096 return branch_costs->unpredictable;
8097 }
8098
8099 /* Return true if the RTX X in mode MODE is a zero or sign extract
8100 usable in an ADD or SUB (extended register) instruction. */
8101 static bool
8102 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8103 {
8104 /* Catch add with a sign extract.
8105 This is add_<optab><mode>_multp2. */
8106 if (GET_CODE (x) == SIGN_EXTRACT
8107 || GET_CODE (x) == ZERO_EXTRACT)
8108 {
8109 rtx op0 = XEXP (x, 0);
8110 rtx op1 = XEXP (x, 1);
8111 rtx op2 = XEXP (x, 2);
8112
8113 if (GET_CODE (op0) == MULT
8114 && CONST_INT_P (op1)
8115 && op2 == const0_rtx
8116 && CONST_INT_P (XEXP (op0, 1))
8117 && aarch64_is_extend_from_extract (mode,
8118 XEXP (op0, 1),
8119 op1))
8120 {
8121 return true;
8122 }
8123 }
8124 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8125 No shift. */
8126 else if (GET_CODE (x) == SIGN_EXTEND
8127 || GET_CODE (x) == ZERO_EXTEND)
8128 return REG_P (XEXP (x, 0));
8129
8130 return false;
8131 }
8132
8133 static bool
8134 aarch64_frint_unspec_p (unsigned int u)
8135 {
8136 switch (u)
8137 {
8138 case UNSPEC_FRINTZ:
8139 case UNSPEC_FRINTP:
8140 case UNSPEC_FRINTM:
8141 case UNSPEC_FRINTA:
8142 case UNSPEC_FRINTN:
8143 case UNSPEC_FRINTX:
8144 case UNSPEC_FRINTI:
8145 return true;
8146
8147 default:
8148 return false;
8149 }
8150 }
8151
8152 /* Return true iff X is an rtx that will match an extr instruction
8153 i.e. as described in the *extr<mode>5_insn family of patterns.
8154 OP0 and OP1 will be set to the operands of the shifts involved
8155 on success and will be NULL_RTX otherwise. */
8156
8157 static bool
8158 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
8159 {
8160 rtx op0, op1;
8161 scalar_int_mode mode;
8162 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
8163 return false;
8164
8165 *res_op0 = NULL_RTX;
8166 *res_op1 = NULL_RTX;
8167
8168 if (GET_CODE (x) != IOR)
8169 return false;
8170
8171 op0 = XEXP (x, 0);
8172 op1 = XEXP (x, 1);
8173
8174 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
8175 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
8176 {
8177 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
8178 if (GET_CODE (op1) == ASHIFT)
8179 std::swap (op0, op1);
8180
8181 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
8182 return false;
8183
8184 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
8185 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
8186
8187 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
8188 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
8189 {
8190 *res_op0 = XEXP (op0, 0);
8191 *res_op1 = XEXP (op1, 0);
8192 return true;
8193 }
8194 }
8195
8196 return false;
8197 }
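
/* Example (a sketch, not from the original source): for DImode
   x == (ior (ashift a 48) (lshiftrt b 16)), the shift amounts sum to 64,
   so *RES_OP0 = a and *RES_OP1 = b, and the pattern corresponds to
   something like "extr x0, xa, xb, #16", which computes
   (a << 48) | (b >> 16).  */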
8198
8199 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8200 storing it in *COST. Result is true if the total cost of the operation
8201 has now been calculated. */
8202 static bool
8203 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
8204 {
8205 rtx inner;
8206 rtx comparator;
8207 enum rtx_code cmpcode;
8208
8209 if (COMPARISON_P (op0))
8210 {
8211 inner = XEXP (op0, 0);
8212 comparator = XEXP (op0, 1);
8213 cmpcode = GET_CODE (op0);
8214 }
8215 else
8216 {
8217 inner = op0;
8218 comparator = const0_rtx;
8219 cmpcode = NE;
8220 }
8221
8222 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
8223 {
8224 /* Conditional branch. */
8225 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8226 return true;
8227 else
8228 {
8229 if (cmpcode == NE || cmpcode == EQ)
8230 {
8231 if (comparator == const0_rtx)
8232 {
8233 /* TBZ/TBNZ/CBZ/CBNZ. */
8234 if (GET_CODE (inner) == ZERO_EXTRACT)
8235 /* TBZ/TBNZ. */
8236 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
8237 ZERO_EXTRACT, 0, speed);
8238 else
8239 /* CBZ/CBNZ. */
8240 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
8241
8242 return true;
8243 }
8244 }
8245 else if (cmpcode == LT || cmpcode == GE)
8246 {
8247 /* TBZ/TBNZ. */
8248 if (comparator == const0_rtx)
8249 return true;
8250 }
8251 }
8252 }
8253 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8254 {
8255 /* CCMP. */
8256 if (GET_CODE (op1) == COMPARE)
8257 {
8258 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
8259 if (XEXP (op1, 1) == const0_rtx)
8260 *cost += 1;
8261 if (speed)
8262 {
8263 machine_mode mode = GET_MODE (XEXP (op1, 0));
8264 const struct cpu_cost_table *extra_cost
8265 = aarch64_tune_params.insn_extra_cost;
8266
8267 if (GET_MODE_CLASS (mode) == MODE_INT)
8268 *cost += extra_cost->alu.arith;
8269 else
8270 *cost += extra_cost->fp[mode == DFmode].compare;
8271 }
8272 return true;
8273 }
8274
8275 /* It's a conditional operation based on the status flags,
8276 so it must be some flavor of CSEL. */
8277
8278 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
8279 if (GET_CODE (op1) == NEG
8280 || GET_CODE (op1) == NOT
8281 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
8282 op1 = XEXP (op1, 0);
8283 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
8284 {
8285 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
8286 op1 = XEXP (op1, 0);
8287 op2 = XEXP (op2, 0);
8288 }
8289
8290 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
8291 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
8292 return true;
8293 }
8294
8295 /* We don't know what this is, cost all operands. */
8296 return false;
8297 }
8298
8299 /* Check whether X is a bitfield operation of the form shift + extend that
8300 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
8301 operand to which the bitfield operation is applied. Otherwise return
8302 NULL_RTX. */
8303
8304 static rtx
8305 aarch64_extend_bitfield_pattern_p (rtx x)
8306 {
8307 rtx_code outer_code = GET_CODE (x);
8308 machine_mode outer_mode = GET_MODE (x);
8309
8310 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
8311 && outer_mode != SImode && outer_mode != DImode)
8312 return NULL_RTX;
8313
8314 rtx inner = XEXP (x, 0);
8315 rtx_code inner_code = GET_CODE (inner);
8316 machine_mode inner_mode = GET_MODE (inner);
8317 rtx op = NULL_RTX;
8318
8319 switch (inner_code)
8320 {
8321 case ASHIFT:
8322 if (CONST_INT_P (XEXP (inner, 1))
8323 && (inner_mode == QImode || inner_mode == HImode))
8324 op = XEXP (inner, 0);
8325 break;
8326 case LSHIFTRT:
8327 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
8328 && (inner_mode == QImode || inner_mode == HImode))
8329 op = XEXP (inner, 0);
8330 break;
8331 case ASHIFTRT:
8332 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
8333 && (inner_mode == QImode || inner_mode == HImode))
8334 op = XEXP (inner, 0);
8335 break;
8336 default:
8337 break;
8338 }
8339
8340 return op;
8341 }
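
/* Examples (assumptions about the matching RTL shapes, not from the
   original source):

     (zero_extend:SI (lshiftrt:HI (reg r) (const_int 5)))  roughly -> UBFX, op = r
     (sign_extend:SI (ashiftrt:HI (reg r) (const_int 5)))  roughly -> SBFX, op = r
     (zero_extend:SI (ashift:QI (reg r) (const_int 3)))    roughly -> UBFIZ, op = r

   A shift of an SImode value, or one whose shift amount is not a CONST_INT,
   falls through and returns NULL_RTX.  */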
8342
8343 /* Return true if the mask and a shift amount from an RTX of the form
8344 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8345 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
8346
8347 bool
8348 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
8349 rtx shft_amnt)
8350 {
8351 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
8352 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
8353 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
8354 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
8355 }
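
/* Worked example (illustrative, not in the original source): for DImode
   with MASK == 0xff0 and SHFT_AMNT == 4, the shift is below 64,
   (0xff0 >> 4) + 1 == 0x100 is a power of two, and no mask bit lies below
   the shift amount, so the function returns true; (x << 4) & 0xff0 can then
   be emitted as a single "ubfiz xd, xn, #4, #8".  A mask such as 0xff4
   fails the last test because bit 2 lies below the shift amount.  */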
8356
8357 /* Calculate the cost of calculating X, storing it in *COST. Result
8358 is true if the total cost of the operation has now been calculated. */
8359 static bool
8360 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
8361 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
8362 {
8363 rtx op0, op1, op2;
8364 const struct cpu_cost_table *extra_cost
8365 = aarch64_tune_params.insn_extra_cost;
8366 int code = GET_CODE (x);
8367 scalar_int_mode int_mode;
8368
8369 /* By default, assume that everything has equivalent cost to the
8370 cheapest instruction. Any additional costs are applied as a delta
8371 above this default. */
8372 *cost = COSTS_N_INSNS (1);
8373
8374 switch (code)
8375 {
8376 case SET:
8377 /* The cost depends entirely on the operands to SET. */
8378 *cost = 0;
8379 op0 = SET_DEST (x);
8380 op1 = SET_SRC (x);
8381
8382 switch (GET_CODE (op0))
8383 {
8384 case MEM:
8385 if (speed)
8386 {
8387 rtx address = XEXP (op0, 0);
8388 if (VECTOR_MODE_P (mode))
8389 *cost += extra_cost->ldst.storev;
8390 else if (GET_MODE_CLASS (mode) == MODE_INT)
8391 *cost += extra_cost->ldst.store;
8392 else if (mode == SFmode)
8393 *cost += extra_cost->ldst.storef;
8394 else if (mode == DFmode)
8395 *cost += extra_cost->ldst.stored;
8396
8397 *cost +=
8398 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8399 0, speed));
8400 }
8401
8402 *cost += rtx_cost (op1, mode, SET, 1, speed);
8403 return true;
8404
8405 case SUBREG:
8406 if (! REG_P (SUBREG_REG (op0)))
8407 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
8408
8409 /* Fall through. */
8410 case REG:
8411 /* The cost is one per vector-register copied. */
8412 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
8413 {
8414 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
8415 *cost = COSTS_N_INSNS (nregs);
8416 }
8417 /* const0_rtx is in general free, but we will use an
8418 instruction to set a register to 0. */
8419 else if (REG_P (op1) || op1 == const0_rtx)
8420 {
8421 /* The cost is 1 per register copied. */
8422 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
8423 *cost = COSTS_N_INSNS (nregs);
8424 }
8425 else
8426 /* Cost is just the cost of the RHS of the set. */
8427 *cost += rtx_cost (op1, mode, SET, 1, speed);
8428 return true;
8429
8430 case ZERO_EXTRACT:
8431 case SIGN_EXTRACT:
8432 /* Bit-field insertion. Strip any redundant widening of
8433 the RHS to meet the width of the target. */
8434 if (GET_CODE (op1) == SUBREG)
8435 op1 = SUBREG_REG (op1);
8436 if ((GET_CODE (op1) == ZERO_EXTEND
8437 || GET_CODE (op1) == SIGN_EXTEND)
8438 && CONST_INT_P (XEXP (op0, 1))
8439 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
8440 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
8441 op1 = XEXP (op1, 0);
8442
8443 if (CONST_INT_P (op1))
8444 {
8445 /* MOV immediate is assumed to always be cheap. */
8446 *cost = COSTS_N_INSNS (1);
8447 }
8448 else
8449 {
8450 /* BFM. */
8451 if (speed)
8452 *cost += extra_cost->alu.bfi;
8453 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
8454 }
8455
8456 return true;
8457
8458 default:
8459 /* We can't make sense of this, assume default cost. */
8460 *cost = COSTS_N_INSNS (1);
8461 return false;
8462 }
8463 return false;
8464
8465 case CONST_INT:
8466 /* If an instruction can incorporate a constant within the
8467 instruction, the instruction's expression avoids calling
8468 rtx_cost() on the constant. If rtx_cost() is called on a
8469 constant, then it is usually because the constant must be
8470 moved into a register by one or more instructions.
8471
8472 The exception is constant 0, which can be expressed
8473 as XZR/WZR and is therefore free. The exception to this is
8474 if we have (set (reg) (const0_rtx)) in which case we must cost
8475 the move. However, we can catch that when we cost the SET, so
8476 we don't need to consider that here. */
8477 if (x == const0_rtx)
8478 *cost = 0;
8479 else
8480 {
8481 /* To an approximation, building any other constant is
8482 proportionally expensive to the number of instructions
8483 required to build that constant. This is true whether we
8484 are compiling for SPEED or otherwise. */
8485 if (!is_a <scalar_int_mode> (mode, &int_mode))
8486 int_mode = word_mode;
8487 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
8488 (NULL_RTX, x, false, int_mode));
8489 }
8490 return true;
8491
8492 case CONST_DOUBLE:
8493
8494 /* First determine number of instructions to do the move
8495 as an integer constant. */
8496 if (!aarch64_float_const_representable_p (x)
8497 && !aarch64_can_const_movi_rtx_p (x, mode)
8498 && aarch64_float_const_rtx_p (x))
8499 {
8500 unsigned HOST_WIDE_INT ival;
8501 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
8502 gcc_assert (succeed);
8503
8504 scalar_int_mode imode = (mode == HFmode
8505 ? SImode
8506 : int_mode_for_mode (mode).require ());
8507 int ncost = aarch64_internal_mov_immediate
8508 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8509 *cost += COSTS_N_INSNS (ncost);
8510 return true;
8511 }
8512
8513 if (speed)
8514 {
8515 /* mov[df,sf]_aarch64. */
8516 if (aarch64_float_const_representable_p (x))
8517 /* FMOV (scalar immediate). */
8518 *cost += extra_cost->fp[mode == DFmode].fpconst;
8519 else if (!aarch64_float_const_zero_rtx_p (x))
8520 {
8521 /* This will be a load from memory. */
8522 if (mode == DFmode)
8523 *cost += extra_cost->ldst.loadd;
8524 else
8525 *cost += extra_cost->ldst.loadf;
8526 }
8527 else
8528 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8529 or MOV v0.s[0], wzr - neither of which is modeled by the
8530 cost tables. Just use the default cost. */
8531 {
8532 }
8533 }
8534
8535 return true;
8536
8537 case MEM:
8538 if (speed)
8539 {
8540 /* For loads we want the base cost of a load, plus an
8541 approximation for the additional cost of the addressing
8542 mode. */
8543 rtx address = XEXP (x, 0);
8544 if (VECTOR_MODE_P (mode))
8545 *cost += extra_cost->ldst.loadv;
8546 else if (GET_MODE_CLASS (mode) == MODE_INT)
8547 *cost += extra_cost->ldst.load;
8548 else if (mode == SFmode)
8549 *cost += extra_cost->ldst.loadf;
8550 else if (mode == DFmode)
8551 *cost += extra_cost->ldst.loadd;
8552
8553 *cost +=
8554 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8555 0, speed));
8556 }
8557
8558 return true;
8559
8560 case NEG:
8561 op0 = XEXP (x, 0);
8562
8563 if (VECTOR_MODE_P (mode))
8564 {
8565 if (speed)
8566 {
8567 /* FNEG. */
8568 *cost += extra_cost->vect.alu;
8569 }
8570 return false;
8571 }
8572
8573 if (GET_MODE_CLASS (mode) == MODE_INT)
8574 {
8575 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8576 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8577 {
8578 /* CSETM. */
8579 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
8580 return true;
8581 }
8582
8583 /* Cost this as SUB wzr, X. */
8584 op0 = CONST0_RTX (mode);
8585 op1 = XEXP (x, 0);
8586 goto cost_minus;
8587 }
8588
8589 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8590 {
8591 /* Support (neg(fma...)) as a single instruction only if
8592 sign of zeros is unimportant. This matches the decision
8593 making in aarch64.md. */
8594 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
8595 {
8596 /* FNMADD. */
8597 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8598 return true;
8599 }
8600 if (GET_CODE (op0) == MULT)
8601 {
8602 /* FNMUL. */
8603 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8604 return true;
8605 }
8606 if (speed)
8607 /* FNEG. */
8608 *cost += extra_cost->fp[mode == DFmode].neg;
8609 return false;
8610 }
8611
8612 return false;
8613
8614 case CLRSB:
8615 case CLZ:
8616 if (speed)
8617 {
8618 if (VECTOR_MODE_P (mode))
8619 *cost += extra_cost->vect.alu;
8620 else
8621 *cost += extra_cost->alu.clz;
8622 }
8623
8624 return false;
8625
8626 case COMPARE:
8627 op0 = XEXP (x, 0);
8628 op1 = XEXP (x, 1);
8629
8630 if (op1 == const0_rtx
8631 && GET_CODE (op0) == AND)
8632 {
8633 x = op0;
8634 mode = GET_MODE (op0);
8635 goto cost_logic;
8636 }
8637
8638 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
8639 {
8640 /* TODO: A write to the CC flags possibly costs extra; this
8641 needs encoding in the cost tables. */
8642
8643 mode = GET_MODE (op0);
8644 /* ANDS. */
8645 if (GET_CODE (op0) == AND)
8646 {
8647 x = op0;
8648 goto cost_logic;
8649 }
8650
8651 if (GET_CODE (op0) == PLUS)
8652 {
8653 /* ADDS (and CMN alias). */
8654 x = op0;
8655 goto cost_plus;
8656 }
8657
8658 if (GET_CODE (op0) == MINUS)
8659 {
8660 /* SUBS. */
8661 x = op0;
8662 goto cost_minus;
8663 }
8664
8665 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
8666 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
8667 && CONST_INT_P (XEXP (op0, 2)))
8668 {
8669 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8670 Handle it here directly rather than going to cost_logic
8671 since we know the immediate generated for the TST is valid
8672 so we can avoid creating an intermediate rtx for it only
8673 for costing purposes. */
8674 if (speed)
8675 *cost += extra_cost->alu.logical;
8676
8677 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
8678 ZERO_EXTRACT, 0, speed);
8679 return true;
8680 }
8681
8682 if (GET_CODE (op1) == NEG)
8683 {
8684 /* CMN. */
8685 if (speed)
8686 *cost += extra_cost->alu.arith;
8687
8688 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
8689 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
8690 return true;
8691 }
8692
8693 /* CMP.
8694
8695 Compare can freely swap the order of operands, and
8696 canonicalization puts the more complex operation first.
8697 But the integer MINUS logic expects the shift/extend
8698 operation in op1. */
8699 if (! (REG_P (op0)
8700 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
8701 {
8702 op0 = XEXP (x, 1);
8703 op1 = XEXP (x, 0);
8704 }
8705 goto cost_minus;
8706 }
8707
8708 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
8709 {
8710 /* FCMP. */
8711 if (speed)
8712 *cost += extra_cost->fp[mode == DFmode].compare;
8713
8714 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
8715 {
8716 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
8717 /* FCMP supports constant 0.0 for no extra cost. */
8718 return true;
8719 }
8720 return false;
8721 }
8722
8723 if (VECTOR_MODE_P (mode))
8724 {
8725 /* Vector compare. */
8726 if (speed)
8727 *cost += extra_cost->vect.alu;
8728
8729 if (aarch64_float_const_zero_rtx_p (op1))
8730 {
8731 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8732 cost. */
8733 return true;
8734 }
8735 return false;
8736 }
8737 return false;
8738
8739 case MINUS:
8740 {
8741 op0 = XEXP (x, 0);
8742 op1 = XEXP (x, 1);
8743
8744 cost_minus:
8745 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
8746
8747 /* Detect valid immediates. */
8748 if ((GET_MODE_CLASS (mode) == MODE_INT
8749 || (GET_MODE_CLASS (mode) == MODE_CC
8750 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
8751 && CONST_INT_P (op1)
8752 && aarch64_uimm12_shift (INTVAL (op1)))
8753 {
8754 if (speed)
8755 /* SUB(S) (immediate). */
8756 *cost += extra_cost->alu.arith;
8757 return true;
8758 }
8759
8760 /* Look for SUB (extended register). */
8761 if (is_a <scalar_int_mode> (mode, &int_mode)
8762 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
8763 {
8764 if (speed)
8765 *cost += extra_cost->alu.extend_arith;
8766
8767 op1 = aarch64_strip_extend (op1, true);
8768 *cost += rtx_cost (op1, VOIDmode,
8769 (enum rtx_code) GET_CODE (op1), 0, speed);
8770 return true;
8771 }
8772
8773 rtx new_op1 = aarch64_strip_extend (op1, false);
8774
8775 /* Cost this as an FMA-alike operation. */
8776 if ((GET_CODE (new_op1) == MULT
8777 || aarch64_shift_p (GET_CODE (new_op1)))
8778 && code != COMPARE)
8779 {
8780 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
8781 (enum rtx_code) code,
8782 speed);
8783 return true;
8784 }
8785
8786 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
8787
8788 if (speed)
8789 {
8790 if (VECTOR_MODE_P (mode))
8791 {
8792 /* Vector SUB. */
8793 *cost += extra_cost->vect.alu;
8794 }
8795 else if (GET_MODE_CLASS (mode) == MODE_INT)
8796 {
8797 /* SUB(S). */
8798 *cost += extra_cost->alu.arith;
8799 }
8800 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8801 {
8802 /* FSUB. */
8803 *cost += extra_cost->fp[mode == DFmode].addsub;
8804 }
8805 }
8806 return true;
8807 }
8808
8809 case PLUS:
8810 {
8811 rtx new_op0;
8812
8813 op0 = XEXP (x, 0);
8814 op1 = XEXP (x, 1);
8815
8816 cost_plus:
8817 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8818 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8819 {
8820 /* CSINC. */
8821 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
8822 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8823 return true;
8824 }
8825
8826 if (GET_MODE_CLASS (mode) == MODE_INT
8827 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
8828 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
8829 {
8830 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
8831
8832 if (speed)
8833 /* ADD (immediate). */
8834 *cost += extra_cost->alu.arith;
8835 return true;
8836 }
8837
8838 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8839
8840 /* Look for ADD (extended register). */
8841 if (is_a <scalar_int_mode> (mode, &int_mode)
8842 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
8843 {
8844 if (speed)
8845 *cost += extra_cost->alu.extend_arith;
8846
8847 op0 = aarch64_strip_extend (op0, true);
8848 *cost += rtx_cost (op0, VOIDmode,
8849 (enum rtx_code) GET_CODE (op0), 0, speed);
8850 return true;
8851 }
8852
8853 /* Strip any extend, leave shifts behind as we will
8854 cost them through mult_cost. */
8855 new_op0 = aarch64_strip_extend (op0, false);
8856
8857 if (GET_CODE (new_op0) == MULT
8858 || aarch64_shift_p (GET_CODE (new_op0)))
8859 {
8860 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
8861 speed);
8862 return true;
8863 }
8864
8865 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
8866
8867 if (speed)
8868 {
8869 if (VECTOR_MODE_P (mode))
8870 {
8871 /* Vector ADD. */
8872 *cost += extra_cost->vect.alu;
8873 }
8874 else if (GET_MODE_CLASS (mode) == MODE_INT)
8875 {
8876 /* ADD. */
8877 *cost += extra_cost->alu.arith;
8878 }
8879 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8880 {
8881 /* FADD. */
8882 *cost += extra_cost->fp[mode == DFmode].addsub;
8883 }
8884 }
8885 return true;
8886 }
8887
8888 case BSWAP:
8889 *cost = COSTS_N_INSNS (1);
8890
8891 if (speed)
8892 {
8893 if (VECTOR_MODE_P (mode))
8894 *cost += extra_cost->vect.alu;
8895 else
8896 *cost += extra_cost->alu.rev;
8897 }
8898 return false;
8899
8900 case IOR:
8901 if (aarch_rev16_p (x))
8902 {
8903 *cost = COSTS_N_INSNS (1);
8904
8905 if (speed)
8906 {
8907 if (VECTOR_MODE_P (mode))
8908 *cost += extra_cost->vect.alu;
8909 else
8910 *cost += extra_cost->alu.rev;
8911 }
8912 return true;
8913 }
8914
8915 if (aarch64_extr_rtx_p (x, &op0, &op1))
8916 {
8917 *cost += rtx_cost (op0, mode, IOR, 0, speed);
8918 *cost += rtx_cost (op1, mode, IOR, 1, speed);
8919 if (speed)
8920 *cost += extra_cost->alu.shift;
8921
8922 return true;
8923 }
8924 /* Fall through. */
8925 case XOR:
8926 case AND:
8927 cost_logic:
8928 op0 = XEXP (x, 0);
8929 op1 = XEXP (x, 1);
8930
8931 if (VECTOR_MODE_P (mode))
8932 {
8933 if (speed)
8934 *cost += extra_cost->vect.alu;
8935 return true;
8936 }
8937
8938 if (code == AND
8939 && GET_CODE (op0) == MULT
8940 && CONST_INT_P (XEXP (op0, 1))
8941 && CONST_INT_P (op1)
8942 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
8943 INTVAL (op1)) != 0)
8944 {
8945 /* This is a UBFM/SBFM. */
8946 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
8947 if (speed)
8948 *cost += extra_cost->alu.bfx;
8949 return true;
8950 }
8951
8952 if (is_int_mode (mode, &int_mode))
8953 {
8954 if (CONST_INT_P (op1))
8955 {
8956 /* We have a mask + shift version of a UBFIZ
8957 i.e. the *andim_ashift<mode>_bfiz pattern. */
8958 if (GET_CODE (op0) == ASHIFT
8959 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
8960 XEXP (op0, 1)))
8961 {
8962 *cost += rtx_cost (XEXP (op0, 0), int_mode,
8963 (enum rtx_code) code, 0, speed);
8964 if (speed)
8965 *cost += extra_cost->alu.bfx;
8966
8967 return true;
8968 }
8969 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
8970 {
8971 /* We possibly get the immediate for free; this is not
8972 modelled. */
8973 *cost += rtx_cost (op0, int_mode,
8974 (enum rtx_code) code, 0, speed);
8975 if (speed)
8976 *cost += extra_cost->alu.logical;
8977
8978 return true;
8979 }
8980 }
8981 else
8982 {
8983 rtx new_op0 = op0;
8984
8985 /* Handle ORN, EON, or BIC. */
8986 if (GET_CODE (op0) == NOT)
8987 op0 = XEXP (op0, 0);
8988
8989 new_op0 = aarch64_strip_shift (op0);
8990
8991 /* If we had a shift on op0 then this is a logical-shift-
8992 by-register/immediate operation. Otherwise, this is just
8993 a logical operation. */
8994 if (speed)
8995 {
8996 if (new_op0 != op0)
8997 {
8998 /* Shift by immediate. */
8999 if (CONST_INT_P (XEXP (op0, 1)))
9000 *cost += extra_cost->alu.log_shift;
9001 else
9002 *cost += extra_cost->alu.log_shift_reg;
9003 }
9004 else
9005 *cost += extra_cost->alu.logical;
9006 }
9007
9008 /* In both cases we want to cost both operands. */
9009 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9010 0, speed);
9011 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9012 1, speed);
9013
9014 return true;
9015 }
9016 }
9017 return false;
9018
9019 case NOT:
9020 x = XEXP (x, 0);
9021 op0 = aarch64_strip_shift (x);
9022
9023 if (VECTOR_MODE_P (mode))
9024 {
9025 /* Vector NOT. */
9026 *cost += extra_cost->vect.alu;
9027 return false;
9028 }
9029
9030 /* MVN-shifted-reg. */
9031 if (op0 != x)
9032 {
9033 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9034
9035 if (speed)
9036 *cost += extra_cost->alu.log_shift;
9037
9038 return true;
9039 }
9040 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9041 Handle the second form here taking care that 'a' in the above can
9042 be a shift. */
9043 else if (GET_CODE (op0) == XOR)
9044 {
9045 rtx newop0 = XEXP (op0, 0);
9046 rtx newop1 = XEXP (op0, 1);
9047 rtx op0_stripped = aarch64_strip_shift (newop0);
9048
9049 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9050 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9051
9052 if (speed)
9053 {
9054 if (op0_stripped != newop0)
9055 *cost += extra_cost->alu.log_shift;
9056 else
9057 *cost += extra_cost->alu.logical;
9058 }
9059
9060 return true;
9061 }
9062 /* MVN. */
9063 if (speed)
9064 *cost += extra_cost->alu.logical;
9065
9066 return false;
9067
9068 case ZERO_EXTEND:
9069
9070 op0 = XEXP (x, 0);
9071 /* If a value is written in SI mode, then zero extended to DI
9072 mode, the operation will in general be free as a write to
9073 a 'w' register implicitly zeroes the upper bits of an 'x'
9074 register. However, if this is
9075
9076 (set (reg) (zero_extend (reg)))
9077
9078 we must cost the explicit register move. */
9079 if (mode == DImode
9080 && GET_MODE (op0) == SImode
9081 && outer == SET)
9082 {
9083 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
9084
9085 /* If OP_COST is non-zero, then the cost of the zero extend
9086 is effectively the cost of the inner operation. Otherwise
9087 we have a MOV instruction and we take the cost from the MOV
9088 itself. This is true independently of whether we are
9089 optimizing for space or time. */
9090 if (op_cost)
9091 *cost = op_cost;
9092
9093 return true;
9094 }
9095 else if (MEM_P (op0))
9096 {
9097 /* All loads can zero extend to any size for free. */
9098 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9099 return true;
9100 }
9101
9102 op0 = aarch64_extend_bitfield_pattern_p (x);
9103 if (op0)
9104 {
9105 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9106 if (speed)
9107 *cost += extra_cost->alu.bfx;
9108 return true;
9109 }
9110
9111 if (speed)
9112 {
9113 if (VECTOR_MODE_P (mode))
9114 {
9115 /* UMOV. */
9116 *cost += extra_cost->vect.alu;
9117 }
9118 else
9119 {
9120 /* We generate an AND instead of UXTB/UXTH. */
9121 *cost += extra_cost->alu.logical;
9122 }
9123 }
9124 return false;
9125
9126 case SIGN_EXTEND:
9127 if (MEM_P (XEXP (x, 0)))
9128 {
9129 /* LDRSH. */
9130 if (speed)
9131 {
9132 rtx address = XEXP (XEXP (x, 0), 0);
9133 *cost += extra_cost->ldst.load_sign_extend;
9134
9135 *cost +=
9136 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9137 0, speed));
9138 }
9139 return true;
9140 }
9141
9142 op0 = aarch64_extend_bitfield_pattern_p (x);
9143 if (op0)
9144 {
9145 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
9146 if (speed)
9147 *cost += extra_cost->alu.bfx;
9148 return true;
9149 }
9150
9151 if (speed)
9152 {
9153 if (VECTOR_MODE_P (mode))
9154 *cost += extra_cost->vect.alu;
9155 else
9156 *cost += extra_cost->alu.extend;
9157 }
9158 return false;
9159
9160 case ASHIFT:
9161 op0 = XEXP (x, 0);
9162 op1 = XEXP (x, 1);
9163
9164 if (CONST_INT_P (op1))
9165 {
9166 if (speed)
9167 {
9168 if (VECTOR_MODE_P (mode))
9169 {
9170 /* Vector shift (immediate). */
9171 *cost += extra_cost->vect.alu;
9172 }
9173 else
9174 {
9175                 /* LSL (immediate), UBFM, UBFIZ and friends.  These are all
9176                    aliases of UBFM.  */
9177 *cost += extra_cost->alu.shift;
9178 }
9179 }
9180
9181 /* We can incorporate zero/sign extend for free. */
9182 if (GET_CODE (op0) == ZERO_EXTEND
9183 || GET_CODE (op0) == SIGN_EXTEND)
9184 op0 = XEXP (op0, 0);
9185
9186 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
9187 return true;
9188 }
9189 else
9190 {
9191 if (VECTOR_MODE_P (mode))
9192 {
9193 if (speed)
9194 /* Vector shift (register). */
9195 *cost += extra_cost->vect.alu;
9196 }
9197 else
9198 {
9199 if (speed)
9200 /* LSLV. */
9201 *cost += extra_cost->alu.shift_reg;
9202
9203 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9204 && CONST_INT_P (XEXP (op1, 1))
9205 && known_eq (INTVAL (XEXP (op1, 1)),
9206 GET_MODE_BITSIZE (mode) - 1))
9207 {
9208 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9209 /* We already demanded XEXP (op1, 0) to be REG_P, so
9210 don't recurse into it. */
9211 return true;
9212 }
9213 }
9214 return false; /* All arguments need to be in registers. */
9215 }
9216
9217 case ROTATE:
9218 case ROTATERT:
9219 case LSHIFTRT:
9220 case ASHIFTRT:
9221 op0 = XEXP (x, 0);
9222 op1 = XEXP (x, 1);
9223
9224 if (CONST_INT_P (op1))
9225 {
9226 /* ASR (immediate) and friends. */
9227 if (speed)
9228 {
9229 if (VECTOR_MODE_P (mode))
9230 *cost += extra_cost->vect.alu;
9231 else
9232 *cost += extra_cost->alu.shift;
9233 }
9234
9235 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9236 return true;
9237 }
9238 else
9239 {
9240 if (VECTOR_MODE_P (mode))
9241 {
9242 if (speed)
9243 /* Vector shift (register). */
9244 *cost += extra_cost->vect.alu;
9245 }
9246 else
9247 {
9248 if (speed)
9249 /* ASR (register) and friends. */
9250 *cost += extra_cost->alu.shift_reg;
9251
9252 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9253 && CONST_INT_P (XEXP (op1, 1))
9254 && known_eq (INTVAL (XEXP (op1, 1)),
9255 GET_MODE_BITSIZE (mode) - 1))
9256 {
9257 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9258 /* We already demanded XEXP (op1, 0) to be REG_P, so
9259 don't recurse into it. */
9260 return true;
9261 }
9262 }
9263 return false; /* All arguments need to be in registers. */
9264 }
9265
9266 case SYMBOL_REF:
9267
9268 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
9269 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
9270 {
9271 /* LDR. */
9272 if (speed)
9273 *cost += extra_cost->ldst.load;
9274 }
9275 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
9276 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
9277 {
9278 /* ADRP, followed by ADD. */
9279 *cost += COSTS_N_INSNS (1);
9280 if (speed)
9281 *cost += 2 * extra_cost->alu.arith;
9282 }
9283 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
9284 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9285 {
9286 /* ADR. */
9287 if (speed)
9288 *cost += extra_cost->alu.arith;
9289 }
9290
9291 if (flag_pic)
9292 {
9293 /* One extra load instruction, after accessing the GOT. */
9294 *cost += COSTS_N_INSNS (1);
9295 if (speed)
9296 *cost += extra_cost->ldst.load;
9297 }
9298 return true;
9299
9300 case HIGH:
9301 case LO_SUM:
9302 /* ADRP/ADD (immediate). */
9303 if (speed)
9304 *cost += extra_cost->alu.arith;
9305 return true;
9306
9307 case ZERO_EXTRACT:
9308 case SIGN_EXTRACT:
9309 /* UBFX/SBFX. */
9310 if (speed)
9311 {
9312 if (VECTOR_MODE_P (mode))
9313 *cost += extra_cost->vect.alu;
9314 else
9315 *cost += extra_cost->alu.bfx;
9316 }
9317
9318 /* We can trust that the immediates used will be correct (there
9319 are no by-register forms), so we need only cost op0. */
9320 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
9321 return true;
9322
9323 case MULT:
9324 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
9325 /* aarch64_rtx_mult_cost always handles recursion to its
9326 operands. */
9327 return true;
9328
9329 case MOD:
9330 /* We can expand signed mod by power of 2 using a NEGS, two parallel
9331 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
9332 an unconditional negate. This case should only ever be reached through
9333 the set_smod_pow2_cheap check in expmed.c. */
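      /* Purely as an illustration of the expansion described above (a sketch,
	 not something this cost function emits): for SImode "r = x % 8" the
	 expected sequence is roughly

	    negs  w1, w0              // w1 = -x, setting the flags
	    and   w0, w0, #7          // x & 7
	    and   w1, w1, #7          // (-x) & 7
	    csneg w0, w0, w1, mi      // x & 7 if x > 0, else -((-x) & 7)

	 i.e. four instructions, matching the COSTS_N_INSNS (4) baseline set
	 below.  */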
9334 if (CONST_INT_P (XEXP (x, 1))
9335 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
9336 && (mode == SImode || mode == DImode))
9337 {
9338 /* We expand to 4 instructions. Reset the baseline. */
9339 *cost = COSTS_N_INSNS (4);
9340
9341 if (speed)
9342 *cost += 2 * extra_cost->alu.logical
9343 + 2 * extra_cost->alu.arith;
9344
9345 return true;
9346 }
9347
9348 /* Fall-through. */
9349 case UMOD:
9350 if (speed)
9351 {
9352 /* Slightly prefer UMOD over SMOD. */
9353 if (VECTOR_MODE_P (mode))
9354 *cost += extra_cost->vect.alu;
9355 else if (GET_MODE_CLASS (mode) == MODE_INT)
9356 *cost += (extra_cost->mult[mode == DImode].add
9357 + extra_cost->mult[mode == DImode].idiv
9358 + (code == MOD ? 1 : 0));
9359 }
9360 return false; /* All arguments need to be in registers. */
9361
9362 case DIV:
9363 case UDIV:
9364 case SQRT:
9365 if (speed)
9366 {
9367 if (VECTOR_MODE_P (mode))
9368 *cost += extra_cost->vect.alu;
9369 else if (GET_MODE_CLASS (mode) == MODE_INT)
9370 /* There is no integer SQRT, so only DIV and UDIV can get
9371 here. */
9372 *cost += (extra_cost->mult[mode == DImode].idiv
9373 /* Slightly prefer UDIV over SDIV. */
9374 + (code == DIV ? 1 : 0));
9375 else
9376 *cost += extra_cost->fp[mode == DFmode].div;
9377 }
9378 return false; /* All arguments need to be in registers. */
9379
9380 case IF_THEN_ELSE:
9381 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
9382 XEXP (x, 2), cost, speed);
9383
9384 case EQ:
9385 case NE:
9386 case GT:
9387 case GTU:
9388 case LT:
9389 case LTU:
9390 case GE:
9391 case GEU:
9392 case LE:
9393 case LEU:
9394
9395 return false; /* All arguments must be in registers. */
9396
9397 case FMA:
9398 op0 = XEXP (x, 0);
9399 op1 = XEXP (x, 1);
9400 op2 = XEXP (x, 2);
9401
9402 if (speed)
9403 {
9404 if (VECTOR_MODE_P (mode))
9405 *cost += extra_cost->vect.alu;
9406 else
9407 *cost += extra_cost->fp[mode == DFmode].fma;
9408 }
9409
9410 /* FMSUB, FNMADD, and FNMSUB are free. */
9411 if (GET_CODE (op0) == NEG)
9412 op0 = XEXP (op0, 0);
9413
9414 if (GET_CODE (op2) == NEG)
9415 op2 = XEXP (op2, 0);
9416
9417 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9418 and the by-element operand as operand 0. */
9419 if (GET_CODE (op1) == NEG)
9420 op1 = XEXP (op1, 0);
9421
9422 /* Catch vector-by-element operations. The by-element operand can
9423 either be (vec_duplicate (vec_select (x))) or just
9424 (vec_select (x)), depending on whether we are multiplying by
9425 a vector or a scalar.
9426
9427 Canonicalization is not very good in these cases: FMA4 will put the
9428 by-element operand as operand 0, FNMA4 will have it as operand 1. */
9429 if (GET_CODE (op0) == VEC_DUPLICATE)
9430 op0 = XEXP (op0, 0);
9431 else if (GET_CODE (op1) == VEC_DUPLICATE)
9432 op1 = XEXP (op1, 0);
9433
9434 if (GET_CODE (op0) == VEC_SELECT)
9435 op0 = XEXP (op0, 0);
9436 else if (GET_CODE (op1) == VEC_SELECT)
9437 op1 = XEXP (op1, 0);
9438
9439 /* If the remaining parameters are not registers,
9440 get the cost to put them into registers. */
9441 *cost += rtx_cost (op0, mode, FMA, 0, speed);
9442 *cost += rtx_cost (op1, mode, FMA, 1, speed);
9443 *cost += rtx_cost (op2, mode, FMA, 2, speed);
9444 return true;
9445
9446 case FLOAT:
9447 case UNSIGNED_FLOAT:
9448 if (speed)
9449 *cost += extra_cost->fp[mode == DFmode].fromint;
9450 return false;
9451
9452 case FLOAT_EXTEND:
9453 if (speed)
9454 {
9455 if (VECTOR_MODE_P (mode))
9456 {
9457 /* Vector widening conversion. */
9458 *cost += extra_cost->vect.alu;
9459 }
9460 else
9461 *cost += extra_cost->fp[mode == DFmode].widen;
9462 }
9463 return false;
9464
9465 case FLOAT_TRUNCATE:
9466 if (speed)
9467 {
9468 if (VECTOR_MODE_P (mode))
9469 {
9470 /* Vector narrowing conversion. */
9471 *cost += extra_cost->vect.alu;
9472 }
9473 else
9474 *cost += extra_cost->fp[mode == DFmode].narrow;
9475 }
9476 return false;
9477
9478 case FIX:
9479 case UNSIGNED_FIX:
9480 x = XEXP (x, 0);
9481 /* Strip the rounding part. They will all be implemented
9482 by the fcvt* family of instructions anyway. */
9483 if (GET_CODE (x) == UNSPEC)
9484 {
9485 unsigned int uns_code = XINT (x, 1);
9486
9487 if (uns_code == UNSPEC_FRINTA
9488 || uns_code == UNSPEC_FRINTM
9489 || uns_code == UNSPEC_FRINTN
9490 || uns_code == UNSPEC_FRINTP
9491 || uns_code == UNSPEC_FRINTZ)
9492 x = XVECEXP (x, 0, 0);
9493 }
9494
9495 if (speed)
9496 {
9497 if (VECTOR_MODE_P (mode))
9498 *cost += extra_cost->vect.alu;
9499 else
9500 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
9501 }
9502
9503 /* We can combine fmul by a power of 2 followed by a fcvt into a single
9504 fixed-point fcvt. */
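	/* For illustration only (whether the combination is actually performed
	   is decided by the matching md patterns, not here): a conversion such
	   as "(int) (f * 16.0f)" can become a single "fcvtzs w0, s0, #4",
	   since scaling by 2^4 before the convert is precisely the fixed-point
	   form of FCVTZS.  */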
9505 if (GET_CODE (x) == MULT
9506 && ((VECTOR_MODE_P (mode)
9507 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
9508 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
9509 {
9510 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
9511 0, speed);
9512 return true;
9513 }
9514
9515 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
9516 return true;
9517
9518 case ABS:
9519 if (VECTOR_MODE_P (mode))
9520 {
9521 /* ABS (vector). */
9522 if (speed)
9523 *cost += extra_cost->vect.alu;
9524 }
9525 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9526 {
9527 op0 = XEXP (x, 0);
9528
9529 /* FABD, which is analogous to FADD. */
9530 if (GET_CODE (op0) == MINUS)
9531 {
9532 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
9533 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
9534 if (speed)
9535 *cost += extra_cost->fp[mode == DFmode].addsub;
9536
9537 return true;
9538 }
9539 /* Simple FABS is analogous to FNEG. */
9540 if (speed)
9541 *cost += extra_cost->fp[mode == DFmode].neg;
9542 }
9543 else
9544 {
9545 /* Integer ABS will either be split to
9546 two arithmetic instructions, or will be an ABS
9547 (scalar), which we don't model. */
9548 *cost = COSTS_N_INSNS (2);
9549 if (speed)
9550 *cost += 2 * extra_cost->alu.arith;
9551 }
9552 return false;
9553
9554 case SMAX:
9555 case SMIN:
9556 if (speed)
9557 {
9558 if (VECTOR_MODE_P (mode))
9559 *cost += extra_cost->vect.alu;
9560 else
9561 {
9562 /* FMAXNM/FMINNM/FMAX/FMIN.
9563 TODO: This may not be accurate for all implementations, but
9564 we do not model this in the cost tables. */
9565 *cost += extra_cost->fp[mode == DFmode].addsub;
9566 }
9567 }
9568 return false;
9569
9570 case UNSPEC:
9571 /* The floating point round to integer frint* instructions. */
9572 if (aarch64_frint_unspec_p (XINT (x, 1)))
9573 {
9574 if (speed)
9575 *cost += extra_cost->fp[mode == DFmode].roundint;
9576
9577 return false;
9578 }
9579
9580 if (XINT (x, 1) == UNSPEC_RBIT)
9581 {
9582 if (speed)
9583 *cost += extra_cost->alu.rev;
9584
9585 return false;
9586 }
9587 break;
9588
9589 case TRUNCATE:
9590
9591 /* Decompose <su>muldi3_highpart. */
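	/* Illustrative note: the RTL shape matched below is what typically
	   results from C source along the lines of

	     uint64_t mulh (uint64_t a, uint64_t b)
	     {
	       return (uint64_t) (((unsigned __int128) a * b) >> 64);
	     }

	   which this port implements with a single UMULH (SMULH for the
	   sign-extended variant).  */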
9592 if (/* (truncate:DI */
9593 mode == DImode
9594 /* (lshiftrt:TI */
9595 && GET_MODE (XEXP (x, 0)) == TImode
9596 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
9597 /* (mult:TI */
9598 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
9599 /* (ANY_EXTEND:TI (reg:DI))
9600 (ANY_EXTEND:TI (reg:DI))) */
9601 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
9602 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
9603 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
9604 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
9605 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
9606 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
9607 /* (const_int 64) */
9608 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9609 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
9610 {
9611 /* UMULH/SMULH. */
9612 if (speed)
9613 *cost += extra_cost->mult[mode == DImode].extend;
9614 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
9615 mode, MULT, 0, speed);
9616 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
9617 mode, MULT, 1, speed);
9618 return true;
9619 }
9620
9621 /* Fall through. */
9622 default:
9623 break;
9624 }
9625
9626 if (dump_file
9627 && flag_aarch64_verbose_cost)
9628 fprintf (dump_file,
9629 "\nFailed to cost RTX. Assuming default cost.\n");
9630
9631 return true;
9632 }
9633
9634 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
9635 calculated for X. This cost is stored in *COST. Returns true
9636 if the total cost of X was calculated. */
9637 static bool
9638 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
9639 int param, int *cost, bool speed)
9640 {
9641 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
9642
9643 if (dump_file
9644 && flag_aarch64_verbose_cost)
9645 {
9646 print_rtl_single (dump_file, x);
9647 fprintf (dump_file, "\n%s cost: %d (%s)\n",
9648 speed ? "Hot" : "Cold",
9649 *cost, result ? "final" : "partial");
9650 }
9651
9652 return result;
9653 }
9654
9655 static int
9656 aarch64_register_move_cost (machine_mode mode,
9657 reg_class_t from_i, reg_class_t to_i)
9658 {
9659 enum reg_class from = (enum reg_class) from_i;
9660 enum reg_class to = (enum reg_class) to_i;
9661 const struct cpu_regmove_cost *regmove_cost
9662 = aarch64_tune_params.regmove_cost;
9663
9664 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
9665 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
9666 to = GENERAL_REGS;
9667
9668 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
9669 from = GENERAL_REGS;
9670
9671 /* The cost of moving between a GPR and the stack pointer is the same as GP2GP. */
9672 if ((from == GENERAL_REGS && to == STACK_REG)
9673 || (to == GENERAL_REGS && from == STACK_REG))
9674 return regmove_cost->GP2GP;
9675
9676 /* To/From the stack register, we move via the gprs. */
9677 if (to == STACK_REG || from == STACK_REG)
9678 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
9679 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
9680
9681 if (known_eq (GET_MODE_SIZE (mode), 16))
9682 {
9683 /* 128-bit operations on general registers require 2 instructions. */
9684 if (from == GENERAL_REGS && to == GENERAL_REGS)
9685 return regmove_cost->GP2GP * 2;
9686 else if (from == GENERAL_REGS)
9687 return regmove_cost->GP2FP * 2;
9688 else if (to == GENERAL_REGS)
9689 return regmove_cost->FP2GP * 2;
9690
9691 /* When AdvSIMD instructions are disabled it is not possible to move
9692 a 128-bit value directly between Q registers. This is handled in
9693 secondary reload. A general register is used as a scratch to move
9694 the upper DI value and the lower DI value is moved directly,
9695 hence the cost is the sum of three moves. */
9696 if (! TARGET_SIMD)
9697 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
9698
9699 return regmove_cost->FP2FP;
9700 }
9701
9702 if (from == GENERAL_REGS && to == GENERAL_REGS)
9703 return regmove_cost->GP2GP;
9704 else if (from == GENERAL_REGS)
9705 return regmove_cost->GP2FP;
9706 else if (to == GENERAL_REGS)
9707 return regmove_cost->FP2GP;
9708
9709 return regmove_cost->FP2FP;
9710 }
9711
9712 static int
9713 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
9714 reg_class_t rclass ATTRIBUTE_UNUSED,
9715 bool in ATTRIBUTE_UNUSED)
9716 {
9717 return aarch64_tune_params.memmov_cost;
9718 }
9719
9720 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9721 to optimize 1.0/sqrt. */
9722
9723 static bool
9724 use_rsqrt_p (machine_mode mode)
9725 {
9726 return (!flag_trapping_math
9727 && flag_unsafe_math_optimizations
9728 && ((aarch64_tune_params.approx_modes->recip_sqrt
9729 & AARCH64_APPROX_MODE (mode))
9730 || flag_mrecip_low_precision_sqrt));
9731 }
9732
9733 /* Function to decide when to use the approximate reciprocal square root
9734 builtin. */
9735
9736 static tree
9737 aarch64_builtin_reciprocal (tree fndecl)
9738 {
9739 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
9740
9741 if (!use_rsqrt_p (mode))
9742 return NULL_TREE;
9743 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
9744 }
9745
9746 /* Emit instruction sequence to compute either the approximate square root
9747 or its approximate reciprocal, depending on the flag RECP, and return
9748 whether the sequence was emitted or not. */
9749
9750 bool
9751 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
9752 {
9753 machine_mode mode = GET_MODE (dst);
9754
9755 if (GET_MODE_INNER (mode) == HFmode)
9756 {
9757 gcc_assert (!recp);
9758 return false;
9759 }
9760
9761 if (!recp)
9762 {
9763 if (!(flag_mlow_precision_sqrt
9764 || (aarch64_tune_params.approx_modes->sqrt
9765 & AARCH64_APPROX_MODE (mode))))
9766 return false;
9767
9768 if (flag_finite_math_only
9769 || flag_trapping_math
9770 || !flag_unsafe_math_optimizations
9771 || optimize_function_for_size_p (cfun))
9772 return false;
9773 }
9774 else
9775 /* Caller assumes we cannot fail. */
9776 gcc_assert (use_rsqrt_p (mode));
9777
9778 machine_mode mmsk = mode_for_int_vector (mode).require ();
9779 rtx xmsk = gen_reg_rtx (mmsk);
9780 if (!recp)
9781 /* When calculating the approximate square root, compare the
9782 argument with 0.0 and create a mask. */
9783 emit_insn (gen_rtx_SET (xmsk,
9784 gen_rtx_NEG (mmsk,
9785 gen_rtx_EQ (mmsk, src,
9786 CONST0_RTX (mode)))));
9787
9788 /* Estimate the approximate reciprocal square root. */
9789 rtx xdst = gen_reg_rtx (mode);
9790 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
9791
9792 /* Iterate over the series twice for SF and thrice for DF. */
9793 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9794
9795 /* Optionally iterate over the series once less for faster performance,
9796 at the cost of some accuracy. */
9797 if ((recp && flag_mrecip_low_precision_sqrt)
9798 || (!recp && flag_mlow_precision_sqrt))
9799 iterations--;
9800
9801 /* Iterate over the series to calculate the approximate reciprocal square
9802 root. */
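  /* Each FRSQRTS step computes (3 - d * x * x) / 2, so the loop below refines
     the initial FRSQRTE estimate with the Newton-Raphson recurrence
     x' = x * (3 - d * x * x) / 2.  A minimal scalar sketch of one refinement
     step, purely for illustration (D is the input, X the running estimate):

	x = x * (3.0f - d * x * x) * 0.5f;  */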
9803 rtx x1 = gen_reg_rtx (mode);
9804 while (iterations--)
9805 {
9806 rtx x2 = gen_reg_rtx (mode);
9807 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
9808
9809 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
9810
9811 if (iterations > 0)
9812 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
9813 }
9814
9815 if (!recp)
9816 {
9817 /* Qualify the approximate reciprocal square root when the argument is
9818 0.0 by squashing the intermediary result to 0.0. */
9819 rtx xtmp = gen_reg_rtx (mmsk);
9820 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
9821 gen_rtx_SUBREG (mmsk, xdst, 0)));
9822 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
9823
9824 /* Calculate the approximate square root. */
9825 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
9826 }
9827
9828 /* Finalize the approximation. */
9829 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
9830
9831 return true;
9832 }
9833
9834 /* Emit the instruction sequence to compute the approximation for the division
9835 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
9836
9837 bool
9838 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
9839 {
9840 machine_mode mode = GET_MODE (quo);
9841
9842 if (GET_MODE_INNER (mode) == HFmode)
9843 return false;
9844
9845 bool use_approx_division_p = (flag_mlow_precision_div
9846 || (aarch64_tune_params.approx_modes->division
9847 & AARCH64_APPROX_MODE (mode)));
9848
9849 if (!flag_finite_math_only
9850 || flag_trapping_math
9851 || !flag_unsafe_math_optimizations
9852 || optimize_function_for_size_p (cfun)
9853 || !use_approx_division_p)
9854 return false;
9855
9856 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
9857 return false;
9858
9859 /* Estimate the approximate reciprocal. */
9860 rtx xrcp = gen_reg_rtx (mode);
9861 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
9862
9863 /* Iterate over the series twice for SF and thrice for DF. */
9864 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9865
9866 /* Optionally iterate over the series once less for faster performance,
9867 at the cost of some accuracy. */
9868 if (flag_mlow_precision_div)
9869 iterations--;
9870
9871 /* Iterate over the series to calculate the approximate reciprocal. */
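  /* Each FRECPS step computes 2 - d * x, so the loop below refines the
     initial FRECPE estimate of 1/d with the Newton-Raphson recurrence
     x' = x * (2 - d * x).  A minimal scalar sketch of one refinement step,
     purely for illustration (D is the divisor, X the running estimate):

	x = x * (2.0f - d * x);  */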
9872 rtx xtmp = gen_reg_rtx (mode);
9873 while (iterations--)
9874 {
9875 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
9876
9877 if (iterations > 0)
9878 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
9879 }
9880
9881 if (num != CONST1_RTX (mode))
9882 {
9883 /* As the approximate reciprocal of DEN is already calculated, only
9884 calculate the approximate division when NUM is not 1.0. */
9885 rtx xnum = force_reg (mode, num);
9886 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
9887 }
9888
9889 /* Finalize the approximation. */
9890 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
9891 return true;
9892 }
9893
9894 /* Return the number of instructions that can be issued per cycle. */
9895 static int
9896 aarch64_sched_issue_rate (void)
9897 {
9898 return aarch64_tune_params.issue_rate;
9899 }
9900
9901 static int
9902 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
9903 {
9904 int issue_rate = aarch64_sched_issue_rate ();
9905
9906 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
9907 }
9908
9909
9910 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
9911 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
9912 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
9913
9914 static int
9915 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
9916 int ready_index)
9917 {
9918 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
9919 }
9920
9921
9922 /* Vectorizer cost model target hooks. */
9923
9924 /* Implement targetm.vectorize.builtin_vectorization_cost. */
9925 static int
9926 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
9927 tree vectype,
9928 int misalign ATTRIBUTE_UNUSED)
9929 {
9930 unsigned elements;
9931 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
9932 bool fp = false;
9933
9934 if (vectype != NULL)
9935 fp = FLOAT_TYPE_P (vectype);
9936
9937 switch (type_of_cost)
9938 {
9939 case scalar_stmt:
9940 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
9941
9942 case scalar_load:
9943 return costs->scalar_load_cost;
9944
9945 case scalar_store:
9946 return costs->scalar_store_cost;
9947
9948 case vector_stmt:
9949 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
9950
9951 case vector_load:
9952 return costs->vec_align_load_cost;
9953
9954 case vector_store:
9955 return costs->vec_store_cost;
9956
9957 case vec_to_scalar:
9958 return costs->vec_to_scalar_cost;
9959
9960 case scalar_to_vec:
9961 return costs->scalar_to_vec_cost;
9962
9963 case unaligned_load:
9964 case vector_gather_load:
9965 return costs->vec_unalign_load_cost;
9966
9967 case unaligned_store:
9968 case vector_scatter_store:
9969 return costs->vec_unalign_store_cost;
9970
9971 case cond_branch_taken:
9972 return costs->cond_taken_branch_cost;
9973
9974 case cond_branch_not_taken:
9975 return costs->cond_not_taken_branch_cost;
9976
9977 case vec_perm:
9978 return costs->vec_permute_cost;
9979
9980 case vec_promote_demote:
9981 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
9982
9983 case vec_construct:
9984 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
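      /* For example (illustrative arithmetic on the formula below):
	 constructing a V4SI from four scalar elements costs 4 / 2 + 1 = 3,
	 and a V2DI costs 2 / 2 + 1 = 2.  */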
9985 return elements / 2 + 1;
9986
9987 default:
9988 gcc_unreachable ();
9989 }
9990 }
9991
9992 /* Implement targetm.vectorize.add_stmt_cost. */
9993 static unsigned
9994 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
9995 struct _stmt_vec_info *stmt_info, int misalign,
9996 enum vect_cost_model_location where)
9997 {
9998 unsigned *cost = (unsigned *) data;
9999 unsigned retval = 0;
10000
10001 if (flag_vect_cost_model)
10002 {
10003 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10004 int stmt_cost =
10005 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10006
10007 /* Statements in an inner loop relative to the loop being
10008 vectorized are weighted more heavily. The value here is
10009 arbitrary and could potentially be improved with analysis. */
10010 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10011 count *= 50; /* FIXME */
10012
10013 retval = (unsigned) (count * stmt_cost);
10014 cost[where] += retval;
10015 }
10016
10017 return retval;
10018 }
10019
10020 static void initialize_aarch64_code_model (struct gcc_options *);
10021
10022 /* Parse the TO_PARSE string and put the architecture struct that it
10023 selects into RES and the architectural features into ISA_FLAGS.
10024 Return an aarch64_parse_opt_result describing the parse result.
10025 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
10026
10027 static enum aarch64_parse_opt_result
10028 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10029 unsigned long *isa_flags)
10030 {
10031 char *ext;
10032 const struct processor *arch;
10033 char *str = (char *) alloca (strlen (to_parse) + 1);
10034 size_t len;
10035
10036 strcpy (str, to_parse);
10037
10038 ext = strchr (str, '+');
10039
10040 if (ext != NULL)
10041 len = ext - str;
10042 else
10043 len = strlen (str);
10044
10045 if (len == 0)
10046 return AARCH64_PARSE_MISSING_ARG;
10047
10048
10049 /* Loop through the list of supported ARCHes to find a match. */
10050 for (arch = all_architectures; arch->name != NULL; arch++)
10051 {
10052 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
10053 {
10054 unsigned long isa_temp = arch->flags;
10055
10056 if (ext != NULL)
10057 {
10058 /* TO_PARSE string contains at least one extension. */
10059 enum aarch64_parse_opt_result ext_res
10060 = aarch64_parse_extension (ext, &isa_temp);
10061
10062 if (ext_res != AARCH64_PARSE_OK)
10063 return ext_res;
10064 }
10065 /* Extension parsing was successful. Confirm the result
10066 arch and ISA flags. */
10067 *res = arch;
10068 *isa_flags = isa_temp;
10069 return AARCH64_PARSE_OK;
10070 }
10071 }
10072
10073 /* ARCH name not found in list. */
10074 return AARCH64_PARSE_INVALID_ARG;
10075 }
10076
10077 /* Parse the TO_PARSE string and put the result tuning in RES and the
10078 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
10079 describing the parse result. If there is an error parsing, RES and
10080 ISA_FLAGS are left unchanged. */
10081
10082 static enum aarch64_parse_opt_result
10083 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10084 unsigned long *isa_flags)
10085 {
10086 char *ext;
10087 const struct processor *cpu;
10088 char *str = (char *) alloca (strlen (to_parse) + 1);
10089 size_t len;
10090
10091 strcpy (str, to_parse);
10092
10093 ext = strchr (str, '+');
10094
10095 if (ext != NULL)
10096 len = ext - str;
10097 else
10098 len = strlen (str);
10099
10100 if (len == 0)
10101 return AARCH64_PARSE_MISSING_ARG;
10102
10103
10104 /* Loop through the list of supported CPUs to find a match. */
10105 for (cpu = all_cores; cpu->name != NULL; cpu++)
10106 {
10107 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
10108 {
10109 unsigned long isa_temp = cpu->flags;
10110
10111
10112 if (ext != NULL)
10113 {
10114 /* TO_PARSE string contains at least one extension. */
10115 enum aarch64_parse_opt_result ext_res
10116 = aarch64_parse_extension (ext, &isa_temp);
10117
10118 if (ext_res != AARCH64_PARSE_OK)
10119 return ext_res;
10120 }
10121 /* Extension parsing was successful. Confirm the result
10122 cpu and ISA flags. */
10123 *res = cpu;
10124 *isa_flags = isa_temp;
10125 return AARCH64_PARSE_OK;
10126 }
10127 }
10128
10129 /* CPU name not found in list. */
10130 return AARCH64_PARSE_INVALID_ARG;
10131 }
10132
10133 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10134 Return an aarch64_parse_opt_result describing the parse result.
10135 If the parsing fails the RES does not change. */
10136
10137 static enum aarch64_parse_opt_result
10138 aarch64_parse_tune (const char *to_parse, const struct processor **res)
10139 {
10140 const struct processor *cpu;
10141 char *str = (char *) alloca (strlen (to_parse) + 1);
10142
10143 strcpy (str, to_parse);
10144
10145 /* Loop through the list of supported CPUs to find a match. */
10146 for (cpu = all_cores; cpu->name != NULL; cpu++)
10147 {
10148 if (strcmp (cpu->name, str) == 0)
10149 {
10150 *res = cpu;
10151 return AARCH64_PARSE_OK;
10152 }
10153 }
10154
10155 /* CPU name not found in list. */
10156 return AARCH64_PARSE_INVALID_ARG;
10157 }
10158
10159 /* Parse TOKEN, which has length LENGTH, to see if it is an option
10160 described in FLAG. If it is, return the index bit for that fusion type.
10161 If not, error (printing OPTION_NAME) and return zero. */
10162
10163 static unsigned int
10164 aarch64_parse_one_option_token (const char *token,
10165 size_t length,
10166 const struct aarch64_flag_desc *flag,
10167 const char *option_name)
10168 {
10169 for (; flag->name != NULL; flag++)
10170 {
10171 if (length == strlen (flag->name)
10172 && !strncmp (flag->name, token, length))
10173 return flag->flag;
10174 }
10175
10176 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10177 return 0;
10178 }
10179
10180 /* Parse OPTION which is a comma-separated list of flags to enable.
10181 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10182 default state we inherit from the CPU tuning structures. OPTION_NAME
10183 gives the top-level option we are parsing in the -moverride string,
10184 for use in error messages. */
10185
10186 static unsigned int
10187 aarch64_parse_boolean_options (const char *option,
10188 const struct aarch64_flag_desc *flags,
10189 unsigned int initial_state,
10190 const char *option_name)
10191 {
10192 const char separator = '.';
10193 const char* specs = option;
10194 const char* ntoken = option;
10195 unsigned int found_flags = initial_state;
10196
10197 while ((ntoken = strchr (specs, separator)))
10198 {
10199 size_t token_length = ntoken - specs;
10200 unsigned token_ops = aarch64_parse_one_option_token (specs,
10201 token_length,
10202 flags,
10203 option_name);
10204 /* If we find "none" (or, for simplicity's sake, an error) anywhere
10205 in the token stream, reset the supported operations. So:
10206
10207 adrp+add.cmp+branch.none.adrp+add
10208
10209 would have the result of turning on only adrp+add fusion. */
10210 if (!token_ops)
10211 found_flags = 0;
10212
10213 found_flags |= token_ops;
10214 specs = ++ntoken;
10215 }
10216
10217 /* The string ended with a trailing separator; that is ill-formed. */
10218 if (!(*specs))
10219 {
10220 error ("%s string ill-formed\n", option_name);
10221 return 0;
10222 }
10223
10224 /* We still have one more token to parse. */
10225 size_t token_length = strlen (specs);
10226 unsigned token_ops = aarch64_parse_one_option_token (specs,
10227 token_length,
10228 flags,
10229 option_name);
10230 if (!token_ops)
10231 found_flags = 0;
10232
10233 found_flags |= token_ops;
10234 return found_flags;
10235 }
10236
10237 /* Support for overriding instruction fusion. */
10238
10239 static void
10240 aarch64_parse_fuse_string (const char *fuse_string,
10241 struct tune_params *tune)
10242 {
10243 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
10244 aarch64_fusible_pairs,
10245 tune->fusible_ops,
10246 "fuse=");
10247 }
10248
10249 /* Support for overriding other tuning flags. */
10250
10251 static void
10252 aarch64_parse_tune_string (const char *tune_string,
10253 struct tune_params *tune)
10254 {
10255 tune->extra_tuning_flags
10256 = aarch64_parse_boolean_options (tune_string,
10257 aarch64_tuning_flags,
10258 tune->extra_tuning_flags,
10259 "tune=");
10260 }
10261
10262 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
10263 we understand. If it is, extract the option string and hand it off to
10264 the appropriate function. */
10265
10266 void
10267 aarch64_parse_one_override_token (const char* token,
10268 size_t length,
10269 struct tune_params *tune)
10270 {
10271 const struct aarch64_tuning_override_function *fn
10272 = aarch64_tuning_override_functions;
10273
10274 const char *option_part = strchr (token, '=');
10275 if (!option_part)
10276 {
10277 error ("tuning string missing in option (%s)", token);
10278 return;
10279 }
10280
10281 /* Get the length of the option name. */
10282 length = option_part - token;
10283 /* Skip the '=' to get to the option string. */
10284 option_part++;
10285
10286 for (; fn->name != NULL; fn++)
10287 {
10288 if (!strncmp (fn->name, token, length))
10289 {
10290 fn->parse_override (option_part, tune);
10291 return;
10292 }
10293 }
10294
10295 error ("unknown tuning option (%s)", token);
10296 return;
10297 }
10298
10299 /* Validate and clamp the TLS size for the selected code model. */
10300
10301 static void
10302 initialize_aarch64_tls_size (struct gcc_options *opts)
10303 {
10304 if (aarch64_tls_size == 0)
10305 aarch64_tls_size = 24;
10306
10307 switch (opts->x_aarch64_cmodel_var)
10308 {
10309 case AARCH64_CMODEL_TINY:
10310 /* Both the default and maximum TLS size allowed under tiny are 1M, which
10311 needs two instructions to address, so we clamp the size to 24 bits. */
10312 if (aarch64_tls_size > 24)
10313 aarch64_tls_size = 24;
10314 break;
10315 case AARCH64_CMODEL_SMALL:
10316 /* The maximum TLS size allowed under small is 4G. */
10317 if (aarch64_tls_size > 32)
10318 aarch64_tls_size = 32;
10319 break;
10320 case AARCH64_CMODEL_LARGE:
10321 /* The maximum TLS size allowed under large is 16E.
10322 FIXME: 16E would need a 64-bit offset; we only support a 48-bit offset now. */
10323 if (aarch64_tls_size > 48)
10324 aarch64_tls_size = 48;
10325 break;
10326 default:
10327 gcc_unreachable ();
10328 }
10329
10330 return;
10331 }
10332
10333 /* Parse STRING looking for options in the format:
10334 string :: option:string
10335 option :: name=substring
10336 name :: {a-z}
10337 substring :: defined by option. */
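/* As an illustration (the flag names are only examples; the authoritative
   lists live in aarch64_fusible_pairs and aarch64_tuning_flags), a string
   such as

     -moverride=fuse=adrp+add.cmp+branch:tune=rename_fma_regs

   is split on ':' below and each name=substring option is handed to
   aarch64_parse_one_override_token.  */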
10338
10339 static void
10340 aarch64_parse_override_string (const char* input_string,
10341 struct tune_params* tune)
10342 {
10343 const char separator = ':';
10344 size_t string_length = strlen (input_string) + 1;
10345 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
10346 char *string = string_root;
10347 strncpy (string, input_string, string_length);
10348 string[string_length - 1] = '\0';
10349
10350 char* ntoken = string;
10351
10352 while ((ntoken = strchr (string, separator)))
10353 {
10354 size_t token_length = ntoken - string;
10355 /* Make this substring look like a string. */
10356 *ntoken = '\0';
10357 aarch64_parse_one_override_token (string, token_length, tune);
10358 string = ++ntoken;
10359 }
10360
10361 /* One last option to parse. */
10362 aarch64_parse_one_override_token (string, strlen (string), tune);
10363 free (string_root);
10364 }
10365
10366
10367 static void
10368 aarch64_override_options_after_change_1 (struct gcc_options *opts)
10369 {
10370 /* PR 70044: We have to be careful about being called multiple times for the
10371 same function. This means all changes should be repeatable. */
10372
10373 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
10374 Disable the frame pointer flag so the mid-end will not use a frame
10375 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
10376 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
10377 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
10378 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
10379 if (opts->x_flag_omit_frame_pointer == 0)
10380 opts->x_flag_omit_frame_pointer = 2;
10381
10382 /* If not optimizing for size, set the default
10383 alignment to what the target wants. */
10384 if (!opts->x_optimize_size)
10385 {
10386 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
10387 opts->x_str_align_loops = aarch64_tune_params.loop_align;
10388 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
10389 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
10390 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
10391 opts->x_str_align_functions = aarch64_tune_params.function_align;
10392 }
10393
10394 /* We default to no pc-relative literal loads. */
10395
10396 aarch64_pcrelative_literal_loads = false;
10397
10398 /* If -mpc-relative-literal-loads is set on the command line, this
10399 implies that the user asked for PC relative literal loads. */
10400 if (opts->x_pcrelative_literal_loads == 1)
10401 aarch64_pcrelative_literal_loads = true;
10402
10403 /* In the tiny memory model it makes no sense to disallow PC relative
10404 literal pool loads. */
10405 if (aarch64_cmodel == AARCH64_CMODEL_TINY
10406 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10407 aarch64_pcrelative_literal_loads = true;
10408
10409 /* When enabling the lower precision Newton series for the square root, also
10410 enable it for the reciprocal square root, since the latter is an
10411 intermediary step for the former. */
10412 if (flag_mlow_precision_sqrt)
10413 flag_mrecip_low_precision_sqrt = true;
10414 }
10415
10416 /* 'Unpack' the internal tuning structs and update the options
10417 in OPTS. The caller must have set up selected_tune and selected_arch
10418 as all the other target-specific codegen decisions are
10419 derived from them. */
10420
10421 void
10422 aarch64_override_options_internal (struct gcc_options *opts)
10423 {
10424 aarch64_tune_flags = selected_tune->flags;
10425 aarch64_tune = selected_tune->sched_core;
10426 /* Make a copy of the tuning parameters attached to the core, which
10427 we may later overwrite. */
10428 aarch64_tune_params = *(selected_tune->tune);
10429 aarch64_architecture_version = selected_arch->architecture_version;
10430
10431 if (opts->x_aarch64_override_tune_string)
10432 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
10433 &aarch64_tune_params);
10434
10435 /* This target defaults to strict volatile bitfields. */
10436 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
10437 opts->x_flag_strict_volatile_bitfields = 1;
10438
10439 initialize_aarch64_code_model (opts);
10440 initialize_aarch64_tls_size (opts);
10441
10442 int queue_depth = 0;
10443 switch (aarch64_tune_params.autoprefetcher_model)
10444 {
10445 case tune_params::AUTOPREFETCHER_OFF:
10446 queue_depth = -1;
10447 break;
10448 case tune_params::AUTOPREFETCHER_WEAK:
10449 queue_depth = 0;
10450 break;
10451 case tune_params::AUTOPREFETCHER_STRONG:
10452 queue_depth = max_insn_queue_index + 1;
10453 break;
10454 default:
10455 gcc_unreachable ();
10456 }
10457
10458 /* We don't mind passing in global_options_set here as we don't use
10459 the *options_set structs anyway. */
10460 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
10461 queue_depth,
10462 opts->x_param_values,
10463 global_options_set.x_param_values);
10464
10465 /* Set up parameters to be used in prefetching algorithm. Do not
10466 override the defaults unless we are tuning for a core we have
10467 researched values for. */
10468 if (aarch64_tune_params.prefetch->num_slots > 0)
10469 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
10470 aarch64_tune_params.prefetch->num_slots,
10471 opts->x_param_values,
10472 global_options_set.x_param_values);
10473 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
10474 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
10475 aarch64_tune_params.prefetch->l1_cache_size,
10476 opts->x_param_values,
10477 global_options_set.x_param_values);
10478 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
10479 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
10480 aarch64_tune_params.prefetch->l1_cache_line_size,
10481 opts->x_param_values,
10482 global_options_set.x_param_values);
10483 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
10484 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
10485 aarch64_tune_params.prefetch->l2_cache_size,
10486 opts->x_param_values,
10487 global_options_set.x_param_values);
10488 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
10489 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
10490 0,
10491 opts->x_param_values,
10492 global_options_set.x_param_values);
10493 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
10494 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
10495 aarch64_tune_params.prefetch->minimum_stride,
10496 opts->x_param_values,
10497 global_options_set.x_param_values);
10498
10499 /* Use the alternative scheduling-pressure algorithm by default. */
10500 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
10501 opts->x_param_values,
10502 global_options_set.x_param_values);
10503
10504 /* Enable software prefetching at the specified optimization level for
10505 CPUs that have prefetch support. Lower the optimization level
10506 threshold by 1 when profiling is enabled. */
10507 if (opts->x_flag_prefetch_loop_arrays < 0
10508 && !opts->x_optimize_size
10509 && aarch64_tune_params.prefetch->default_opt_level >= 0
10510 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
10511 opts->x_flag_prefetch_loop_arrays = 1;
10512
10513 if (opts->x_aarch64_arch_string == NULL)
10514 opts->x_aarch64_arch_string = selected_arch->name;
10515 if (opts->x_aarch64_cpu_string == NULL)
10516 opts->x_aarch64_cpu_string = selected_cpu->name;
10517 if (opts->x_aarch64_tune_string == NULL)
10518 opts->x_aarch64_tune_string = selected_tune->name;
10519
10520 aarch64_override_options_after_change_1 (opts);
10521 }
10522
10523 /* Print a hint with a suggestion for a core or architecture name that
10524 most closely resembles what the user passed in STR. ARCH is true if
10525 the user is asking for an architecture name. ARCH is false if the user
10526 is asking for a core name. */
10527
10528 static void
10529 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
10530 {
10531 auto_vec<const char *> candidates;
10532 const struct processor *entry = arch ? all_architectures : all_cores;
10533 for (; entry->name != NULL; entry++)
10534 candidates.safe_push (entry->name);
10535
10536 #ifdef HAVE_LOCAL_CPU_DETECT
10537 /* Add also "native" as possible value. */
10538 if (arch)
10539 candidates.safe_push ("native");
10540 #endif
10541
10542 char *s;
10543 const char *hint = candidates_list_and_hint (str, s, candidates);
10544 if (hint)
10545 inform (input_location, "valid arguments are: %s;"
10546 " did you mean %qs?", s, hint);
10547 else
10548 inform (input_location, "valid arguments are: %s", s);
10549
10550 XDELETEVEC (s);
10551 }
10552
10553 /* Print a hint with a suggestion for a core name that most closely resembles
10554 what the user passed in STR. */
10555
10556 inline static void
10557 aarch64_print_hint_for_core (const char *str)
10558 {
10559 aarch64_print_hint_for_core_or_arch (str, false);
10560 }
10561
10562 /* Print a hint with a suggestion for an architecture name that most closely
10563 resembles what the user passed in STR. */
10564
10565 inline static void
10566 aarch64_print_hint_for_arch (const char *str)
10567 {
10568 aarch64_print_hint_for_core_or_arch (str, true);
10569 }
10570
10571 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
10572 specified in STR and throw errors if appropriate. Put the results, if
10573 they are valid, in RES and ISA_FLAGS. Return whether the option is
10574 valid. */
10575
10576 static bool
10577 aarch64_validate_mcpu (const char *str, const struct processor **res,
10578 unsigned long *isa_flags)
10579 {
10580 enum aarch64_parse_opt_result parse_res
10581 = aarch64_parse_cpu (str, res, isa_flags);
10582
10583 if (parse_res == AARCH64_PARSE_OK)
10584 return true;
10585
10586 switch (parse_res)
10587 {
10588 case AARCH64_PARSE_MISSING_ARG:
10589 error ("missing cpu name in %<-mcpu=%s%>", str);
10590 break;
10591 case AARCH64_PARSE_INVALID_ARG:
10592 error ("unknown value %qs for -mcpu", str);
10593 aarch64_print_hint_for_core (str);
10594 break;
10595 case AARCH64_PARSE_INVALID_FEATURE:
10596 error ("invalid feature modifier in %<-mcpu=%s%>", str);
10597 break;
10598 default:
10599 gcc_unreachable ();
10600 }
10601
10602 return false;
10603 }
10604
10605 /* Validate a command-line -march option. Parse the arch and extensions
10606 (if any) specified in STR and throw errors if appropriate. Put the
10607 results, if they are valid, in RES and ISA_FLAGS. Return whether the
10608 option is valid. */
10609
10610 static bool
10611 aarch64_validate_march (const char *str, const struct processor **res,
10612 unsigned long *isa_flags)
10613 {
10614 enum aarch64_parse_opt_result parse_res
10615 = aarch64_parse_arch (str, res, isa_flags);
10616
10617 if (parse_res == AARCH64_PARSE_OK)
10618 return true;
10619
10620 switch (parse_res)
10621 {
10622 case AARCH64_PARSE_MISSING_ARG:
10623 error ("missing arch name in %<-march=%s%>", str);
10624 break;
10625 case AARCH64_PARSE_INVALID_ARG:
10626 error ("unknown value %qs for -march", str);
10627 aarch64_print_hint_for_arch (str);
10628 break;
10629 case AARCH64_PARSE_INVALID_FEATURE:
10630 error ("invalid feature modifier in %<-march=%s%>", str);
10631 break;
10632 default:
10633 gcc_unreachable ();
10634 }
10635
10636 return false;
10637 }
10638
10639 /* Validate a command-line -mtune option. Parse the cpu
10640 specified in STR and throw errors if appropriate. Put the
10641 result, if it is valid, in RES. Return whether the option is
10642 valid. */
10643
10644 static bool
10645 aarch64_validate_mtune (const char *str, const struct processor **res)
10646 {
10647 enum aarch64_parse_opt_result parse_res
10648 = aarch64_parse_tune (str, res);
10649
10650 if (parse_res == AARCH64_PARSE_OK)
10651 return true;
10652
10653 switch (parse_res)
10654 {
10655 case AARCH64_PARSE_MISSING_ARG:
10656 error ("missing cpu name in %<-mtune=%s%>", str);
10657 break;
10658 case AARCH64_PARSE_INVALID_ARG:
10659 error ("unknown value %qs for -mtune", str);
10660 aarch64_print_hint_for_core (str);
10661 break;
10662 default:
10663 gcc_unreachable ();
10664 }
10665 return false;
10666 }
10667
10668 /* Return the CPU corresponding to the enum CPU.
10669 If it doesn't specify a cpu, return the default. */
10670
10671 static const struct processor *
10672 aarch64_get_tune_cpu (enum aarch64_processor cpu)
10673 {
10674 if (cpu != aarch64_none)
10675 return &all_cores[cpu];
10676
10677 /* The & 0x3f is to extract the bottom 6 bits that encode the
10678 default cpu as selected by the --with-cpu GCC configure option
10679 in config.gcc.
10680 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10681 flags mechanism should be reworked to make it more sane. */
10682 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10683 }
10684
10685 /* Return the architecture corresponding to the enum ARCH.
10686 If it doesn't specify a valid architecture, return the default. */
10687
10688 static const struct processor *
10689 aarch64_get_arch (enum aarch64_arch arch)
10690 {
10691 if (arch != aarch64_no_arch)
10692 return &all_architectures[arch];
10693
10694 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10695
10696 return &all_architectures[cpu->arch];
10697 }
10698
10699 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
10700
10701 static poly_uint16
10702 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
10703 {
10704 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10705 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10706 deciding which .md file patterns to use and when deciding whether
10707 something is a legitimate address or constant. */
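  /* Illustrative examples: -msve-vector-bits=256 therefore maps to a VG of
     256 / 64 = 4 and -msve-vector-bits=512 to 8, while SVE_SCALABLE and
     SVE_128 both map to the runtime-variable (2, 2) returned below.  */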
10708 if (value == SVE_SCALABLE || value == SVE_128)
10709 return poly_uint16 (2, 2);
10710 else
10711 return (int) value / 64;
10712 }
10713
10714 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
10715 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
10716 tuning structs. In particular it must set selected_tune and
10717 aarch64_isa_flags that define the available ISA features and tuning
10718 decisions. It must also set selected_arch as this will be used to
10719 output the .arch asm tags for each function. */
10720
10721 static void
10722 aarch64_override_options (void)
10723 {
10724 unsigned long cpu_isa = 0;
10725 unsigned long arch_isa = 0;
10726 aarch64_isa_flags = 0;
10727
10728 bool valid_cpu = true;
10729 bool valid_tune = true;
10730 bool valid_arch = true;
10731
10732 selected_cpu = NULL;
10733 selected_arch = NULL;
10734 selected_tune = NULL;
10735
10736 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10737 If either of -march or -mtune is given, they override their
10738 respective component of -mcpu. */
10739 if (aarch64_cpu_string)
10740 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
10741 &cpu_isa);
10742
10743 if (aarch64_arch_string)
10744 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
10745 &arch_isa);
10746
10747 if (aarch64_tune_string)
10748 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
10749
10750 /* If the user did not specify a processor, choose the default
10751 one for them. This will be the CPU set during configuration using
10752 --with-cpu, otherwise it is "generic". */
10753 if (!selected_cpu)
10754 {
10755 if (selected_arch)
10756 {
10757 selected_cpu = &all_cores[selected_arch->ident];
10758 aarch64_isa_flags = arch_isa;
10759 explicit_arch = selected_arch->arch;
10760 }
10761 else
10762 {
10763 /* Get default configure-time CPU. */
10764 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
10765 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
10766 }
10767
10768 if (selected_tune)
10769 explicit_tune_core = selected_tune->ident;
10770 }
10771 /* If both -mcpu and -march are specified check that they are architecturally
10772 compatible, warn if they're not and prefer the -march ISA flags. */
10773 else if (selected_arch)
10774 {
10775 if (selected_arch->arch != selected_cpu->arch)
10776 {
10777 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10778 all_architectures[selected_cpu->arch].name,
10779 selected_arch->name);
10780 }
10781 aarch64_isa_flags = arch_isa;
10782 explicit_arch = selected_arch->arch;
10783 explicit_tune_core = selected_tune ? selected_tune->ident
10784 : selected_cpu->ident;
10785 }
10786 else
10787 {
10788 /* -mcpu but no -march. */
10789 aarch64_isa_flags = cpu_isa;
10790 explicit_tune_core = selected_tune ? selected_tune->ident
10791 : selected_cpu->ident;
10792 gcc_assert (selected_cpu);
10793 selected_arch = &all_architectures[selected_cpu->arch];
10794 explicit_arch = selected_arch->arch;
10795 }
10796
10797 /* Set the arch as well, as we will need it when outputting
10798 the .arch directive in assembly. */
10799 if (!selected_arch)
10800 {
10801 gcc_assert (selected_cpu);
10802 selected_arch = &all_architectures[selected_cpu->arch];
10803 }
10804
10805 if (!selected_tune)
10806 selected_tune = selected_cpu;
10807
10808 #ifndef HAVE_AS_MABI_OPTION
10809 /* The compiler may have been configured with 2.23.* binutils, which does
10810 not have support for ILP32. */
10811 if (TARGET_ILP32)
10812 error ("assembler does not support -mabi=ilp32");
10813 #endif
10814
10815 /* Convert -msve-vector-bits to a VG count. */
10816 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
10817
10818 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
10819 sorry ("return address signing is only supported for -mabi=lp64");
10820
10821 /* Make sure we properly set up the explicit options. */
10822 if ((aarch64_cpu_string && valid_cpu)
10823 || (aarch64_tune_string && valid_tune))
10824 gcc_assert (explicit_tune_core != aarch64_none);
10825
10826 if ((aarch64_cpu_string && valid_cpu)
10827 || (aarch64_arch_string && valid_arch))
10828 gcc_assert (explicit_arch != aarch64_no_arch);
10829
10830 aarch64_override_options_internal (&global_options);
10831
10832 /* Save these options as the default ones in case we push and pop them later
10833 while processing functions with potential target attributes. */
10834 target_option_default_node = target_option_current_node
10835 = build_target_option_node (&global_options);
10836 }
10837
10838 /* Implement targetm.override_options_after_change. */
10839
10840 static void
10841 aarch64_override_options_after_change (void)
10842 {
10843 aarch64_override_options_after_change_1 (&global_options);
10844 }
10845
10846 static struct machine_function *
10847 aarch64_init_machine_status (void)
10848 {
10849 struct machine_function *machine;
10850 machine = ggc_cleared_alloc<machine_function> ();
10851 return machine;
10852 }
10853
10854 void
10855 aarch64_init_expanders (void)
10856 {
10857 init_machine_status = aarch64_init_machine_status;
10858 }
10859
10860 /* Set aarch64_cmodel from the selected code model, adjusting for -fpic/-fPIC. */
10861 static void
10862 initialize_aarch64_code_model (struct gcc_options *opts)
10863 {
10864 if (opts->x_flag_pic)
10865 {
10866 switch (opts->x_aarch64_cmodel_var)
10867 {
10868 case AARCH64_CMODEL_TINY:
10869 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
10870 break;
10871 case AARCH64_CMODEL_SMALL:
10872 #ifdef HAVE_AS_SMALL_PIC_RELOCS
10873 aarch64_cmodel = (flag_pic == 2
10874 ? AARCH64_CMODEL_SMALL_PIC
10875 : AARCH64_CMODEL_SMALL_SPIC);
10876 #else
10877 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
10878 #endif
10879 break;
10880 case AARCH64_CMODEL_LARGE:
10881 sorry ("code model %qs with -f%s", "large",
10882 opts->x_flag_pic > 1 ? "PIC" : "pic");
10883 break;
10884 default:
10885 gcc_unreachable ();
10886 }
10887 }
10888 else
10889 aarch64_cmodel = opts->x_aarch64_cmodel_var;
10890 }
10891
10892 /* Implement TARGET_OPTION_SAVE. */
10893
10894 static void
10895 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
10896 {
10897 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
10898 }
10899
10900 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
10901 using the information saved in PTR. */
10902
10903 static void
10904 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
10905 {
10906 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
10907 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
10908 opts->x_explicit_arch = ptr->x_explicit_arch;
10909 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
10910 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
10911
10912 aarch64_override_options_internal (opts);
10913 }
10914
10915 /* Implement TARGET_OPTION_PRINT. */
10916
10917 static void
10918 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
10919 {
10920 const struct processor *cpu
10921 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
10922 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
10923 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
10924 std::string extension
10925 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
10926
10927 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
10928 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
10929 arch->name, extension.c_str ());
10930 }
10931
10932 static GTY(()) tree aarch64_previous_fndecl;
10933
10934 void
10935 aarch64_reset_previous_fndecl (void)
10936 {
10937 aarch64_previous_fndecl = NULL;
10938 }
10939
10940 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
10941 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
10942 make sure optab availability predicates are recomputed when necessary. */
10943
10944 void
10945 aarch64_save_restore_target_globals (tree new_tree)
10946 {
10947 if (TREE_TARGET_GLOBALS (new_tree))
10948 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
10949 else if (new_tree == target_option_default_node)
10950 restore_target_globals (&default_target_globals);
10951 else
10952 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
10953 }
10954
10955 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
10956 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
10957 of the function, if such exists. This function may be called multiple
10958 times on a single function so use aarch64_previous_fndecl to avoid
10959 setting up identical state. */
10960
10961 static void
10962 aarch64_set_current_function (tree fndecl)
10963 {
10964 if (!fndecl || fndecl == aarch64_previous_fndecl)
10965 return;
10966
10967 tree old_tree = (aarch64_previous_fndecl
10968 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
10969 : NULL_TREE);
10970
10971 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
10972
10973 /* If current function has no attributes but the previous one did,
10974 use the default node. */
10975 if (!new_tree && old_tree)
10976 new_tree = target_option_default_node;
10977
10978 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
10979 the default have been handled by aarch64_save_restore_target_globals from
10980 aarch64_pragma_target_parse. */
10981 if (old_tree == new_tree)
10982 return;
10983
10984 aarch64_previous_fndecl = fndecl;
10985
10986 /* First set the target options. */
10987 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
10988
10989 aarch64_save_restore_target_globals (new_tree);
10990 }
10991
10992 /* Enum describing the various ways we can handle attributes.
10993 In many cases we can reuse the generic option handling machinery. */
10994
10995 enum aarch64_attr_opt_type
10996 {
10997 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
10998 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
10999 aarch64_attr_enum, /* Attribute sets an enum variable. */
11000 aarch64_attr_custom /* Attribute requires a custom handling function. */
11001 };
11002
11003 /* All the information needed to handle a target attribute.
11004 NAME is the name of the attribute.
11005 ATTR_TYPE specifies the type of behavior of the attribute as described
11006 in the definition of enum aarch64_attr_opt_type.
11007 ALLOW_NEG is true if the attribute supports a "no-" form.
11008 HANDLER is the function that takes the attribute string as an argument.
11009 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
11010 OPT_NUM is the enum specifying the option that the attribute modifies.
11011 This is needed for attributes that mirror the behavior of a command-line
11012 option, that is, those whose ATTR_TYPE is aarch64_attr_mask,
11013 aarch64_attr_bool or aarch64_attr_enum. */
11014
11015 struct aarch64_attribute_info
11016 {
11017 const char *name;
11018 enum aarch64_attr_opt_type attr_type;
11019 bool allow_neg;
11020 bool (*handler) (const char *);
11021 enum opt_code opt_num;
11022 };
11023
11024 /* Handle the ARCH_STR argument to the arch= target attribute. */
11025
11026 static bool
11027 aarch64_handle_attr_arch (const char *str)
11028 {
11029 const struct processor *tmp_arch = NULL;
11030 enum aarch64_parse_opt_result parse_res
11031 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
11032
11033 if (parse_res == AARCH64_PARSE_OK)
11034 {
11035 gcc_assert (tmp_arch);
11036 selected_arch = tmp_arch;
11037 explicit_arch = selected_arch->arch;
11038 return true;
11039 }
11040
11041 switch (parse_res)
11042 {
11043 case AARCH64_PARSE_MISSING_ARG:
11044 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
11045 break;
11046 case AARCH64_PARSE_INVALID_ARG:
11047 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
11048 aarch64_print_hint_for_arch (str);
11049 break;
11050 case AARCH64_PARSE_INVALID_FEATURE:
11051 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11052 break;
11053 default:
11054 gcc_unreachable ();
11055 }
11056
11057 return false;
11058 }
11059
11060 /* Handle the argument CPU_STR to the cpu= target attribute. */
11061
11062 static bool
11063 aarch64_handle_attr_cpu (const char *str)
11064 {
11065 const struct processor *tmp_cpu = NULL;
11066 enum aarch64_parse_opt_result parse_res
11067 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
11068
11069 if (parse_res == AARCH64_PARSE_OK)
11070 {
11071 gcc_assert (tmp_cpu);
11072 selected_tune = tmp_cpu;
11073 explicit_tune_core = selected_tune->ident;
11074
11075 selected_arch = &all_architectures[tmp_cpu->arch];
11076 explicit_arch = selected_arch->arch;
11077 return true;
11078 }
11079
11080 switch (parse_res)
11081 {
11082 case AARCH64_PARSE_MISSING_ARG:
11083 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
11084 break;
11085 case AARCH64_PARSE_INVALID_ARG:
11086 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
11087 aarch64_print_hint_for_core (str);
11088 break;
11089 case AARCH64_PARSE_INVALID_FEATURE:
11090 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11091 break;
11092 default:
11093 gcc_unreachable ();
11094 }
11095
11096 return false;
11097 }
11098
11099 /* Handle the argument STR to the tune= target attribute. */
11100
11101 static bool
11102 aarch64_handle_attr_tune (const char *str)
11103 {
11104 const struct processor *tmp_tune = NULL;
11105 enum aarch64_parse_opt_result parse_res
11106 = aarch64_parse_tune (str, &tmp_tune);
11107
11108 if (parse_res == AARCH64_PARSE_OK)
11109 {
11110 gcc_assert (tmp_tune);
11111 selected_tune = tmp_tune;
11112 explicit_tune_core = selected_tune->ident;
11113 return true;
11114 }
11115
11116 switch (parse_res)
11117 {
11118 case AARCH64_PARSE_INVALID_ARG:
11119 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
11120 aarch64_print_hint_for_core (str);
11121 break;
11122 default:
11123 gcc_unreachable ();
11124 }
11125
11126 return false;
11127 }
11128
11129 /* Parse an architecture extensions target attribute string specified in STR.
11130 For example "+fp+nosimd". Show any errors if needed. Return TRUE
11131 if successful. Update aarch64_isa_flags to reflect the ISA features
11132 modified. */
11133
11134 static bool
11135 aarch64_handle_attr_isa_flags (char *str)
11136 {
11137 enum aarch64_parse_opt_result parse_res;
11138 unsigned long isa_flags = aarch64_isa_flags;
11139
11140 /* We allow "+nothing" in the beginning to clear out all architectural
11141 features if the user wants to handpick specific features. */
11142 if (strncmp ("+nothing", str, 8) == 0)
11143 {
11144 isa_flags = 0;
11145 str += 8;
11146 }
11147
11148 parse_res = aarch64_parse_extension (str, &isa_flags);
11149
11150 if (parse_res == AARCH64_PARSE_OK)
11151 {
11152 aarch64_isa_flags = isa_flags;
11153 return true;
11154 }
11155
11156 switch (parse_res)
11157 {
11158 case AARCH64_PARSE_MISSING_ARG:
11159 error ("missing value in %<target()%> pragma or attribute");
11160 break;
11161
11162 case AARCH64_PARSE_INVALID_FEATURE:
11163 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11164 break;
11165
11166 default:
11167 gcc_unreachable ();
11168 }
11169
11170 return false;
11171 }
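
/* As an illustration (guarded out of the build; the function name below is
   hypothetical), the kind of string the parser above accepts: "+nothing"
   first clears every architectural feature, then named extensions are
   re-enabled one by one.  */
#if 0
__attribute__ ((target ("+nothing+fp+simd")))
static double
example_fp_simd_only (double x)
{
  /* All ISA extensions cleared, then FP and Advanced SIMD re-enabled.  */
  return x * 2.0;
}
#endif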
11172
11173 /* The target attributes that we support. On top of these we also support just
11174 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
11175 handled explicitly in aarch64_process_one_target_attr. */
11176
11177 static const struct aarch64_attribute_info aarch64_attributes[] =
11178 {
11179 { "general-regs-only", aarch64_attr_mask, false, NULL,
11180 OPT_mgeneral_regs_only },
11181 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
11182 OPT_mfix_cortex_a53_835769 },
11183 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
11184 OPT_mfix_cortex_a53_843419 },
11185 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
11186 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
11187 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
11188 OPT_momit_leaf_frame_pointer },
11189 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
11190 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
11191 OPT_march_ },
11192 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
11193 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
11194 OPT_mtune_ },
11195 { "sign-return-address", aarch64_attr_enum, false, NULL,
11196 OPT_msign_return_address_ },
11197 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
11198 };
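
/* An illustrative sketch (guarded out of the build; the function name and
   the chosen values are hypothetical examples) of how the entries above
   surface at the source level: "tune=" goes through its custom handler,
   "no-strict-align" is accepted because the "strict-align" entry allows a
   negated form, and "sign-return-address=" is handled as an enum option.  */
#if 0
__attribute__ ((target ("tune=cortex-a57,no-strict-align,sign-return-address=non-leaf")))
static int
example_attribute_forms (int x)
{
  return x + 1;
}
#endif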
11199
11200 /* Parse ARG_STR which contains the definition of one target attribute.
11201 Show appropriate errors if any or return true if the attribute is valid. */
11202
11203 static bool
11204 aarch64_process_one_target_attr (char *arg_str)
11205 {
11206 bool invert = false;
11207
11208 size_t len = strlen (arg_str);
11209
11210 if (len == 0)
11211 {
11212 error ("malformed %<target()%> pragma or attribute");
11213 return false;
11214 }
11215
11216 char *str_to_check = (char *) alloca (len + 1);
11217 strcpy (str_to_check, arg_str);
11218
11219 /* Skip leading whitespace. */
11220 while (*str_to_check == ' ' || *str_to_check == '\t')
11221 str_to_check++;
11222
11223 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11224 It is easier to detect and handle it explicitly here rather than going
11225 through the machinery for the rest of the target attributes in this
11226 function. */
11227 if (*str_to_check == '+')
11228 return aarch64_handle_attr_isa_flags (str_to_check);
11229
11230 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
11231 {
11232 invert = true;
11233 str_to_check += 3;
11234 }
11235 char *arg = strchr (str_to_check, '=');
11236
11237 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11238 and point ARG to "foo". */
11239 if (arg)
11240 {
11241 *arg = '\0';
11242 arg++;
11243 }
11244 const struct aarch64_attribute_info *p_attr;
11245 bool found = false;
11246 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
11247 {
11248 /* If the names don't match up, or the user has given an argument
11249 to an attribute that doesn't accept one, or didn't give an argument
11250 to an attribute that expects one, fail to match. */
11251 if (strcmp (str_to_check, p_attr->name) != 0)
11252 continue;
11253
11254 found = true;
11255 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
11256 || p_attr->attr_type == aarch64_attr_enum;
11257
11258 if (attr_need_arg_p ^ (arg != NULL))
11259 {
11260 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
11261 return false;
11262 }
11263
11264 /* If the name matches but the attribute does not allow "no-" versions
11265 then we can't match. */
11266 if (invert && !p_attr->allow_neg)
11267 {
11268 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
11269 return false;
11270 }
11271
11272 switch (p_attr->attr_type)
11273 {
11274 /* Has a custom handler registered.
11275 For example, cpu=, arch=, tune=. */
11276 case aarch64_attr_custom:
11277 gcc_assert (p_attr->handler);
11278 if (!p_attr->handler (arg))
11279 return false;
11280 break;
11281
11282 /* Either set or unset a boolean option. */
11283 case aarch64_attr_bool:
11284 {
11285 struct cl_decoded_option decoded;
11286
11287 generate_option (p_attr->opt_num, NULL, !invert,
11288 CL_TARGET, &decoded);
11289 aarch64_handle_option (&global_options, &global_options_set,
11290 &decoded, input_location);
11291 break;
11292 }
11293 /* Set or unset a bit in the target_flags. aarch64_handle_option
11294 should know what mask to apply given the option number. */
11295 case aarch64_attr_mask:
11296 {
11297 struct cl_decoded_option decoded;
11298 /* We only need to specify the option number.
11299 aarch64_handle_option will know which mask to apply. */
11300 decoded.opt_index = p_attr->opt_num;
11301 decoded.value = !invert;
11302 aarch64_handle_option (&global_options, &global_options_set,
11303 &decoded, input_location);
11304 break;
11305 }
11306 /* Use the option setting machinery to set an option to an enum. */
11307 case aarch64_attr_enum:
11308 {
11309 gcc_assert (arg);
11310 bool valid;
11311 int value;
11312 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
11313 &value, CL_TARGET);
11314 if (valid)
11315 {
11316 set_option (&global_options, NULL, p_attr->opt_num, value,
11317 NULL, DK_UNSPECIFIED, input_location,
11318 global_dc);
11319 }
11320 else
11321 {
11322 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
11323 }
11324 break;
11325 }
11326 default:
11327 gcc_unreachable ();
11328 }
11329 }
11330
11331 /* If we reached here we either have found an attribute and validated
11332 it or didn't match any. If we matched an attribute but its arguments
11333 were malformed we will have returned false already. */
11334 return found;
11335 }
11336
11337 /* Count how many times the character C appears in
11338 NULL-terminated string STR. */
11339
11340 static unsigned int
11341 num_occurences_in_str (char c, char *str)
11342 {
11343 unsigned int res = 0;
11344 while (*str != '\0')
11345 {
11346 if (*str == c)
11347 res++;
11348
11349 str++;
11350 }
11351
11352 return res;
11353 }
11354
11355 /* Parse the tree in ARGS that contains the target attribute information
11356 and update the global target options space. */
11357
11358 bool
11359 aarch64_process_target_attr (tree args)
11360 {
11361 if (TREE_CODE (args) == TREE_LIST)
11362 {
11363 do
11364 {
11365 tree head = TREE_VALUE (args);
11366 if (head)
11367 {
11368 if (!aarch64_process_target_attr (head))
11369 return false;
11370 }
11371 args = TREE_CHAIN (args);
11372 } while (args);
11373
11374 return true;
11375 }
11376
11377 if (TREE_CODE (args) != STRING_CST)
11378 {
11379 error ("attribute %<target%> argument not a string");
11380 return false;
11381 }
11382
11383 size_t len = strlen (TREE_STRING_POINTER (args));
11384 char *str_to_check = (char *) alloca (len + 1);
11385 strcpy (str_to_check, TREE_STRING_POINTER (args));
11386
11387 if (len == 0)
11388 {
11389 error ("malformed %<target()%> pragma or attribute");
11390 return false;
11391 }
11392
11393 /* Used to catch empty spaces between commas, i.e.
11394 attribute ((target ("attr1,,attr2"))). */
11395 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
11396
11397 /* Handle multiple target attributes separated by ','. */
11398 char *token = strtok (str_to_check, ",");
11399
11400 unsigned int num_attrs = 0;
11401 while (token)
11402 {
11403 num_attrs++;
11404 if (!aarch64_process_one_target_attr (token))
11405 {
11406 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
11407 return false;
11408 }
11409
11410 token = strtok (NULL, ",");
11411 }
11412
11413 if (num_attrs != num_commas + 1)
11414 {
11415 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
11416 return false;
11417 }
11418
11419 return true;
11420 }
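
/* An illustrative sketch (guarded out of the build; names are hypothetical)
   of the two argument shapes handled above: a single comma-separated string
   and a TREE_LIST of separate strings.  An empty entry, as in "attr1,,attr2",
   is what the num_commas check rejects.  */
#if 0
__attribute__ ((target ("cpu=cortex-a57,+crc")))           /* one string */
static int example_one_string (int x) { return x; }

__attribute__ ((target ("arch=armv8-a", "strict-align")))  /* TREE_LIST  */
static int example_string_list (int x) { return x; }
#endif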
11421
11422 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
11423 process attribute ((target ("..."))). */
11424
11425 static bool
11426 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
11427 {
11428 struct cl_target_option cur_target;
11429 bool ret;
11430 tree old_optimize;
11431 tree new_target, new_optimize;
11432 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11433
11434 /* If what we're processing is the current pragma string then the
11435 target option node is already stored in target_option_current_node
11436 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
11437 having to re-parse the string. This is especially useful to keep
11438 arm_neon.h compile times down since that header contains a lot
11439 of intrinsics enclosed in pragmas. */
11440 if (!existing_target && args == current_target_pragma)
11441 {
11442 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
11443 return true;
11444 }
11445 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11446
11447 old_optimize = build_optimization_node (&global_options);
11448 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11449
11450 /* If the function changed the optimization levels as well as setting
11451 target options, start with the optimizations specified. */
11452 if (func_optimize && func_optimize != old_optimize)
11453 cl_optimization_restore (&global_options,
11454 TREE_OPTIMIZATION (func_optimize));
11455
11456 /* Save the current target options to restore at the end. */
11457 cl_target_option_save (&cur_target, &global_options);
11458
11459 /* If fndecl already has some target attributes applied to it, unpack
11460 them so that we add this attribute on top of them, rather than
11461 overwriting them. */
11462 if (existing_target)
11463 {
11464 struct cl_target_option *existing_options
11465 = TREE_TARGET_OPTION (existing_target);
11466
11467 if (existing_options)
11468 cl_target_option_restore (&global_options, existing_options);
11469 }
11470 else
11471 cl_target_option_restore (&global_options,
11472 TREE_TARGET_OPTION (target_option_current_node));
11473
11474 ret = aarch64_process_target_attr (args);
11475
11476 /* Set up any additional state. */
11477 if (ret)
11478 {
11479 aarch64_override_options_internal (&global_options);
11480 /* Initialize SIMD builtins if we haven't already.
11481 Set current_target_pragma to NULL for the duration so that
11482 the builtin initialization code doesn't try to tag the functions
11483 being built with the attributes specified by any current pragma, thus
11484 going into an infinite recursion. */
11485 if (TARGET_SIMD)
11486 {
11487 tree saved_current_target_pragma = current_target_pragma;
11488 current_target_pragma = NULL;
11489 aarch64_init_simd_builtins ();
11490 current_target_pragma = saved_current_target_pragma;
11491 }
11492 new_target = build_target_option_node (&global_options);
11493 }
11494 else
11495 new_target = NULL;
11496
11497 new_optimize = build_optimization_node (&global_options);
11498
11499 if (fndecl && ret)
11500 {
11501 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
11502
11503 if (old_optimize != new_optimize)
11504 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
11505 }
11506
11507 cl_target_option_restore (&global_options, &cur_target);
11508
11509 if (old_optimize != new_optimize)
11510 cl_optimization_restore (&global_options,
11511 TREE_OPTIMIZATION (old_optimize));
11512 return ret;
11513 }
11514
11515 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
11516 tri-bool options (yes, no, don't care) and the default value is
11517 DEF, determine whether to reject inlining. */
11518
11519 static bool
11520 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
11521 int dont_care, int def)
11522 {
11523 /* If the callee doesn't care, always allow inlining. */
11524 if (callee == dont_care)
11525 return true;
11526
11527 /* If the caller doesn't care, always allow inlining. */
11528 if (caller == dont_care)
11529 return true;
11530
11531 /* Otherwise, allow inlining if either the callee and caller values
11532 agree, or if the callee is using the default value. */
11533 return (callee == caller || callee == def);
11534 }
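
/* A worked sketch of the helper above, using the convention of the errata
   checks below (0 = no, 1 = yes, 2 = don't care):

      caller  callee  def   result
        2       0      1    inline (caller doesn't care)
        1       2      1    inline (callee doesn't care)
        0       1      0    reject (explicit values disagree and the callee
                                    is not at the default)
        0       1      1    inline (callee matches the default)  */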
11535
11536 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
11537 to inline CALLEE into CALLER based on target-specific info.
11538 Make sure that the caller and callee have compatible architectural
11539 features. Then go through the other possible target attributes
11540 and see if they can block inlining. Try not to reject always_inline
11541 callees unless they are incompatible architecturally. */
11542
11543 static bool
11544 aarch64_can_inline_p (tree caller, tree callee)
11545 {
11546 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
11547 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
11548
11549 struct cl_target_option *caller_opts
11550 = TREE_TARGET_OPTION (caller_tree ? caller_tree
11551 : target_option_default_node);
11552
11553 struct cl_target_option *callee_opts
11554 = TREE_TARGET_OPTION (callee_tree ? callee_tree
11555 : target_option_default_node);
11556
11557 /* Callee's ISA flags should be a subset of the caller's. */
11558 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
11559 != callee_opts->x_aarch64_isa_flags)
11560 return false;
11561
11562 /* Allow non-strict aligned functions inlining into strict
11563 aligned ones. */
11564 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
11565 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
11566 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
11567 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
11568 return false;
11569
11570 bool always_inline = lookup_attribute ("always_inline",
11571 DECL_ATTRIBUTES (callee));
11572
11573 /* If the architectural features match up and the callee is always_inline
11574 then the other attributes don't matter. */
11575 if (always_inline)
11576 return true;
11577
11578 if (caller_opts->x_aarch64_cmodel_var
11579 != callee_opts->x_aarch64_cmodel_var)
11580 return false;
11581
11582 if (caller_opts->x_aarch64_tls_dialect
11583 != callee_opts->x_aarch64_tls_dialect)
11584 return false;
11585
11586 /* Honour explicit requests to work around errata. */
11587 if (!aarch64_tribools_ok_for_inlining_p (
11588 caller_opts->x_aarch64_fix_a53_err835769,
11589 callee_opts->x_aarch64_fix_a53_err835769,
11590 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
11591 return false;
11592
11593 if (!aarch64_tribools_ok_for_inlining_p (
11594 caller_opts->x_aarch64_fix_a53_err843419,
11595 callee_opts->x_aarch64_fix_a53_err843419,
11596 2, TARGET_FIX_ERR_A53_843419))
11597 return false;
11598
11599 /* If the user explicitly specified -momit-leaf-frame-pointer for the
11600 caller and callee and they don't match up, reject inlining. */
11601 if (!aarch64_tribools_ok_for_inlining_p (
11602 caller_opts->x_flag_omit_leaf_frame_pointer,
11603 callee_opts->x_flag_omit_leaf_frame_pointer,
11604 2, 1))
11605 return false;
11606
11607 /* If the callee has specific tuning overrides, respect them. */
11608 if (callee_opts->x_aarch64_override_tune_string != NULL
11609 && caller_opts->x_aarch64_override_tune_string == NULL)
11610 return false;
11611
11612 /* If the user specified tuning override strings for the
11613 caller and callee and they don't match up, reject inlining.
11614 We just do a string compare here, we don't analyze the meaning
11615 of the string, as it would be too costly for little gain. */
11616 if (callee_opts->x_aarch64_override_tune_string
11617 && caller_opts->x_aarch64_override_tune_string
11618 && (strcmp (callee_opts->x_aarch64_override_tune_string,
11619 caller_opts->x_aarch64_override_tune_string) != 0))
11620 return false;
11621
11622 return true;
11623 }
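
/* An illustrative sketch (guarded out of the build; the names and the chosen
   extension are hypothetical) of the ISA-subset rule enforced above: a callee
   that enables an extra extension cannot be inlined into a caller that lacks
   it, while a caller with the same flags can inline it freely.  */
#if 0
__attribute__ ((target ("+crc")))
static inline unsigned callee_uses_crc (unsigned x) { return x; }

static unsigned
plain_caller (unsigned x)
{
  return callee_uses_crc (x);	/* Rejected: callee flags not a subset.  */
}

__attribute__ ((target ("+crc")))
static unsigned
crc_caller (unsigned x)
{
  return callee_uses_crc (x);	/* Allowed: the ISA flags match.  */
}
#endif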
11624
11625 /* Return true if SYMBOL_REF X binds locally. */
11626
11627 static bool
11628 aarch64_symbol_binds_local_p (const_rtx x)
11629 {
11630 return (SYMBOL_REF_DECL (x)
11631 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
11632 : SYMBOL_REF_LOCAL_P (x));
11633 }
11634
11635 /* Return true if SYMBOL_REF X is thread local */
11636 static bool
11637 aarch64_tls_symbol_p (rtx x)
11638 {
11639 if (! TARGET_HAVE_TLS)
11640 return false;
11641
11642 if (GET_CODE (x) != SYMBOL_REF)
11643 return false;
11644
11645 return SYMBOL_REF_TLS_MODEL (x) != 0;
11646 }
11647
11648 /* Classify a TLS symbol into one of the TLS kinds. */
11649 enum aarch64_symbol_type
11650 aarch64_classify_tls_symbol (rtx x)
11651 {
11652 enum tls_model tls_kind = tls_symbolic_operand_type (x);
11653
11654 switch (tls_kind)
11655 {
11656 case TLS_MODEL_GLOBAL_DYNAMIC:
11657 case TLS_MODEL_LOCAL_DYNAMIC:
11658 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
11659
11660 case TLS_MODEL_INITIAL_EXEC:
11661 switch (aarch64_cmodel)
11662 {
11663 case AARCH64_CMODEL_TINY:
11664 case AARCH64_CMODEL_TINY_PIC:
11665 return SYMBOL_TINY_TLSIE;
11666 default:
11667 return SYMBOL_SMALL_TLSIE;
11668 }
11669
11670 case TLS_MODEL_LOCAL_EXEC:
11671 if (aarch64_tls_size == 12)
11672 return SYMBOL_TLSLE12;
11673 else if (aarch64_tls_size == 24)
11674 return SYMBOL_TLSLE24;
11675 else if (aarch64_tls_size == 32)
11676 return SYMBOL_TLSLE32;
11677 else if (aarch64_tls_size == 48)
11678 return SYMBOL_TLSLE48;
11679 else
11680 gcc_unreachable ();
11681
11682 case TLS_MODEL_EMULATED:
11683 case TLS_MODEL_NONE:
11684 return SYMBOL_FORCE_TO_MEM;
11685
11686 default:
11687 gcc_unreachable ();
11688 }
11689 }
11690
11691 /* Return the correct method for accessing X + OFFSET, where X is either
11692 a SYMBOL_REF or LABEL_REF. */
11693
11694 enum aarch64_symbol_type
11695 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
11696 {
11697 if (GET_CODE (x) == LABEL_REF)
11698 {
11699 switch (aarch64_cmodel)
11700 {
11701 case AARCH64_CMODEL_LARGE:
11702 return SYMBOL_FORCE_TO_MEM;
11703
11704 case AARCH64_CMODEL_TINY_PIC:
11705 case AARCH64_CMODEL_TINY:
11706 return SYMBOL_TINY_ABSOLUTE;
11707
11708 case AARCH64_CMODEL_SMALL_SPIC:
11709 case AARCH64_CMODEL_SMALL_PIC:
11710 case AARCH64_CMODEL_SMALL:
11711 return SYMBOL_SMALL_ABSOLUTE;
11712
11713 default:
11714 gcc_unreachable ();
11715 }
11716 }
11717
11718 if (GET_CODE (x) == SYMBOL_REF)
11719 {
11720 if (aarch64_tls_symbol_p (x))
11721 return aarch64_classify_tls_symbol (x);
11722
11723 switch (aarch64_cmodel)
11724 {
11725 case AARCH64_CMODEL_TINY:
11726 /* When we retrieve symbol + offset address, we have to make sure
11727 the offset does not cause overflow of the final address. But
11728 we have no way of knowing the address of symbol at compile time
11729 so we can't accurately say if the distance between the PC and
11730 symbol + offset is outside the addressable range of +/-1M in the
11731 TINY code model. So we rely on images not being greater than
11732 1M and cap the offset at 1M; anything beyond 1M will have to
11733 be loaded using an alternative mechanism. Furthermore, if the
11734 symbol is a weak reference to something that isn't known to
11735 resolve to a symbol in this module, then force to memory. */
11736 if ((SYMBOL_REF_WEAK (x)
11737 && !aarch64_symbol_binds_local_p (x))
11738 || !IN_RANGE (offset, -1048575, 1048575))
11739 return SYMBOL_FORCE_TO_MEM;
11740 return SYMBOL_TINY_ABSOLUTE;
11741
11742 case AARCH64_CMODEL_SMALL:
11743 /* Same reasoning as the tiny code model, but the offset cap here is
11744 4G. */
11745 if ((SYMBOL_REF_WEAK (x)
11746 && !aarch64_symbol_binds_local_p (x))
11747 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
11748 HOST_WIDE_INT_C (4294967264)))
11749 return SYMBOL_FORCE_TO_MEM;
11750 return SYMBOL_SMALL_ABSOLUTE;
11751
11752 case AARCH64_CMODEL_TINY_PIC:
11753 if (!aarch64_symbol_binds_local_p (x))
11754 return SYMBOL_TINY_GOT;
11755 return SYMBOL_TINY_ABSOLUTE;
11756
11757 case AARCH64_CMODEL_SMALL_SPIC:
11758 case AARCH64_CMODEL_SMALL_PIC:
11759 if (!aarch64_symbol_binds_local_p (x))
11760 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
11761 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
11762 return SYMBOL_SMALL_ABSOLUTE;
11763
11764 case AARCH64_CMODEL_LARGE:
11765 /* This is alright even in PIC code as the constant
11766 pool reference is always PC relative and within
11767 the same translation unit. */
11768 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
11769 return SYMBOL_SMALL_ABSOLUTE;
11770 else
11771 return SYMBOL_FORCE_TO_MEM;
11772
11773 default:
11774 gcc_unreachable ();
11775 }
11776 }
11777
11778 /* By default push everything into the constant pool. */
11779 return SYMBOL_FORCE_TO_MEM;
11780 }
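
/* A brief worked example of the offset handling above, under the small
   code model: for "extern char arr[];" the address "arr + 100" keeps
   SYMBOL_SMALL_ABSOLUTE, whereas an offset outside roughly +/-4G, or any
   offset from a weak symbol that may not bind locally, is classified as
   SYMBOL_FORCE_TO_MEM and materialised via the constant pool.  */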
11781
11782 bool
11783 aarch64_constant_address_p (rtx x)
11784 {
11785 return (CONSTANT_P (x) && memory_address_p (DImode, x));
11786 }
11787
11788 bool
11789 aarch64_legitimate_pic_operand_p (rtx x)
11790 {
11791 if (GET_CODE (x) == SYMBOL_REF
11792 || (GET_CODE (x) == CONST
11793 && GET_CODE (XEXP (x, 0)) == PLUS
11794 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
11795 return false;
11796
11797 return true;
11798 }
11799
11800 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
11801 that should be rematerialized rather than spilled. */
11802
11803 static bool
11804 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
11805 {
11806 /* Support CSE and rematerialization of common constants. */
11807 if (CONST_INT_P (x)
11808 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
11809 || GET_CODE (x) == CONST_VECTOR)
11810 return true;
11811
11812 /* Do not allow vector struct mode constants for Advanced SIMD.
11813 We could support 0 and -1 easily, but they need support in
11814 aarch64-simd.md. */
11815 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11816 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
11817 return false;
11818
11819 /* Only accept variable-length vector constants if they can be
11820 handled directly.
11821
11822 ??? It would be possible to handle rematerialization of other
11823 constants via secondary reloads. */
11824 if (vec_flags & VEC_ANY_SVE)
11825 return aarch64_simd_valid_immediate (x, NULL);
11826
11827 if (GET_CODE (x) == HIGH)
11828 x = XEXP (x, 0);
11829
11830 /* Accept polynomial constants that can be calculated by using the
11831 destination of a move as the sole temporary. Constants that
11832 require a second temporary cannot be rematerialized (they can't be
11833 forced to memory and also aren't legitimate constants). */
11834 poly_int64 offset;
11835 if (poly_int_rtx_p (x, &offset))
11836 return aarch64_offset_temporaries (false, offset) <= 1;
11837
11838 /* If an offset is being added to something else, we need to allow the
11839 base to be moved into the destination register, meaning that there
11840 are no free temporaries for the offset. */
11841 x = strip_offset (x, &offset);
11842 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
11843 return false;
11844
11845 /* Do not allow const (plus (anchor_symbol, const_int)). */
11846 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
11847 return false;
11848
11849 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
11850 so spilling them is better than rematerialization. */
11851 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
11852 return true;
11853
11854 /* Label references are always constant. */
11855 if (GET_CODE (x) == LABEL_REF)
11856 return true;
11857
11858 return false;
11859 }
11860
11861 rtx
11862 aarch64_load_tp (rtx target)
11863 {
11864 if (!target
11865 || GET_MODE (target) != Pmode
11866 || !register_operand (target, Pmode))
11867 target = gen_reg_rtx (Pmode);
11868
11869 /* Can return in any reg. */
11870 emit_insn (gen_aarch64_load_tp_hard (target));
11871 return target;
11872 }
11873
11874 /* On AAPCS systems, this is the "struct __va_list". */
11875 static GTY(()) tree va_list_type;
11876
11877 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
11878 Return the type to use as __builtin_va_list.
11879
11880 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
11881
11882 struct __va_list
11883 {
11884 void *__stack;
11885 void *__gr_top;
11886 void *__vr_top;
11887 int __gr_offs;
11888 int __vr_offs;
11889 }; */
11890
11891 static tree
11892 aarch64_build_builtin_va_list (void)
11893 {
11894 tree va_list_name;
11895 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
11896
11897 /* Create the type. */
11898 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
11899 /* Give it the required name. */
11900 va_list_name = build_decl (BUILTINS_LOCATION,
11901 TYPE_DECL,
11902 get_identifier ("__va_list"),
11903 va_list_type);
11904 DECL_ARTIFICIAL (va_list_name) = 1;
11905 TYPE_NAME (va_list_type) = va_list_name;
11906 TYPE_STUB_DECL (va_list_type) = va_list_name;
11907
11908 /* Create the fields. */
11909 f_stack = build_decl (BUILTINS_LOCATION,
11910 FIELD_DECL, get_identifier ("__stack"),
11911 ptr_type_node);
11912 f_grtop = build_decl (BUILTINS_LOCATION,
11913 FIELD_DECL, get_identifier ("__gr_top"),
11914 ptr_type_node);
11915 f_vrtop = build_decl (BUILTINS_LOCATION,
11916 FIELD_DECL, get_identifier ("__vr_top"),
11917 ptr_type_node);
11918 f_groff = build_decl (BUILTINS_LOCATION,
11919 FIELD_DECL, get_identifier ("__gr_offs"),
11920 integer_type_node);
11921 f_vroff = build_decl (BUILTINS_LOCATION,
11922 FIELD_DECL, get_identifier ("__vr_offs"),
11923 integer_type_node);
11924
11925 /* Tell tree-stdarg pass about our internal offset fields.
11926 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
11927 purposes, to identify whether the code is updating the va_list internal
11928 offset fields in an irregular way. */
11929 va_list_gpr_counter_field = f_groff;
11930 va_list_fpr_counter_field = f_vroff;
11931
11932 DECL_ARTIFICIAL (f_stack) = 1;
11933 DECL_ARTIFICIAL (f_grtop) = 1;
11934 DECL_ARTIFICIAL (f_vrtop) = 1;
11935 DECL_ARTIFICIAL (f_groff) = 1;
11936 DECL_ARTIFICIAL (f_vroff) = 1;
11937
11938 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
11939 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
11940 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
11941 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
11942 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
11943
11944 TYPE_FIELDS (va_list_type) = f_stack;
11945 DECL_CHAIN (f_stack) = f_grtop;
11946 DECL_CHAIN (f_grtop) = f_vrtop;
11947 DECL_CHAIN (f_vrtop) = f_groff;
11948 DECL_CHAIN (f_groff) = f_vroff;
11949
11950 /* Compute its layout. */
11951 layout_type (va_list_type);
11952
11953 return va_list_type;
11954 }
11955
11956 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
11957 static void
11958 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
11959 {
11960 const CUMULATIVE_ARGS *cum;
11961 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
11962 tree stack, grtop, vrtop, groff, vroff;
11963 tree t;
11964 int gr_save_area_size = cfun->va_list_gpr_size;
11965 int vr_save_area_size = cfun->va_list_fpr_size;
11966 int vr_offset;
11967
11968 cum = &crtl->args.info;
11969 if (cfun->va_list_gpr_size)
11970 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
11971 cfun->va_list_gpr_size);
11972 if (cfun->va_list_fpr_size)
11973 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
11974 * UNITS_PER_VREG, cfun->va_list_fpr_size);
11975
11976 if (!TARGET_FLOAT)
11977 {
11978 gcc_assert (cum->aapcs_nvrn == 0);
11979 vr_save_area_size = 0;
11980 }
11981
11982 f_stack = TYPE_FIELDS (va_list_type_node);
11983 f_grtop = DECL_CHAIN (f_stack);
11984 f_vrtop = DECL_CHAIN (f_grtop);
11985 f_groff = DECL_CHAIN (f_vrtop);
11986 f_vroff = DECL_CHAIN (f_groff);
11987
11988 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
11989 NULL_TREE);
11990 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
11991 NULL_TREE);
11992 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
11993 NULL_TREE);
11994 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
11995 NULL_TREE);
11996 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
11997 NULL_TREE);
11998
11999 /* Emit code to initialize STACK, which points to the next varargs stack
12000 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
12001 by named arguments. STACK is 8-byte aligned. */
12002 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
12003 if (cum->aapcs_stack_size > 0)
12004 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
12005 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
12006 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12007
12008 /* Emit code to initialize GRTOP, the top of the GR save area.
12009 virtual_incoming_args_rtx should have been 16-byte aligned. */
12010 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
12011 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
12012 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12013
12014 /* Emit code to initialize VRTOP, the top of the VR save area.
12015 This address is gr_save_area_bytes below GRTOP, rounded
12016 down to the next 16-byte boundary. */
12017 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
12018 vr_offset = ROUND_UP (gr_save_area_size,
12019 STACK_BOUNDARY / BITS_PER_UNIT);
12020
12021 if (vr_offset)
12022 t = fold_build_pointer_plus_hwi (t, -vr_offset);
12023 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
12024 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12025
12026 /* Emit code to initialize GROFF, the offset from GRTOP of the
12027 next GPR argument. */
12028 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
12029 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
12030 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12031
12032 /* Likewise emit code to initialize VROFF, the offset from VRTOP
12033 of the next VR argument. */
12034 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
12035 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
12036 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12037 }
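
/* A sketch of the va_list contents established above, with higher addresses
   towards the top (sizes are the rounded save-area sizes computed in this
   function):

       __stack  -> first anonymous argument passed on the stack
       __gr_top =  virtual_incoming_args_rtx
                   (the GR save area of gr_save_area_size bytes ends here)
       __vr_top =  __gr_top - ROUND_UP (gr_save_area_size,
                                        STACK_BOUNDARY / BITS_PER_UNIT)
                   (the VR save area of vr_save_area_size bytes ends here)

   __gr_offs and __vr_offs start at minus the respective save-area sizes and
   count up towards zero as anonymous register arguments are consumed.  */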
12038
12039 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
12040
12041 static tree
12042 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
12043 gimple_seq *post_p ATTRIBUTE_UNUSED)
12044 {
12045 tree addr;
12046 bool indirect_p;
12047 bool is_ha; /* is HFA or HVA. */
12048 bool dw_align; /* double-word align. */
12049 machine_mode ag_mode = VOIDmode;
12050 int nregs;
12051 machine_mode mode;
12052
12053 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12054 tree stack, f_top, f_off, off, arg, roundup, on_stack;
12055 HOST_WIDE_INT size, rsize, adjust, align;
12056 tree t, u, cond1, cond2;
12057
12058 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
12059 if (indirect_p)
12060 type = build_pointer_type (type);
12061
12062 mode = TYPE_MODE (type);
12063
12064 f_stack = TYPE_FIELDS (va_list_type_node);
12065 f_grtop = DECL_CHAIN (f_stack);
12066 f_vrtop = DECL_CHAIN (f_grtop);
12067 f_groff = DECL_CHAIN (f_vrtop);
12068 f_vroff = DECL_CHAIN (f_groff);
12069
12070 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
12071 f_stack, NULL_TREE);
12072 size = int_size_in_bytes (type);
12073 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
12074
12075 dw_align = false;
12076 adjust = 0;
12077 if (aarch64_vfp_is_call_or_return_candidate (mode,
12078 type,
12079 &ag_mode,
12080 &nregs,
12081 &is_ha))
12082 {
12083 /* No frontends can create types with variable-sized modes, so we
12084 shouldn't be asked to pass or return them. */
12085 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
12086
12087 /* TYPE passed in fp/simd registers. */
12088 if (!TARGET_FLOAT)
12089 aarch64_err_no_fpadvsimd (mode);
12090
12091 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
12092 unshare_expr (valist), f_vrtop, NULL_TREE);
12093 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
12094 unshare_expr (valist), f_vroff, NULL_TREE);
12095
12096 rsize = nregs * UNITS_PER_VREG;
12097
12098 if (is_ha)
12099 {
12100 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
12101 adjust = UNITS_PER_VREG - ag_size;
12102 }
12103 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12104 && size < UNITS_PER_VREG)
12105 {
12106 adjust = UNITS_PER_VREG - size;
12107 }
12108 }
12109 else
12110 {
12111 /* TYPE passed in general registers. */
12112 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
12113 unshare_expr (valist), f_grtop, NULL_TREE);
12114 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
12115 unshare_expr (valist), f_groff, NULL_TREE);
12116 rsize = ROUND_UP (size, UNITS_PER_WORD);
12117 nregs = rsize / UNITS_PER_WORD;
12118
12119 if (align > 8)
12120 dw_align = true;
12121
12122 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12123 && size < UNITS_PER_WORD)
12124 {
12125 adjust = UNITS_PER_WORD - size;
12126 }
12127 }
12128
12129 /* Get a local temporary for the field value. */
12130 off = get_initialized_tmp_var (f_off, pre_p, NULL);
12131
12132 /* Emit code to branch if off >= 0. */
12133 t = build2 (GE_EXPR, boolean_type_node, off,
12134 build_int_cst (TREE_TYPE (off), 0));
12135 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
12136
12137 if (dw_align)
12138 {
12139 /* Emit: offs = (offs + 15) & -16. */
12140 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12141 build_int_cst (TREE_TYPE (off), 15));
12142 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
12143 build_int_cst (TREE_TYPE (off), -16));
12144 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
12145 }
12146 else
12147 roundup = NULL;
12148
12149 /* Update ap.__[g|v]r_offs */
12150 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12151 build_int_cst (TREE_TYPE (off), rsize));
12152 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
12153
12154 /* String up. */
12155 if (roundup)
12156 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12157
12158 /* [cond2] if (ap.__[g|v]r_offs > 0) */
12159 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
12160 build_int_cst (TREE_TYPE (f_off), 0));
12161 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
12162
12163 /* String up: make sure the assignment happens before the use. */
12164 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
12165 COND_EXPR_ELSE (cond1) = t;
12166
12167 /* Prepare the trees handling the argument that is passed on the stack;
12168 the top-level node will be stored in ON_STACK. */
12169 arg = get_initialized_tmp_var (stack, pre_p, NULL);
12170 if (align > 8)
12171 {
12172 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
12173 t = fold_build_pointer_plus_hwi (arg, 15);
12174 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12175 build_int_cst (TREE_TYPE (t), -16));
12176 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
12177 }
12178 else
12179 roundup = NULL;
12180 /* Advance ap.__stack */
12181 t = fold_build_pointer_plus_hwi (arg, size + 7);
12182 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12183 build_int_cst (TREE_TYPE (t), -8));
12184 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
12185 /* String up roundup and advance. */
12186 if (roundup)
12187 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12188 /* String up with arg */
12189 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
12190 /* Big-endianness related address adjustment. */
12191 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12192 && size < UNITS_PER_WORD)
12193 {
12194 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
12195 size_int (UNITS_PER_WORD - size));
12196 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
12197 }
12198
12199 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
12200 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
12201
12202 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
12203 t = off;
12204 if (adjust)
12205 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
12206 build_int_cst (TREE_TYPE (off), adjust));
12207
12208 t = fold_convert (sizetype, t);
12209 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
12210
12211 if (is_ha)
12212 {
12213 /* type ha; // treat as "struct {ftype field[n];}"
12214 ... [computing offs]
12215 for (i = 0; i < nregs; ++i, offs += 16)
12216 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12217 return ha; */
12218 int i;
12219 tree tmp_ha, field_t, field_ptr_t;
12220
12221 /* Declare a local variable. */
12222 tmp_ha = create_tmp_var_raw (type, "ha");
12223 gimple_add_tmp_var (tmp_ha);
12224
12225 /* Establish the base type. */
12226 switch (ag_mode)
12227 {
12228 case E_SFmode:
12229 field_t = float_type_node;
12230 field_ptr_t = float_ptr_type_node;
12231 break;
12232 case E_DFmode:
12233 field_t = double_type_node;
12234 field_ptr_t = double_ptr_type_node;
12235 break;
12236 case E_TFmode:
12237 field_t = long_double_type_node;
12238 field_ptr_t = long_double_ptr_type_node;
12239 break;
12240 case E_HFmode:
12241 field_t = aarch64_fp16_type_node;
12242 field_ptr_t = aarch64_fp16_ptr_type_node;
12243 break;
12244 case E_V2SImode:
12245 case E_V4SImode:
12246 {
12247 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
12248 field_t = build_vector_type_for_mode (innertype, ag_mode);
12249 field_ptr_t = build_pointer_type (field_t);
12250 }
12251 break;
12252 default:
12253 gcc_assert (0);
12254 }
12255
12256 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
12257 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
12258 addr = t;
12259 t = fold_convert (field_ptr_t, addr);
12260 t = build2 (MODIFY_EXPR, field_t,
12261 build1 (INDIRECT_REF, field_t, tmp_ha),
12262 build1 (INDIRECT_REF, field_t, t));
12263
12264 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
12265 for (i = 1; i < nregs; ++i)
12266 {
12267 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
12268 u = fold_convert (field_ptr_t, addr);
12269 u = build2 (MODIFY_EXPR, field_t,
12270 build2 (MEM_REF, field_t, tmp_ha,
12271 build_int_cst (field_ptr_t,
12272 (i *
12273 int_size_in_bytes (field_t)))),
12274 build1 (INDIRECT_REF, field_t, u));
12275 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
12276 }
12277
12278 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
12279 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
12280 }
12281
12282 COND_EXPR_ELSE (cond2) = t;
12283 addr = fold_convert (build_pointer_type (type), cond1);
12284 addr = build_va_arg_indirect_ref (addr);
12285
12286 if (indirect_p)
12287 addr = build_va_arg_indirect_ref (addr);
12288
12289 return addr;
12290 }
12291
12292 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
12293
12294 static void
12295 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
12296 tree type, int *pretend_size ATTRIBUTE_UNUSED,
12297 int no_rtl)
12298 {
12299 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
12300 CUMULATIVE_ARGS local_cum;
12301 int gr_saved = cfun->va_list_gpr_size;
12302 int vr_saved = cfun->va_list_fpr_size;
12303
12304 /* The caller has advanced CUM up to, but not beyond, the last named
12305 argument. Advance a local copy of CUM past the last "real" named
12306 argument, to find out how many registers are left over. */
12307 local_cum = *cum;
12308 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
12309
12310 /* Find out how many registers we need to save.
12311 Honour the tree-stdarg analysis results. */
12312 if (cfun->va_list_gpr_size)
12313 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
12314 cfun->va_list_gpr_size / UNITS_PER_WORD);
12315 if (cfun->va_list_fpr_size)
12316 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
12317 cfun->va_list_fpr_size / UNITS_PER_VREG);
12318
12319 if (!TARGET_FLOAT)
12320 {
12321 gcc_assert (local_cum.aapcs_nvrn == 0);
12322 vr_saved = 0;
12323 }
12324
12325 if (!no_rtl)
12326 {
12327 if (gr_saved > 0)
12328 {
12329 rtx ptr, mem;
12330
12331 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
12332 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
12333 - gr_saved * UNITS_PER_WORD);
12334 mem = gen_frame_mem (BLKmode, ptr);
12335 set_mem_alias_set (mem, get_varargs_alias_set ());
12336
12337 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
12338 mem, gr_saved);
12339 }
12340 if (vr_saved > 0)
12341 {
12342 /* We can't use move_block_from_reg, because it will use
12343 the wrong mode, storing D regs only. */
12344 machine_mode mode = TImode;
12345 int off, i, vr_start;
12346
12347 /* Set OFF to the offset from virtual_incoming_args_rtx of
12348 the first vector register. The VR save area lies below
12349 the GR one, and is aligned to 16 bytes. */
12350 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
12351 STACK_BOUNDARY / BITS_PER_UNIT);
12352 off -= vr_saved * UNITS_PER_VREG;
12353
12354 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
12355 for (i = 0; i < vr_saved; ++i)
12356 {
12357 rtx ptr, mem;
12358
12359 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
12360 mem = gen_frame_mem (mode, ptr);
12361 set_mem_alias_set (mem, get_varargs_alias_set ());
12362 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
12363 off += UNITS_PER_VREG;
12364 }
12365 }
12366 }
12367
12368 /* We don't save the size into *PRETEND_SIZE because we want to avoid
12369 any complication of having crtl->args.pretend_args_size changed. */
12370 cfun->machine->frame.saved_varargs_size
12371 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
12372 STACK_BOUNDARY / BITS_PER_UNIT)
12373 + vr_saved * UNITS_PER_VREG);
12374 }
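
/* A worked example of the sizing above: for a hypothetical
   "int f (int a, double b, ...)" the named arguments consume one core
   register and one vector register, so at most gr_saved = 7 and
   vr_saved = 7 registers are dumped below virtual_incoming_args_rtx
   (the tree-stdarg results honoured above may shrink either number).  */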
12375
12376 static void
12377 aarch64_conditional_register_usage (void)
12378 {
12379 int i;
12380 if (!TARGET_FLOAT)
12381 {
12382 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
12383 {
12384 fixed_regs[i] = 1;
12385 call_used_regs[i] = 1;
12386 }
12387 }
12388 if (!TARGET_SVE)
12389 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
12390 {
12391 fixed_regs[i] = 1;
12392 call_used_regs[i] = 1;
12393 }
12394
12395 /* When tracking speculation, we need a couple of call-clobbered registers
12396 to track the speculation state. It would be nice to just use
12397 IP0 and IP1, but currently there are numerous places that just
12398 assume these registers are free for other uses (eg pointer
12399 authentication). */
12400 if (aarch64_track_speculation)
12401 {
12402 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
12403 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
12404 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
12405 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
12406 }
12407 }
12408
12409 /* Walk down the type tree of TYPE counting consecutive base elements.
12410 If *MODEP is VOIDmode, then set it to the first valid floating point
12411 type. If a non-floating point type is found, or if a floating point
12412 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12413 otherwise return the count in the sub-tree. */
12414 static int
12415 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
12416 {
12417 machine_mode mode;
12418 HOST_WIDE_INT size;
12419
12420 switch (TREE_CODE (type))
12421 {
12422 case REAL_TYPE:
12423 mode = TYPE_MODE (type);
12424 if (mode != DFmode && mode != SFmode
12425 && mode != TFmode && mode != HFmode)
12426 return -1;
12427
12428 if (*modep == VOIDmode)
12429 *modep = mode;
12430
12431 if (*modep == mode)
12432 return 1;
12433
12434 break;
12435
12436 case COMPLEX_TYPE:
12437 mode = TYPE_MODE (TREE_TYPE (type));
12438 if (mode != DFmode && mode != SFmode
12439 && mode != TFmode && mode != HFmode)
12440 return -1;
12441
12442 if (*modep == VOIDmode)
12443 *modep = mode;
12444
12445 if (*modep == mode)
12446 return 2;
12447
12448 break;
12449
12450 case VECTOR_TYPE:
12451 /* Use V2SImode and V4SImode as representatives of all 64-bit
12452 and 128-bit vector types. */
12453 size = int_size_in_bytes (type);
12454 switch (size)
12455 {
12456 case 8:
12457 mode = V2SImode;
12458 break;
12459 case 16:
12460 mode = V4SImode;
12461 break;
12462 default:
12463 return -1;
12464 }
12465
12466 if (*modep == VOIDmode)
12467 *modep = mode;
12468
12469 /* Vector modes are considered to be opaque: two vectors are
12470 equivalent for the purposes of being homogeneous aggregates
12471 if they are the same size. */
12472 if (*modep == mode)
12473 return 1;
12474
12475 break;
12476
12477 case ARRAY_TYPE:
12478 {
12479 int count;
12480 tree index = TYPE_DOMAIN (type);
12481
12482 /* Can't handle incomplete types or sizes that are not
12483 fixed. */
12484 if (!COMPLETE_TYPE_P (type)
12485 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12486 return -1;
12487
12488 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
12489 if (count == -1
12490 || !index
12491 || !TYPE_MAX_VALUE (index)
12492 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
12493 || !TYPE_MIN_VALUE (index)
12494 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
12495 || count < 0)
12496 return -1;
12497
12498 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
12499 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
12500
12501 /* There must be no padding. */
12502 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12503 count * GET_MODE_BITSIZE (*modep)))
12504 return -1;
12505
12506 return count;
12507 }
12508
12509 case RECORD_TYPE:
12510 {
12511 int count = 0;
12512 int sub_count;
12513 tree field;
12514
12515 /* Can't handle incomplete types or sizes that are not
12516 fixed. */
12517 if (!COMPLETE_TYPE_P (type)
12518 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12519 return -1;
12520
12521 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12522 {
12523 if (TREE_CODE (field) != FIELD_DECL)
12524 continue;
12525
12526 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12527 if (sub_count < 0)
12528 return -1;
12529 count += sub_count;
12530 }
12531
12532 /* There must be no padding. */
12533 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12534 count * GET_MODE_BITSIZE (*modep)))
12535 return -1;
12536
12537 return count;
12538 }
12539
12540 case UNION_TYPE:
12541 case QUAL_UNION_TYPE:
12542 {
12543 /* These aren't very interesting except in a degenerate case. */
12544 int count = 0;
12545 int sub_count;
12546 tree field;
12547
12548 /* Can't handle incomplete types or sizes that are not
12549 fixed. */
12550 if (!COMPLETE_TYPE_P (type)
12551 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12552 return -1;
12553
12554 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12555 {
12556 if (TREE_CODE (field) != FIELD_DECL)
12557 continue;
12558
12559 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12560 if (sub_count < 0)
12561 return -1;
12562 count = count > sub_count ? count : sub_count;
12563 }
12564
12565 /* There must be no padding. */
12566 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12567 count * GET_MODE_BITSIZE (*modep)))
12568 return -1;
12569
12570 return count;
12571 }
12572
12573 default:
12574 break;
12575 }
12576
12577 return -1;
12578 }
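
/* Two worked examples of the walk above (the C types are hypothetical):

     struct { double x, y, z; }       -> *modep = DFmode, returns 3
     struct { float re; double im; }  -> returns -1 (element modes differ)

   A positive count no larger than HA_MAX_NUM_FLDS, with a single
   floating-point or vector element mode, is what lets the caller below
   treat the type as a homogeneous aggregate.  */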
12579
12580 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12581 type as described in AAPCS64 \S 4.1.2.
12582
12583 See the comment above aarch64_composite_type_p for the notes on MODE. */
12584
12585 static bool
12586 aarch64_short_vector_p (const_tree type,
12587 machine_mode mode)
12588 {
12589 poly_int64 size = -1;
12590
12591 if (type && TREE_CODE (type) == VECTOR_TYPE)
12592 size = int_size_in_bytes (type);
12593 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
12594 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
12595 size = GET_MODE_SIZE (mode);
12596
12597 return known_eq (size, 8) || known_eq (size, 16);
12598 }
12599
12600 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12601 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
12602 array types. The C99 floating-point complex types are also considered
12603 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
12604 types, which are GCC extensions and out of the scope of AAPCS64, are
12605 treated as composite types here as well.
12606
12607 Note that MODE itself is not sufficient in determining whether a type
12608 is such a composite type or not. This is because
12609 stor-layout.c:compute_record_mode may have already changed the MODE
12610 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
12611 structure with only one field may have its MODE set to the mode of the
12612 field. Also an integer mode whose size matches the size of the
12613 RECORD_TYPE type may be used to substitute the original mode
12614 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
12615 solely relied on. */
12616
12617 static bool
12618 aarch64_composite_type_p (const_tree type,
12619 machine_mode mode)
12620 {
12621 if (aarch64_short_vector_p (type, mode))
12622 return false;
12623
12624 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
12625 return true;
12626
12627 if (mode == BLKmode
12628 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
12629 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
12630 return true;
12631
12632 return false;
12633 }
12634
12635 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12636 shall be passed or returned in simd/fp register(s) (providing these
12637 parameter passing registers are available).
12638
12639 Upon successful return, *COUNT returns the number of needed registers,
12640 *BASE_MODE returns the mode of the individual register and when IS_HAF
12641 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
12642 floating-point aggregate or a homogeneous short-vector aggregate. */
12643
12644 static bool
12645 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
12646 const_tree type,
12647 machine_mode *base_mode,
12648 int *count,
12649 bool *is_ha)
12650 {
12651 machine_mode new_mode = VOIDmode;
12652 bool composite_p = aarch64_composite_type_p (type, mode);
12653
12654 if (is_ha != NULL) *is_ha = false;
12655
12656 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
12657 || aarch64_short_vector_p (type, mode))
12658 {
12659 *count = 1;
12660 new_mode = mode;
12661 }
12662 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
12663 {
12664 if (is_ha != NULL) *is_ha = true;
12665 *count = 2;
12666 new_mode = GET_MODE_INNER (mode);
12667 }
12668 else if (type && composite_p)
12669 {
12670 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
12671
12672 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
12673 {
12674 if (is_ha != NULL) *is_ha = true;
12675 *count = ag_count;
12676 }
12677 else
12678 return false;
12679 }
12680 else
12681 return false;
12682
12683 *base_mode = new_mode;
12684 return true;
12685 }
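/* Worked example (illustrative): for a C struct such as
     struct rgb { float r, g, b; };
   the struct is a homogeneous floating-point aggregate, so on success
   *BASE_MODE is SFmode, *COUNT is 3 and, if requested, *IS_HA is true.  */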
12686
12687 /* Implement TARGET_STRUCT_VALUE_RTX. */
12688
12689 static rtx
12690 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
12691 int incoming ATTRIBUTE_UNUSED)
12692 {
12693 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
12694 }
12695
12696 /* Implements target hook vector_mode_supported_p. */
12697 static bool
12698 aarch64_vector_mode_supported_p (machine_mode mode)
12699 {
12700 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12701 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
12702 }
12703
12704 /* Return appropriate SIMD container
12705 for MODE within a vector of WIDTH bits. */
12706 static machine_mode
12707 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
12708 {
12709 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
12710 switch (mode)
12711 {
12712 case E_DFmode:
12713 return VNx2DFmode;
12714 case E_SFmode:
12715 return VNx4SFmode;
12716 case E_HFmode:
12717 return VNx8HFmode;
12718 case E_DImode:
12719 return VNx2DImode;
12720 case E_SImode:
12721 return VNx4SImode;
12722 case E_HImode:
12723 return VNx8HImode;
12724 case E_QImode:
12725 return VNx16QImode;
12726 default:
12727 return word_mode;
12728 }
12729
12730 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
12731 if (TARGET_SIMD)
12732 {
12733 if (known_eq (width, 128))
12734 switch (mode)
12735 {
12736 case E_DFmode:
12737 return V2DFmode;
12738 case E_SFmode:
12739 return V4SFmode;
12740 case E_HFmode:
12741 return V8HFmode;
12742 case E_SImode:
12743 return V4SImode;
12744 case E_HImode:
12745 return V8HImode;
12746 case E_QImode:
12747 return V16QImode;
12748 case E_DImode:
12749 return V2DImode;
12750 default:
12751 break;
12752 }
12753 else
12754 switch (mode)
12755 {
12756 case E_SFmode:
12757 return V2SFmode;
12758 case E_HFmode:
12759 return V4HFmode;
12760 case E_SImode:
12761 return V2SImode;
12762 case E_HImode:
12763 return V4HImode;
12764 case E_QImode:
12765 return V8QImode;
12766 default:
12767 break;
12768 }
12769 }
12770 return word_mode;
12771 }
12772
12773 /* Return 128-bit container as the preferred SIMD mode for MODE. */
12774 static machine_mode
12775 aarch64_preferred_simd_mode (scalar_mode mode)
12776 {
12777 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
12778 return aarch64_simd_container_mode (mode, bits);
12779 }
12780
12781 /* Return a list of possible vector sizes for the vectorizer
12782 to iterate over. */
12783 static void
12784 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
12785 {
12786 if (TARGET_SVE)
12787 sizes->safe_push (BYTES_PER_SVE_VECTOR);
12788 sizes->safe_push (16);
12789 sizes->safe_push (8);
12790 }
12791
12792 /* Implement TARGET_MANGLE_TYPE. */
12793
12794 static const char *
12795 aarch64_mangle_type (const_tree type)
12796 {
12797 /* The AArch64 ABI documents say that "__va_list" has to be
12798      mangled as if it is in the "std" namespace. */
12799 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
12800 return "St9__va_list";
12801
12802 /* Half-precision float. */
12803 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
12804 return "Dh";
12805
12806 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
12807 builtin types. */
12808 if (TYPE_NAME (type) != NULL)
12809 return aarch64_mangle_builtin_type (type);
12810
12811 /* Use the default mangling. */
12812 return NULL;
12813 }
12814
12815 /* Find the first rtx_insn before insn that will generate an assembly
12816 instruction. */
12817
12818 static rtx_insn *
12819 aarch64_prev_real_insn (rtx_insn *insn)
12820 {
12821 if (!insn)
12822 return NULL;
12823
12824 do
12825 {
12826 insn = prev_real_insn (insn);
12827 }
12828 while (insn && recog_memoized (insn) < 0);
12829
12830 return insn;
12831 }
12832
12833 static bool
12834 is_madd_op (enum attr_type t1)
12835 {
12836 unsigned int i;
12837 /* A number of these may be AArch32 only. */
12838 enum attr_type mlatypes[] = {
12839 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
12840 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
12841      TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
12842 };
12843
12844 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
12845 {
12846 if (t1 == mlatypes[i])
12847 return true;
12848 }
12849
12850 return false;
12851 }
12852
12853 /* Check if there is a register dependency between a load and the insn
12854 for which we hold recog_data. */
12855
12856 static bool
12857 dep_between_memop_and_curr (rtx memop)
12858 {
12859 rtx load_reg;
12860 int opno;
12861
12862 gcc_assert (GET_CODE (memop) == SET);
12863
12864 if (!REG_P (SET_DEST (memop)))
12865 return false;
12866
12867 load_reg = SET_DEST (memop);
12868 for (opno = 1; opno < recog_data.n_operands; opno++)
12869 {
12870 rtx operand = recog_data.operand[opno];
12871 if (REG_P (operand)
12872 && reg_overlap_mentioned_p (load_reg, operand))
12873 return true;
12874
12875 }
12876 return false;
12877 }
12878
12879
12880 /* When working around the Cortex-A53 erratum 835769,
12881 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
12882 instruction and has a preceding memory instruction such that a NOP
12883 should be inserted between them. */
12884
12885 bool
12886 aarch64_madd_needs_nop (rtx_insn* insn)
12887 {
12888 enum attr_type attr_type;
12889 rtx_insn *prev;
12890 rtx body;
12891
12892 if (!TARGET_FIX_ERR_A53_835769)
12893 return false;
12894
12895 if (!INSN_P (insn) || recog_memoized (insn) < 0)
12896 return false;
12897
12898 attr_type = get_attr_type (insn);
12899 if (!is_madd_op (attr_type))
12900 return false;
12901
12902 prev = aarch64_prev_real_insn (insn);
12903 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
12904 Restore recog state to INSN to avoid state corruption. */
12905 extract_constrain_insn_cached (insn);
12906
12907 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
12908 return false;
12909
12910 body = single_set (prev);
12911
12912 /* If the previous insn is a memory op and there is no dependency between
12913 it and the DImode madd, emit a NOP between them. If body is NULL then we
12914 have a complex memory operation, probably a load/store pair.
12915 Be conservative for now and emit a NOP. */
12916 if (GET_MODE (recog_data.operand[0]) == DImode
12917 && (!body || !dep_between_memop_and_curr (body)))
12918 return true;
12919
12920 return false;
12921
12922 }
12923
12924
12925 /* Implement FINAL_PRESCAN_INSN. */
12926
12927 void
12928 aarch64_final_prescan_insn (rtx_insn *insn)
12929 {
12930 if (aarch64_madd_needs_nop (insn))
12931 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
12932 }
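/* Illustrative effect of the erratum workaround above: for code that loads
   a value and then performs a 64-bit multiply-accumulate, the emitted
   assembly might look roughly like this (registers chosen arbitrarily):

       ldr   x1, [x2]
       nop   // between mem op and mult-accumulate
       madd  x0, x3, x4, x5
*/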
12933
12934
12935 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
12936 instruction. */
12937
12938 bool
12939 aarch64_sve_index_immediate_p (rtx base_or_step)
12940 {
12941 return (CONST_INT_P (base_or_step)
12942 && IN_RANGE (INTVAL (base_or_step), -16, 15));
12943 }
12944
12945 /* Return true if X is a valid immediate for the SVE ADD and SUB
12946 instructions. Negate X first if NEGATE_P is true. */
12947
12948 bool
12949 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
12950 {
12951 rtx elt;
12952
12953 if (!const_vec_duplicate_p (x, &elt)
12954 || !CONST_INT_P (elt))
12955 return false;
12956
12957 HOST_WIDE_INT val = INTVAL (elt);
12958 if (negate_p)
12959 val = -val;
12960 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
12961
12962 if (val & 0xff)
12963 return IN_RANGE (val, 0, 0xff);
12964 return IN_RANGE (val, 0, 0xff00);
12965 }
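/* For example: a vector duplicate of 3, or of 0x1200 (low byte clear and
   no greater than 0xff00), is accepted above, whereas 0x101 is rejected
   because its low byte is nonzero and the value exceeds 0xff.  */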
12966
12967 /* Return true if X is a valid immediate operand for an SVE logical
12968 instruction such as AND. */
12969
12970 bool
12971 aarch64_sve_bitmask_immediate_p (rtx x)
12972 {
12973 rtx elt;
12974
12975 return (const_vec_duplicate_p (x, &elt)
12976 && CONST_INT_P (elt)
12977 && aarch64_bitmask_imm (INTVAL (elt),
12978 GET_MODE_INNER (GET_MODE (x))));
12979 }
12980
12981 /* Return true if X is a valid immediate for the SVE DUP and CPY
12982 instructions. */
12983
12984 bool
12985 aarch64_sve_dup_immediate_p (rtx x)
12986 {
12987 rtx elt;
12988
12989 if (!const_vec_duplicate_p (x, &elt)
12990 || !CONST_INT_P (elt))
12991 return false;
12992
12993 HOST_WIDE_INT val = INTVAL (elt);
12994 if (val & 0xff)
12995 return IN_RANGE (val, -0x80, 0x7f);
12996 return IN_RANGE (val, -0x8000, 0x7f00);
12997 }
12998
12999 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
13000 SIGNED_P says whether the operand is signed rather than unsigned. */
13001
13002 bool
13003 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
13004 {
13005 rtx elt;
13006
13007 return (const_vec_duplicate_p (x, &elt)
13008 && CONST_INT_P (elt)
13009 && (signed_p
13010 ? IN_RANGE (INTVAL (elt), -16, 15)
13011 : IN_RANGE (INTVAL (elt), 0, 127)));
13012 }
13013
13014 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
13015 instruction. Negate X first if NEGATE_P is true. */
13016
13017 bool
13018 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
13019 {
13020 rtx elt;
13021 REAL_VALUE_TYPE r;
13022
13023 if (!const_vec_duplicate_p (x, &elt)
13024 || GET_CODE (elt) != CONST_DOUBLE)
13025 return false;
13026
13027 r = *CONST_DOUBLE_REAL_VALUE (elt);
13028
13029 if (negate_p)
13030 r = real_value_negate (&r);
13031
13032 if (real_equal (&r, &dconst1))
13033 return true;
13034 if (real_equal (&r, &dconsthalf))
13035 return true;
13036 return false;
13037 }
13038
13039 /* Return true if X is a valid immediate operand for an SVE FMUL
13040 instruction. */
13041
13042 bool
13043 aarch64_sve_float_mul_immediate_p (rtx x)
13044 {
13045 rtx elt;
13046
13047 /* GCC will never generate a multiply with an immediate of 2, so there is no
13048 point testing for it (even though it is a valid constant). */
13049 return (const_vec_duplicate_p (x, &elt)
13050 && GET_CODE (elt) == CONST_DOUBLE
13051 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
13052 }
13053
13054 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13055 for the Advanced SIMD operation described by WHICH and INSN. If INFO
13056 is nonnull, use it to describe valid immediates. */
13057 static bool
13058 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
13059 simd_immediate_info *info,
13060 enum simd_immediate_check which,
13061 simd_immediate_info::insn_type insn)
13062 {
13063 /* Try a 4-byte immediate with LSL. */
13064 for (unsigned int shift = 0; shift < 32; shift += 8)
13065 if ((val32 & (0xff << shift)) == val32)
13066 {
13067 if (info)
13068 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13069 simd_immediate_info::LSL, shift);
13070 return true;
13071 }
13072
13073 /* Try a 2-byte immediate with LSL. */
13074 unsigned int imm16 = val32 & 0xffff;
13075 if (imm16 == (val32 >> 16))
13076 for (unsigned int shift = 0; shift < 16; shift += 8)
13077 if ((imm16 & (0xff << shift)) == imm16)
13078 {
13079 if (info)
13080 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
13081 simd_immediate_info::LSL, shift);
13082 return true;
13083 }
13084
13085 /* Try a 4-byte immediate with MSL, except for cases that MVN
13086 can handle. */
13087 if (which == AARCH64_CHECK_MOV)
13088 for (unsigned int shift = 8; shift < 24; shift += 8)
13089 {
13090 unsigned int low = (1 << shift) - 1;
13091 if (((val32 & (0xff << shift)) | low) == val32)
13092 {
13093 if (info)
13094 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13095 simd_immediate_info::MSL, shift);
13096 return true;
13097 }
13098 }
13099
13100 return false;
13101 }
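/* Worked examples (illustrative): VAL32 == 0x00ab0000 matches the 4-byte
   LSL case as byte 0xab shifted left by 16; for the MOV check,
   VAL32 == 0x0000abff matches the MSL case with shift 8, since
   (0xab << 8) | 0xff reproduces the value.  */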
13102
13103 /* Return true if replicating VAL64 is a valid immediate for the
13104 Advanced SIMD operation described by WHICH. If INFO is nonnull,
13105 use it to describe valid immediates. */
13106 static bool
13107 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
13108 simd_immediate_info *info,
13109 enum simd_immediate_check which)
13110 {
13111 unsigned int val32 = val64 & 0xffffffff;
13112 unsigned int val16 = val64 & 0xffff;
13113 unsigned int val8 = val64 & 0xff;
13114
13115 if (val32 == (val64 >> 32))
13116 {
13117 if ((which & AARCH64_CHECK_ORR) != 0
13118 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
13119 simd_immediate_info::MOV))
13120 return true;
13121
13122 if ((which & AARCH64_CHECK_BIC) != 0
13123 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
13124 simd_immediate_info::MVN))
13125 return true;
13126
13127 /* Try using a replicated byte. */
13128 if (which == AARCH64_CHECK_MOV
13129 && val16 == (val32 >> 16)
13130 && val8 == (val16 >> 8))
13131 {
13132 if (info)
13133 *info = simd_immediate_info (QImode, val8);
13134 return true;
13135 }
13136 }
13137
13138 /* Try using a bit-to-bytemask. */
13139 if (which == AARCH64_CHECK_MOV)
13140 {
13141 unsigned int i;
13142 for (i = 0; i < 64; i += 8)
13143 {
13144 unsigned char byte = (val64 >> i) & 0xff;
13145 if (byte != 0 && byte != 0xff)
13146 break;
13147 }
13148 if (i == 64)
13149 {
13150 if (info)
13151 *info = simd_immediate_info (DImode, val64);
13152 return true;
13153 }
13154 }
13155 return false;
13156 }
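/* For example, the bit-to-bytemask case above accepts
   VAL64 == 0xff00ffff0000ff00, because every byte is either 0x00 or 0xff;
   a value containing any other byte, such as 0x12, falls through.  */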
13157
13158 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13159 instruction. If INFO is nonnull, use it to describe valid immediates. */
13160
13161 static bool
13162 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
13163 simd_immediate_info *info)
13164 {
13165 scalar_int_mode mode = DImode;
13166 unsigned int val32 = val64 & 0xffffffff;
13167 if (val32 == (val64 >> 32))
13168 {
13169 mode = SImode;
13170 unsigned int val16 = val32 & 0xffff;
13171 if (val16 == (val32 >> 16))
13172 {
13173 mode = HImode;
13174 unsigned int val8 = val16 & 0xff;
13175 if (val8 == (val16 >> 8))
13176 mode = QImode;
13177 }
13178 }
13179 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
13180 if (IN_RANGE (val, -0x80, 0x7f))
13181 {
13182 /* DUP with no shift. */
13183 if (info)
13184 *info = simd_immediate_info (mode, val);
13185 return true;
13186 }
13187 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
13188 {
13189 /* DUP with LSL #8. */
13190 if (info)
13191 *info = simd_immediate_info (mode, val);
13192 return true;
13193 }
13194 if (aarch64_bitmask_imm (val64, mode))
13195 {
13196 /* DUPM. */
13197 if (info)
13198 *info = simd_immediate_info (mode, val);
13199 return true;
13200 }
13201 return false;
13202 }
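/* For example, VAL64 == 0x2020202020202020 reduces to a QImode value of
   0x20, which lies in [-0x80, 0x7f] and so can be loaded with an unshifted
   DUP; VAL64 == 0x1200120012001200 reduces to HImode 0x1200 and is loaded
   (roughly) as DUP ..., #0x12, LSL #8.  */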
13203
13204 /* Return true if OP is a valid SIMD immediate for the operation
13205 described by WHICH. If INFO is nonnull, use it to describe valid
13206 immediates. */
13207 bool
13208 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
13209 enum simd_immediate_check which)
13210 {
13211 machine_mode mode = GET_MODE (op);
13212 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13213 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13214 return false;
13215
13216 scalar_mode elt_mode = GET_MODE_INNER (mode);
13217 rtx base, step;
13218 unsigned int n_elts;
13219 if (GET_CODE (op) == CONST_VECTOR
13220 && CONST_VECTOR_DUPLICATE_P (op))
13221 n_elts = CONST_VECTOR_NPATTERNS (op);
13222 else if ((vec_flags & VEC_SVE_DATA)
13223 && const_vec_series_p (op, &base, &step))
13224 {
13225 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
13226 if (!aarch64_sve_index_immediate_p (base)
13227 || !aarch64_sve_index_immediate_p (step))
13228 return false;
13229
13230 if (info)
13231 *info = simd_immediate_info (elt_mode, base, step);
13232 return true;
13233 }
13234 else if (GET_CODE (op) == CONST_VECTOR
13235 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
13236 /* N_ELTS set above. */;
13237 else
13238 return false;
13239
13240 /* Handle PFALSE and PTRUE. */
13241 if (vec_flags & VEC_SVE_PRED)
13242 return (op == CONST0_RTX (mode)
13243 || op == CONSTM1_RTX (mode));
13244
13245 scalar_float_mode elt_float_mode;
13246 if (n_elts == 1
13247 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
13248 {
13249 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
13250 if (aarch64_float_const_zero_rtx_p (elt)
13251 || aarch64_float_const_representable_p (elt))
13252 {
13253 if (info)
13254 *info = simd_immediate_info (elt_float_mode, elt);
13255 return true;
13256 }
13257 }
13258
13259 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
13260 if (elt_size > 8)
13261 return false;
13262
13263 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
13264
13265 /* Expand the vector constant out into a byte vector, with the least
13266 significant byte of the register first. */
13267 auto_vec<unsigned char, 16> bytes;
13268 bytes.reserve (n_elts * elt_size);
13269 for (unsigned int i = 0; i < n_elts; i++)
13270 {
13271 /* The vector is provided in gcc endian-neutral fashion.
13272 For aarch64_be Advanced SIMD, it must be laid out in the vector
13273 register in reverse order. */
13274 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
13275 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
13276
13277 if (elt_mode != elt_int_mode)
13278 elt = gen_lowpart (elt_int_mode, elt);
13279
13280 if (!CONST_INT_P (elt))
13281 return false;
13282
13283 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
13284 for (unsigned int byte = 0; byte < elt_size; byte++)
13285 {
13286 bytes.quick_push (elt_val & 0xff);
13287 elt_val >>= BITS_PER_UNIT;
13288 }
13289 }
13290
13291 /* The immediate must repeat every eight bytes. */
13292 unsigned int nbytes = bytes.length ();
13293 for (unsigned i = 8; i < nbytes; ++i)
13294 if (bytes[i] != bytes[i - 8])
13295 return false;
13296
13297 /* Get the repeating 8-byte value as an integer. No endian correction
13298 is needed here because bytes is already in lsb-first order. */
13299 unsigned HOST_WIDE_INT val64 = 0;
13300 for (unsigned int i = 0; i < 8; i++)
13301 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
13302 << (i * BITS_PER_UNIT));
13303
13304 if (vec_flags & VEC_SVE_DATA)
13305 return aarch64_sve_valid_immediate (val64, info);
13306 else
13307 return aarch64_advsimd_valid_immediate (val64, info, which);
13308 }
13309
13310 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13311    has a step in the range of an SVE INDEX immediate.  Return the step if so,
13312 otherwise return null. */
13313 rtx
13314 aarch64_check_zero_based_sve_index_immediate (rtx x)
13315 {
13316 rtx base, step;
13317 if (const_vec_series_p (x, &base, &step)
13318 && base == const0_rtx
13319 && aarch64_sve_index_immediate_p (step))
13320 return step;
13321 return NULL_RTX;
13322 }
13323
13324 /* Check whether immediate shift constants are within range. */
13325 bool
13326 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
13327 {
13328 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
13329 if (left)
13330 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
13331 else
13332 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
13333 }
13334
13335 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13336 operation of width WIDTH at bit position POS. */
13337
13338 rtx
13339 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
13340 {
13341 gcc_assert (CONST_INT_P (width));
13342 gcc_assert (CONST_INT_P (pos));
13343
13344 unsigned HOST_WIDE_INT mask
13345 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
13346 return GEN_INT (mask << UINTVAL (pos));
13347 }
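/* For example, WIDTH == 8 and POS == 16 yield the mask 0x00ff0000,
   i.e. ((1 << 8) - 1) << 16.  */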
13348
13349 bool
13350 aarch64_mov_operand_p (rtx x, machine_mode mode)
13351 {
13352 if (GET_CODE (x) == HIGH
13353 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
13354 return true;
13355
13356 if (CONST_INT_P (x))
13357 return true;
13358
13359 if (VECTOR_MODE_P (GET_MODE (x)))
13360 return aarch64_simd_valid_immediate (x, NULL);
13361
13362 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
13363 return true;
13364
13365 if (aarch64_sve_cnt_immediate_p (x))
13366 return true;
13367
13368 return aarch64_classify_symbolic_expression (x)
13369 == SYMBOL_TINY_ABSOLUTE;
13370 }
13371
13372 /* Return a const_int vector of VAL. */
13373 rtx
13374 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
13375 {
13376 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
13377 return gen_const_vec_duplicate (mode, c);
13378 }
13379
13380 /* Check OP is a legal scalar immediate for the MOVI instruction. */
13381
13382 bool
13383 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
13384 {
13385 machine_mode vmode;
13386
13387 vmode = aarch64_simd_container_mode (mode, 64);
13388 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
13389 return aarch64_simd_valid_immediate (op_v, NULL);
13390 }
13391
13392 /* Construct and return a PARALLEL RTX vector with elements numbering the
13393 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13394 the vector - from the perspective of the architecture. This does not
13395 line up with GCC's perspective on lane numbers, so we end up with
13396 different masks depending on our target endian-ness. The diagram
13397 below may help. We must draw the distinction when building masks
13398 which select one half of the vector. An instruction selecting
13399    architectural low-lanes for a big-endian target must be described using
13400 a mask selecting GCC high-lanes.
13401
13402 Big-Endian Little-Endian
13403
13404 GCC 0 1 2 3 3 2 1 0
13405 | x | x | x | x | | x | x | x | x |
13406 Architecture 3 2 1 0 3 2 1 0
13407
13408 Low Mask: { 2, 3 } { 0, 1 }
13409 High Mask: { 0, 1 } { 2, 3 }
13410
13411 MODE Is the mode of the vector and NUNITS is the number of units in it. */
13412
13413 rtx
13414 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
13415 {
13416 rtvec v = rtvec_alloc (nunits / 2);
13417 int high_base = nunits / 2;
13418 int low_base = 0;
13419 int base;
13420 rtx t1;
13421 int i;
13422
13423 if (BYTES_BIG_ENDIAN)
13424 base = high ? low_base : high_base;
13425 else
13426 base = high ? high_base : low_base;
13427
13428 for (i = 0; i < nunits / 2; i++)
13429 RTVEC_ELT (v, i) = GEN_INT (base + i);
13430
13431 t1 = gen_rtx_PARALLEL (mode, v);
13432 return t1;
13433 }
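/* For example, for V4SImode with HIGH == true this returns
   (parallel [(const_int 2) (const_int 3)]) on little-endian and
   (parallel [(const_int 0) (const_int 1)]) on big-endian, matching the
   diagram above.  */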
13434
13435 /* Check OP for validity as a PARALLEL RTX vector with elements
13436 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13437 from the perspective of the architecture. See the diagram above
13438 aarch64_simd_vect_par_cnst_half for more details. */
13439
13440 bool
13441 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
13442 bool high)
13443 {
13444 int nelts;
13445 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
13446 return false;
13447
13448 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
13449 HOST_WIDE_INT count_op = XVECLEN (op, 0);
13450 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
13451 int i = 0;
13452
13453 if (count_op != count_ideal)
13454 return false;
13455
13456 for (i = 0; i < count_ideal; i++)
13457 {
13458 rtx elt_op = XVECEXP (op, 0, i);
13459 rtx elt_ideal = XVECEXP (ideal, 0, i);
13460
13461 if (!CONST_INT_P (elt_op)
13462 || INTVAL (elt_ideal) != INTVAL (elt_op))
13463 return false;
13464 }
13465 return true;
13466 }
13467
13468 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
13469 HIGH (exclusive). */
13470 void
13471 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
13472 const_tree exp)
13473 {
13474 HOST_WIDE_INT lane;
13475 gcc_assert (CONST_INT_P (operand));
13476 lane = INTVAL (operand);
13477
13478 if (lane < low || lane >= high)
13479 {
13480 if (exp)
13481 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
13482 else
13483 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
13484 }
13485 }
13486
13487 /* Perform endian correction on lane number N, which indexes a vector
13488 of mode MODE, and return the result as an SImode rtx. */
13489
13490 rtx
13491 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
13492 {
13493 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
13494 }
13495
13496 /* Return TRUE if OP is a valid vector addressing mode. */
13497
13498 bool
13499 aarch64_simd_mem_operand_p (rtx op)
13500 {
13501 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
13502 || REG_P (XEXP (op, 0)));
13503 }
13504
13505 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
13506
13507 bool
13508 aarch64_sve_ld1r_operand_p (rtx op)
13509 {
13510 struct aarch64_address_info addr;
13511 scalar_mode mode;
13512
13513 return (MEM_P (op)
13514 && is_a <scalar_mode> (GET_MODE (op), &mode)
13515 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
13516 && addr.type == ADDRESS_REG_IMM
13517 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
13518 }
13519
13520 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13521 The conditions for STR are the same. */
13522 bool
13523 aarch64_sve_ldr_operand_p (rtx op)
13524 {
13525 struct aarch64_address_info addr;
13526
13527 return (MEM_P (op)
13528 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
13529 false, ADDR_QUERY_ANY)
13530 && addr.type == ADDRESS_REG_IMM);
13531 }
13532
13533 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13534 We need to be able to access the individual pieces, so the range
13535 is different from LD[234] and ST[234]. */
13536 bool
13537 aarch64_sve_struct_memory_operand_p (rtx op)
13538 {
13539 if (!MEM_P (op))
13540 return false;
13541
13542 machine_mode mode = GET_MODE (op);
13543 struct aarch64_address_info addr;
13544 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
13545 ADDR_QUERY_ANY)
13546 || addr.type != ADDRESS_REG_IMM)
13547 return false;
13548
13549 poly_int64 first = addr.const_offset;
13550 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
13551 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
13552 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
13553 }
13554
13555 /* Emit a register copy from operand to operand, taking care not to
13556 early-clobber source registers in the process.
13557
13558 COUNT is the number of components into which the copy needs to be
13559 decomposed. */
13560 void
13561 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
13562 unsigned int count)
13563 {
13564 unsigned int i;
13565 int rdest = REGNO (operands[0]);
13566 int rsrc = REGNO (operands[1]);
13567
13568 if (!reg_overlap_mentioned_p (operands[0], operands[1])
13569 || rdest < rsrc)
13570 for (i = 0; i < count; i++)
13571 emit_move_insn (gen_rtx_REG (mode, rdest + i),
13572 gen_rtx_REG (mode, rsrc + i));
13573 else
13574 for (i = 0; i < count; i++)
13575 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
13576 gen_rtx_REG (mode, rsrc + count - i - 1));
13577 }
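/* For illustration: copying the two-register group V1-V2 into V2-V3
   overlaps with RDEST > RSRC, so the loop above copies the components in
   reverse order (V3 <- V2 first, then V2 <- V1) to avoid clobbering V2
   before it has been read.  */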
13578
13579 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13580    one of the VSTRUCT modes: OI, CI, or XI. */
13581 int
13582 aarch64_simd_attr_length_rglist (machine_mode mode)
13583 {
13584 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
13585 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
13586 }
13587
13588 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
13589 alignment of a vector to 128 bits. SVE predicates have an alignment of
13590 16 bits. */
13591 static HOST_WIDE_INT
13592 aarch64_simd_vector_alignment (const_tree type)
13593 {
13594 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13595 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13596 be set for non-predicate vectors of booleans. Modes are the most
13597 direct way we have of identifying real SVE predicate types. */
13598 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
13599 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
13600 return MIN (align, 128);
13601 }
13602
13603 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
13604 static HOST_WIDE_INT
13605 aarch64_vectorize_preferred_vector_alignment (const_tree type)
13606 {
13607 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
13608 {
13609 /* If the length of the vector is fixed, try to align to that length,
13610 otherwise don't try to align at all. */
13611 HOST_WIDE_INT result;
13612 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
13613 result = TYPE_ALIGN (TREE_TYPE (type));
13614 return result;
13615 }
13616 return TYPE_ALIGN (type);
13617 }
13618
13619 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13620 static bool
13621 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
13622 {
13623 if (is_packed)
13624 return false;
13625
13626 /* For fixed-length vectors, check that the vectorizer will aim for
13627 full-vector alignment. This isn't true for generic GCC vectors
13628 that are wider than the ABI maximum of 128 bits. */
13629 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
13630 && (wi::to_widest (TYPE_SIZE (type))
13631 != aarch64_vectorize_preferred_vector_alignment (type)))
13632 return false;
13633
13634 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13635 return true;
13636 }
13637
13638 /* Return true if the vector misalignment factor is supported by the
13639 target. */
13640 static bool
13641 aarch64_builtin_support_vector_misalignment (machine_mode mode,
13642 const_tree type, int misalignment,
13643 bool is_packed)
13644 {
13645 if (TARGET_SIMD && STRICT_ALIGNMENT)
13646 {
13647 /* Return if movmisalign pattern is not supported for this mode. */
13648 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
13649 return false;
13650
13651 /* Misalignment factor is unknown at compile time. */
13652 if (misalignment == -1)
13653 return false;
13654 }
13655 return default_builtin_support_vector_misalignment (mode, type, misalignment,
13656 is_packed);
13657 }
13658
13659 /* If VALS is a vector constant that can be loaded into a register
13660 using DUP, generate instructions to do so and return an RTX to
13661 assign to the register. Otherwise return NULL_RTX. */
13662 static rtx
13663 aarch64_simd_dup_constant (rtx vals)
13664 {
13665 machine_mode mode = GET_MODE (vals);
13666 machine_mode inner_mode = GET_MODE_INNER (mode);
13667 rtx x;
13668
13669 if (!const_vec_duplicate_p (vals, &x))
13670 return NULL_RTX;
13671
13672 /* We can load this constant by using DUP and a constant in a
13673 single ARM register. This will be cheaper than a vector
13674 load. */
13675 x = copy_to_mode_reg (inner_mode, x);
13676 return gen_vec_duplicate (mode, x);
13677 }
13678
13679
13680 /* Generate code to load VALS, which is a PARALLEL containing only
13681 constants (for vec_init) or CONST_VECTOR, efficiently into a
13682 register. Returns an RTX to copy into the register, or NULL_RTX
13683 for a PARALLEL that can not be converted into a CONST_VECTOR. */
13684 static rtx
13685 aarch64_simd_make_constant (rtx vals)
13686 {
13687 machine_mode mode = GET_MODE (vals);
13688 rtx const_dup;
13689 rtx const_vec = NULL_RTX;
13690 int n_const = 0;
13691 int i;
13692
13693 if (GET_CODE (vals) == CONST_VECTOR)
13694 const_vec = vals;
13695 else if (GET_CODE (vals) == PARALLEL)
13696 {
13697 /* A CONST_VECTOR must contain only CONST_INTs and
13698 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13699 Only store valid constants in a CONST_VECTOR. */
13700 int n_elts = XVECLEN (vals, 0);
13701 for (i = 0; i < n_elts; ++i)
13702 {
13703 rtx x = XVECEXP (vals, 0, i);
13704 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13705 n_const++;
13706 }
13707 if (n_const == n_elts)
13708 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
13709 }
13710 else
13711 gcc_unreachable ();
13712
13713 if (const_vec != NULL_RTX
13714 && aarch64_simd_valid_immediate (const_vec, NULL))
13715 /* Load using MOVI/MVNI. */
13716 return const_vec;
13717 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
13718 /* Loaded using DUP. */
13719 return const_dup;
13720 else if (const_vec != NULL_RTX)
13721 /* Load from constant pool. We can not take advantage of single-cycle
13722 LD1 because we need a PC-relative addressing mode. */
13723 return const_vec;
13724 else
13725 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13726 We can not construct an initializer. */
13727 return NULL_RTX;
13728 }
13729
13730 /* Expand a vector initialisation sequence, such that TARGET is
13731 initialised to contain VALS. */
13732
13733 void
13734 aarch64_expand_vector_init (rtx target, rtx vals)
13735 {
13736 machine_mode mode = GET_MODE (target);
13737 scalar_mode inner_mode = GET_MODE_INNER (mode);
13738 /* The number of vector elements. */
13739 int n_elts = XVECLEN (vals, 0);
13740 /* The number of vector elements which are not constant. */
13741 int n_var = 0;
13742 rtx any_const = NULL_RTX;
13743 /* The first element of vals. */
13744 rtx v0 = XVECEXP (vals, 0, 0);
13745 bool all_same = true;
13746
13747 /* Count the number of variable elements to initialise. */
13748 for (int i = 0; i < n_elts; ++i)
13749 {
13750 rtx x = XVECEXP (vals, 0, i);
13751 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
13752 ++n_var;
13753 else
13754 any_const = x;
13755
13756 all_same &= rtx_equal_p (x, v0);
13757 }
13758
13759 /* No variable elements, hand off to aarch64_simd_make_constant which knows
13760 how best to handle this. */
13761 if (n_var == 0)
13762 {
13763 rtx constant = aarch64_simd_make_constant (vals);
13764 if (constant != NULL_RTX)
13765 {
13766 emit_move_insn (target, constant);
13767 return;
13768 }
13769 }
13770
13771 /* Splat a single non-constant element if we can. */
13772 if (all_same)
13773 {
13774 rtx x = copy_to_mode_reg (inner_mode, v0);
13775 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13776 return;
13777 }
13778
13779 enum insn_code icode = optab_handler (vec_set_optab, mode);
13780 gcc_assert (icode != CODE_FOR_nothing);
13781
13782 /* If there are only variable elements, try to optimize
13783 the insertion using dup for the most common element
13784 followed by insertions. */
13785
13786 /* The algorithm will fill matches[*][0] with the earliest matching element,
13787 and matches[X][1] with the count of duplicate elements (if X is the
13788    earliest element which has duplicates).  */
13789
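  /* For example, for VALS == { a, b, a, a } the loop below records
     matches[0][1] == 3 (elements 0, 2 and 3 all match element 0), so the
     code duplicates A across the vector and then inserts B into lane 1.  */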
13790 if (n_var == n_elts && n_elts <= 16)
13791 {
13792 int matches[16][2] = {0};
13793 for (int i = 0; i < n_elts; i++)
13794 {
13795 for (int j = 0; j <= i; j++)
13796 {
13797 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
13798 {
13799 matches[i][0] = j;
13800 matches[j][1]++;
13801 break;
13802 }
13803 }
13804 }
13805 int maxelement = 0;
13806 int maxv = 0;
13807 for (int i = 0; i < n_elts; i++)
13808 if (matches[i][1] > maxv)
13809 {
13810 maxelement = i;
13811 maxv = matches[i][1];
13812 }
13813
13814 /* Create a duplicate of the most common element, unless all elements
13815 are equally useless to us, in which case just immediately set the
13816 vector register using the first element. */
13817
13818 if (maxv == 1)
13819 {
13820 /* For vectors of two 64-bit elements, we can do even better. */
13821 if (n_elts == 2
13822 && (inner_mode == E_DImode
13823 || inner_mode == E_DFmode))
13824
13825 {
13826 rtx x0 = XVECEXP (vals, 0, 0);
13827 rtx x1 = XVECEXP (vals, 0, 1);
13828 /* Combine can pick up this case, but handling it directly
13829 here leaves clearer RTL.
13830
13831 This is load_pair_lanes<mode>, and also gives us a clean-up
13832 for store_pair_lanes<mode>. */
13833 if (memory_operand (x0, inner_mode)
13834 && memory_operand (x1, inner_mode)
13835 && !STRICT_ALIGNMENT
13836 && rtx_equal_p (XEXP (x1, 0),
13837 plus_constant (Pmode,
13838 XEXP (x0, 0),
13839 GET_MODE_SIZE (inner_mode))))
13840 {
13841 rtx t;
13842 if (inner_mode == DFmode)
13843 t = gen_load_pair_lanesdf (target, x0, x1);
13844 else
13845 t = gen_load_pair_lanesdi (target, x0, x1);
13846 emit_insn (t);
13847 return;
13848 }
13849 }
13850 /* The subreg-move sequence below will move into lane zero of the
13851 vector register. For big-endian we want that position to hold
13852 the last element of VALS. */
13853 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
13854 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
13855 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
13856 }
13857 else
13858 {
13859 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
13860 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13861 }
13862
13863 /* Insert the rest. */
13864 for (int i = 0; i < n_elts; i++)
13865 {
13866 rtx x = XVECEXP (vals, 0, i);
13867 if (matches[i][0] == maxelement)
13868 continue;
13869 x = copy_to_mode_reg (inner_mode, x);
13870 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
13871 }
13872 return;
13873 }
13874
13875 /* Initialise a vector which is part-variable. We want to first try
13876 to build those lanes which are constant in the most efficient way we
13877 can. */
13878 if (n_var != n_elts)
13879 {
13880 rtx copy = copy_rtx (vals);
13881
13882 /* Load constant part of vector. We really don't care what goes into the
13883 parts we will overwrite, but we're more likely to be able to load the
13884 constant efficiently if it has fewer, larger, repeating parts
13885 (see aarch64_simd_valid_immediate). */
13886 for (int i = 0; i < n_elts; i++)
13887 {
13888 rtx x = XVECEXP (vals, 0, i);
13889 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13890 continue;
13891 rtx subst = any_const;
13892 for (int bit = n_elts / 2; bit > 0; bit /= 2)
13893 {
13894 /* Look in the copied vector, as more elements are const. */
13895 rtx test = XVECEXP (copy, 0, i ^ bit);
13896 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
13897 {
13898 subst = test;
13899 break;
13900 }
13901 }
13902 XVECEXP (copy, 0, i) = subst;
13903 }
13904 aarch64_expand_vector_init (target, copy);
13905 }
13906
13907 /* Insert the variable lanes directly. */
13908 for (int i = 0; i < n_elts; i++)
13909 {
13910 rtx x = XVECEXP (vals, 0, i);
13911 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13912 continue;
13913 x = copy_to_mode_reg (inner_mode, x);
13914 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
13915 }
13916 }
13917
13918 static unsigned HOST_WIDE_INT
13919 aarch64_shift_truncation_mask (machine_mode mode)
13920 {
13921 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
13922 return 0;
13923 return GET_MODE_UNIT_BITSIZE (mode) - 1;
13924 }
13925
13926 /* Select a format to encode pointers in exception handling data. */
13927 int
13928 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
13929 {
13930 int type;
13931 switch (aarch64_cmodel)
13932 {
13933 case AARCH64_CMODEL_TINY:
13934 case AARCH64_CMODEL_TINY_PIC:
13935 case AARCH64_CMODEL_SMALL:
13936 case AARCH64_CMODEL_SMALL_PIC:
13937 case AARCH64_CMODEL_SMALL_SPIC:
13938 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
13939 for everything. */
13940 type = DW_EH_PE_sdata4;
13941 break;
13942 default:
13943 /* No assumptions here. 8-byte relocs required. */
13944 type = DW_EH_PE_sdata8;
13945 break;
13946 }
13947 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
13948 }
13949
13950 /* The last .arch and .tune assembly strings that we printed. */
13951 static std::string aarch64_last_printed_arch_string;
13952 static std::string aarch64_last_printed_tune_string;
13953
13954 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
13955 by the function fndecl. */
13956
13957 void
13958 aarch64_declare_function_name (FILE *stream, const char* name,
13959 tree fndecl)
13960 {
13961 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13962
13963 struct cl_target_option *targ_options;
13964 if (target_parts)
13965 targ_options = TREE_TARGET_OPTION (target_parts);
13966 else
13967 targ_options = TREE_TARGET_OPTION (target_option_current_node);
13968 gcc_assert (targ_options);
13969
13970 const struct processor *this_arch
13971 = aarch64_get_arch (targ_options->x_explicit_arch);
13972
13973 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
13974 std::string extension
13975 = aarch64_get_extension_string_for_isa_flags (isa_flags,
13976 this_arch->flags);
13977 /* Only update the assembler .arch string if it is distinct from the last
13978 such string we printed. */
13979 std::string to_print = this_arch->name + extension;
13980 if (to_print != aarch64_last_printed_arch_string)
13981 {
13982 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
13983 aarch64_last_printed_arch_string = to_print;
13984 }
13985
13986   /* Print the cpu name we're tuning for in the comments; this might be
13987 useful to readers of the generated asm. Do it only when it changes
13988 from function to function and verbose assembly is requested. */
13989 const struct processor *this_tune
13990 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
13991
13992 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
13993 {
13994 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
13995 this_tune->name);
13996 aarch64_last_printed_tune_string = this_tune->name;
13997 }
13998
13999 /* Don't forget the type directive for ELF. */
14000 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
14001 ASM_OUTPUT_LABEL (stream, name);
14002 }
14003
14004 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
14005
14006 static void
14007 aarch64_start_file (void)
14008 {
14009 struct cl_target_option *default_options
14010 = TREE_TARGET_OPTION (target_option_default_node);
14011
14012 const struct processor *default_arch
14013 = aarch64_get_arch (default_options->x_explicit_arch);
14014 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
14015 std::string extension
14016 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
14017 default_arch->flags);
14018
14019 aarch64_last_printed_arch_string = default_arch->name + extension;
14020 aarch64_last_printed_tune_string = "";
14021 asm_fprintf (asm_out_file, "\t.arch %s\n",
14022 aarch64_last_printed_arch_string.c_str ());
14023
14024 default_file_start ();
14025 }
14026
14027 /* Emit load exclusive. */
14028
14029 static void
14030 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
14031 rtx mem, rtx model_rtx)
14032 {
14033 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
14034 }
14035
14036 /* Emit store exclusive. */
14037
14038 static void
14039 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
14040 rtx rval, rtx mem, rtx model_rtx)
14041 {
14042 emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
14043 }
14044
14045 /* Mark the previous jump instruction as unlikely. */
14046
14047 static void
14048 aarch64_emit_unlikely_jump (rtx insn)
14049 {
14050 rtx_insn *jump = emit_jump_insn (insn);
14051 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
14052 }
14053
14054 /* Expand a compare and swap pattern. */
14055
14056 void
14057 aarch64_expand_compare_and_swap (rtx operands[])
14058 {
14059 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
14060 machine_mode mode, cmp_mode;
14061
14062 bval = operands[0];
14063 rval = operands[1];
14064 mem = operands[2];
14065 oldval = operands[3];
14066 newval = operands[4];
14067 is_weak = operands[5];
14068 mod_s = operands[6];
14069 mod_f = operands[7];
14070 mode = GET_MODE (mem);
14071 cmp_mode = mode;
14072
14073 /* Normally the succ memory model must be stronger than fail, but in the
14074 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14075 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
14076
14077 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
14078 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
14079 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
14080
14081 switch (mode)
14082 {
14083 case E_QImode:
14084 case E_HImode:
14085 /* For short modes, we're going to perform the comparison in SImode,
14086 so do the zero-extension now. */
14087 cmp_mode = SImode;
14088 rval = gen_reg_rtx (SImode);
14089 oldval = convert_modes (SImode, mode, oldval, true);
14090 /* Fall through. */
14091
14092 case E_SImode:
14093 case E_DImode:
14094 /* Force the value into a register if needed. */
14095 if (!aarch64_plus_operand (oldval, mode))
14096 oldval = force_reg (cmp_mode, oldval);
14097 break;
14098
14099 default:
14100 gcc_unreachable ();
14101 }
14102
14103 if (TARGET_LSE)
14104 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem, oldval,
14105 newval, is_weak, mod_s,
14106 mod_f));
14107 else
14108 emit_insn (gen_aarch64_compare_and_swap (mode, rval, mem, oldval, newval,
14109 is_weak, mod_s, mod_f));
14110
14111
14112 if (mode == QImode || mode == HImode)
14113 emit_move_insn (operands[1], gen_lowpart (mode, rval));
14114
14115 x = gen_rtx_REG (CCmode, CC_REGNUM);
14116 x = gen_rtx_EQ (SImode, x, const0_rtx);
14117 emit_insn (gen_rtx_SET (bval, x));
14118 }
14119
14120 /* Test whether the target supports using an atomic load-operate instruction
14121    for operation CODE.  (Whether the caller wants the value of memory before
14122    or after the operation does not affect this check; see
14123    aarch64_gen_atomic_ldop.)  Returns FALSE if the operation isn't supported
14124    by the architecture. */
14125
14126 bool
14127 aarch64_atomic_ldop_supported_p (enum rtx_code code)
14128 {
14129 if (!TARGET_LSE)
14130 return false;
14131
14132 switch (code)
14133 {
14134 case SET:
14135 case AND:
14136 case IOR:
14137 case XOR:
14138 case MINUS:
14139 case PLUS:
14140 return true;
14141 default:
14142 return false;
14143 }
14144 }
14145
14146 /* Emit a barrier appropriate for memory model MODEL at the end of a
14147 sequence implementing an atomic operation. */
14148
14149 static void
14150 aarch64_emit_post_barrier (enum memmodel model)
14151 {
14152 const enum memmodel base_model = memmodel_base (model);
14153
14154 if (is_mm_sync (model)
14155 && (base_model == MEMMODEL_ACQUIRE
14156 || base_model == MEMMODEL_ACQ_REL
14157 || base_model == MEMMODEL_SEQ_CST))
14158 {
14159 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
14160 }
14161 }
14162
14163 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
14164 for the data in memory. EXPECTED is the value expected to be in memory.
14165 DESIRED is the value to store to memory. MEM is the memory location. MODEL
14166 is the memory ordering to use. */
14167
14168 void
14169 aarch64_gen_atomic_cas (rtx rval, rtx mem,
14170 rtx expected, rtx desired,
14171 rtx model)
14172 {
14173 machine_mode mode;
14174
14175 mode = GET_MODE (mem);
14176
14177 /* Move the expected value into the CAS destination register. */
14178 emit_insn (gen_rtx_SET (rval, expected));
14179
14180 /* Emit the CAS. */
14181 emit_insn (gen_aarch64_atomic_cas (mode, rval, mem, desired, model));
14182
14183 /* Compare the expected value with the value loaded by the CAS, to establish
14184 whether the swap was made. */
14185 aarch64_gen_compare_reg (EQ, rval, expected);
14186 }
14187
14188 /* Split a compare and swap pattern. */
14189
14190 void
14191 aarch64_split_compare_and_swap (rtx operands[])
14192 {
14193 rtx rval, mem, oldval, newval, scratch;
14194 machine_mode mode;
14195 bool is_weak;
14196 rtx_code_label *label1, *label2;
14197 rtx x, cond;
14198 enum memmodel model;
14199 rtx model_rtx;
14200
14201 rval = operands[0];
14202 mem = operands[1];
14203 oldval = operands[2];
14204 newval = operands[3];
14205 is_weak = (operands[4] != const0_rtx);
14206 model_rtx = operands[5];
14207 scratch = operands[7];
14208 mode = GET_MODE (mem);
14209 model = memmodel_from_int (INTVAL (model_rtx));
14210
14211 /* When OLDVAL is zero and we want the strong version we can emit a tighter
14212 loop:
14213 .label1:
14214 LD[A]XR rval, [mem]
14215 CBNZ rval, .label2
14216 ST[L]XR scratch, newval, [mem]
14217 CBNZ scratch, .label1
14218 .label2:
14219 CMP rval, 0. */
14220 bool strong_zero_p = !is_weak && oldval == const0_rtx;
14221
14222 label1 = NULL;
14223 if (!is_weak)
14224 {
14225 label1 = gen_label_rtx ();
14226 emit_label (label1);
14227 }
14228 label2 = gen_label_rtx ();
14229
14230 /* The initial load can be relaxed for a __sync operation since a final
14231 barrier will be emitted to stop code hoisting. */
14232 if (is_mm_sync (model))
14233 aarch64_emit_load_exclusive (mode, rval, mem,
14234 GEN_INT (MEMMODEL_RELAXED));
14235 else
14236 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
14237
14238 if (strong_zero_p)
14239 {
14240 if (aarch64_track_speculation)
14241 {
14242 /* Emit an explicit compare instruction, so that we can correctly
14243 track the condition codes. */
14244 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
14245 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
14246 }
14247 else
14248 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
14249
14250 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14251 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14252 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14253 }
14254 else
14255 {
14256 cond = aarch64_gen_compare_reg (NE, rval, oldval);
14257 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14258 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14259 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14260 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14261 }
14262
14263 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
14264
14265 if (!is_weak)
14266 {
14267 if (aarch64_track_speculation)
14268 {
14269 /* Emit an explicit compare instruction, so that we can correctly
14270 track the condition codes. */
14271 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
14272 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
14273 }
14274 else
14275 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
14276
14277 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14278 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
14279 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14280 }
14281 else
14282 {
14283 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14284 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
14285 emit_insn (gen_rtx_SET (cond, x));
14286 }
14287
14288 emit_label (label2);
14289 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
14290 to set the condition flags. If this is not used it will be removed by
14291 later passes. */
14292 if (strong_zero_p)
14293 {
14294 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14295 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
14296 emit_insn (gen_rtx_SET (cond, x));
14297 }
14298 /* Emit any final barrier needed for a __sync operation. */
14299 if (is_mm_sync (model))
14300 aarch64_emit_post_barrier (model);
14301 }
14302
14303 /* Emit a BIC instruction. */
14304
14305 static void
14306 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
14307 {
14308 rtx shift_rtx = GEN_INT (shift);
14309 rtx (*gen) (rtx, rtx, rtx, rtx);
14310
14311 switch (mode)
14312 {
14313 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
14314 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
14315 default:
14316 gcc_unreachable ();
14317 }
14318
14319 emit_insn (gen (dst, s2, shift_rtx, s1));
14320 }
14321
14322 /* Emit an atomic swap. */
14323
14324 static void
14325 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
14326 rtx mem, rtx model)
14327 {
14328 emit_insn (gen_aarch64_atomic_swp (mode, dst, mem, value, model));
14329 }
14330
14331 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
14332 location to store the data read from memory. OUT_RESULT is the location to
14333 store the result of the operation. MEM is the memory location to read and
14334 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
14335 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
14336 be NULL. */
14337
14338 void
14339 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
14340 rtx mem, rtx value, rtx model_rtx)
14341 {
14342 machine_mode mode = GET_MODE (mem);
14343 machine_mode wmode = (mode == DImode ? DImode : SImode);
14344 const bool short_mode = (mode < SImode);
14345 int ldop_code;
14346 rtx src;
14347 rtx x;
14348
14349 if (out_data)
14350 out_data = gen_lowpart (mode, out_data);
14351
14352 if (out_result)
14353 out_result = gen_lowpart (mode, out_result);
14354
14355 /* Make sure the value is in a register, putting it into a destination
14356 register if it needs to be manipulated. */
14357 if (!register_operand (value, mode)
14358 || code == AND || code == MINUS)
14359 {
14360 src = out_result ? out_result : out_data;
14361 emit_move_insn (src, gen_lowpart (mode, value));
14362 }
14363 else
14364 src = value;
14365 gcc_assert (register_operand (src, mode));
14366
14367 /* Preprocess the data for the operation as necessary. If the operation is
14368 a SET then emit a swap instruction and finish. */
14369 switch (code)
14370 {
14371 case SET:
14372 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
14373 return;
14374
14375 case MINUS:
14376 /* Negate the value and treat it as a PLUS. */
14377 {
14378 rtx neg_src;
14379
14380 /* Resize the value if necessary. */
14381 if (short_mode)
14382 src = gen_lowpart (wmode, src);
14383
14384 neg_src = gen_rtx_NEG (wmode, src);
14385 emit_insn (gen_rtx_SET (src, neg_src));
14386
14387 if (short_mode)
14388 src = gen_lowpart (mode, src);
14389 }
14390 /* Fall-through. */
14391 case PLUS:
14392 ldop_code = UNSPECV_ATOMIC_LDOP_PLUS;
14393 break;
14394
14395 case IOR:
14396 ldop_code = UNSPECV_ATOMIC_LDOP_OR;
14397 break;
14398
14399 case XOR:
14400 ldop_code = UNSPECV_ATOMIC_LDOP_XOR;
14401 break;
14402
14403 case AND:
14404 {
14405 rtx not_src;
14406
14407 /* Resize the value if necessary. */
14408 if (short_mode)
14409 src = gen_lowpart (wmode, src);
14410
14411 not_src = gen_rtx_NOT (wmode, src);
14412 emit_insn (gen_rtx_SET (src, not_src));
14413
14414 if (short_mode)
14415 src = gen_lowpart (mode, src);
14416 }
14417 ldop_code = UNSPECV_ATOMIC_LDOP_BIC;
14418 break;
14419
14420 default:
14421 /* The operation can't be done with atomic instructions. */
14422 gcc_unreachable ();
14423 }
14424
14425 emit_insn (gen_aarch64_atomic_load (ldop_code, mode,
14426 out_data, mem, src, model_rtx));
14427
14428 /* If necessary, calculate the data in memory after the update by redoing the
14429 operation from values in registers. */
14430 if (!out_result)
14431 return;
14432
14433 if (short_mode)
14434 {
14435 src = gen_lowpart (wmode, src);
14436 out_data = gen_lowpart (wmode, out_data);
14437 out_result = gen_lowpart (wmode, out_result);
14438 }
14439
14440 x = NULL_RTX;
14441
14442 switch (code)
14443 {
14444 case MINUS:
14445 case PLUS:
14446 x = gen_rtx_PLUS (wmode, out_data, src);
14447 break;
14448 case IOR:
14449 x = gen_rtx_IOR (wmode, out_data, src);
14450 break;
14451 case XOR:
14452 x = gen_rtx_XOR (wmode, out_data, src);
14453 break;
14454 case AND:
14455 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
14456 return;
14457 default:
14458 gcc_unreachable ();
14459 }
14460
14461 emit_set_insn (out_result, x);
14462
14463 return;
14464 }
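/* Note (illustrative): because LSE provides a bit-clear form rather than a
   direct AND, an atomic fetch-and with value V is handled above by first
   complementing V and then issuing the BIC-style load-operate, so the memory
   update computed is DATA & ~(~V), i.e. DATA & V.  */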
14465
14466 /* Split an atomic operation. */
14467
14468 void
14469 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
14470 rtx value, rtx model_rtx, rtx cond)
14471 {
14472 machine_mode mode = GET_MODE (mem);
14473 machine_mode wmode = (mode == DImode ? DImode : SImode);
14474 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
14475 const bool is_sync = is_mm_sync (model);
14476 rtx_code_label *label;
14477 rtx x;
14478
14479 /* Split the atomic operation into a sequence. */
14480 label = gen_label_rtx ();
14481 emit_label (label);
14482
14483 if (new_out)
14484 new_out = gen_lowpart (wmode, new_out);
14485 if (old_out)
14486 old_out = gen_lowpart (wmode, old_out);
14487 else
14488 old_out = new_out;
14489 value = simplify_gen_subreg (wmode, value, mode, 0);
14490
14491 /* The initial load can be relaxed for a __sync operation since a final
14492 barrier will be emitted to stop code hoisting. */
14493 if (is_sync)
14494 aarch64_emit_load_exclusive (mode, old_out, mem,
14495 GEN_INT (MEMMODEL_RELAXED));
14496 else
14497 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
14498
14499 switch (code)
14500 {
14501 case SET:
14502 new_out = value;
14503 break;
14504
14505 case NOT:
14506 x = gen_rtx_AND (wmode, old_out, value);
14507 emit_insn (gen_rtx_SET (new_out, x));
14508 x = gen_rtx_NOT (wmode, new_out);
14509 emit_insn (gen_rtx_SET (new_out, x));
14510 break;
14511
14512 case MINUS:
14513 if (CONST_INT_P (value))
14514 {
14515 value = GEN_INT (-INTVAL (value));
14516 code = PLUS;
14517 }
14518 /* Fall through. */
14519
14520 default:
14521 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
14522 emit_insn (gen_rtx_SET (new_out, x));
14523 break;
14524 }
14525
14526 aarch64_emit_store_exclusive (mode, cond, mem,
14527 gen_lowpart (mode, new_out), model_rtx);
14528
14529 if (aarch64_track_speculation)
14530 {
14531 /* Emit an explicit compare instruction, so that we can correctly
14532 track the condition codes. */
14533 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
14534 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
14535 }
14536 else
14537 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14538
14539 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14540 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
14541 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14542
14543 /* Emit any final barrier needed for a __sync operation. */
14544 if (is_sync)
14545 aarch64_emit_post_barrier (model);
14546 }
14547
14548 static void
14549 aarch64_init_libfuncs (void)
14550 {
14551 /* Half-precision float operations. The compiler handles all operations
14552 with NULL libfuncs by converting to SFmode. */
14553
14554 /* Conversions. */
14555 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
14556 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
14557
14558 /* Arithmetic. */
14559 set_optab_libfunc (add_optab, HFmode, NULL);
14560 set_optab_libfunc (sdiv_optab, HFmode, NULL);
14561 set_optab_libfunc (smul_optab, HFmode, NULL);
14562 set_optab_libfunc (neg_optab, HFmode, NULL);
14563 set_optab_libfunc (sub_optab, HFmode, NULL);
14564
14565 /* Comparisons. */
14566 set_optab_libfunc (eq_optab, HFmode, NULL);
14567 set_optab_libfunc (ne_optab, HFmode, NULL);
14568 set_optab_libfunc (lt_optab, HFmode, NULL);
14569 set_optab_libfunc (le_optab, HFmode, NULL);
14570 set_optab_libfunc (ge_optab, HFmode, NULL);
14571 set_optab_libfunc (gt_optab, HFmode, NULL);
14572 set_optab_libfunc (unord_optab, HFmode, NULL);
14573 }
14574
14575 /* Target hook for c_mode_for_suffix. */
14576 static machine_mode
14577 aarch64_c_mode_for_suffix (char suffix)
14578 {
14579 if (suffix == 'q')
14580 return TFmode;
14581
14582 return VOIDmode;
14583 }
14584
14585 /* We can only represent floating point constants which will fit in
14586 "quarter-precision" values. These values are characterised by
14587 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
14588 by:
14589
14590 (-1)^s * (n/16) * 2^r
14591
14592 Where:
14593 's' is the sign bit.
14594 'n' is an integer in the range 16 <= n <= 31.
14595 'r' is an integer in the range -3 <= r <= 4. */
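/* For example, reading values off the formula above:
      0.125 = (16/16) * 2^-3	(smallest positive representable value)
      1.0   = (16/16) * 2^0
      31.0  = (31/16) * 2^4	(largest representable value).  */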
14596
14597 /* Return true iff X can be represented by a quarter-precision
14598 floating point immediate operand. Note, we cannot represent 0.0. */
14599 bool
14600 aarch64_float_const_representable_p (rtx x)
14601 {
14602 /* This represents our current view of how many bits
14603 make up the mantissa. */
14604 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
14605 int exponent;
14606 unsigned HOST_WIDE_INT mantissa, mask;
14607 REAL_VALUE_TYPE r, m;
14608 bool fail;
14609
14610 if (!CONST_DOUBLE_P (x))
14611 return false;
14612
14613 if (GET_MODE (x) == VOIDmode
14614 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
14615 return false;
14616
14617 r = *CONST_DOUBLE_REAL_VALUE (x);
14618
14619 /* We cannot represent infinities, NaNs or +/-zero. We won't
14620 know if we have +zero until we analyse the mantissa, but we
14621 can reject the other invalid values. */
14622 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
14623 || REAL_VALUE_MINUS_ZERO (r))
14624 return false;
14625
14626 /* Extract exponent. */
14627 r = real_value_abs (&r);
14628 exponent = REAL_EXP (&r);
14629
14630 /* For the mantissa, we expand into two HOST_WIDE_INTs, apart from the
14631 highest (sign) bit, with a fixed binary point at bit point_pos.
14632 The low element of W holds the low part of the mantissa and the high element the high part.
14633 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14634 bits for the mantissa, this can fail (low bits will be lost). */
14635 real_ldexp (&m, &r, point_pos - exponent);
14636 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
14637
14638 /* If the low part of the mantissa has bits set we cannot represent
14639 the value. */
14640 if (w.ulow () != 0)
14641 return false;
14642 /* We have rejected the lower HOST_WIDE_INT, so update our
14643 understanding of how many bits lie in the mantissa and
14644 look only at the high HOST_WIDE_INT. */
14645 mantissa = w.elt (1);
14646 point_pos -= HOST_BITS_PER_WIDE_INT;
14647
14648 /* We can only represent values with a mantissa of the form 1.xxxx. */
14649 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
14650 if ((mantissa & mask) != 0)
14651 return false;
14652
14653 /* Having filtered unrepresentable values, we may now remove all
14654 but the highest 5 bits. */
14655 mantissa >>= point_pos - 5;
14656
14657 /* We cannot represent the value 0.0, so reject it. This is handled
14658 elsewhere. */
14659 if (mantissa == 0)
14660 return false;
14661
14662 /* Then, as bit 4 is always set, we can mask it off, leaving
14663 the mantissa in the range [0, 15]. */
14664 mantissa &= ~(1 << 4);
14665 gcc_assert (mantissa <= 15);
14666
14667 /* GCC internally does not use IEEE754-like encoding (where normalized
14668 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.c).
14669 Our mantissa values are shifted 4 places to the left relative to
14670 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14671 by 5 places to correct for GCC's representation. */
14672 exponent = 5 - exponent;
14673
14674 return (exponent >= 0 && exponent <= 7);
14675 }
14676
14677 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14678 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
14679 output MOVI/MVNI, ORR or BIC immediate. */
14680 char*
14681 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
14682 enum simd_immediate_check which)
14683 {
14684 bool is_valid;
14685 static char templ[40];
14686 const char *mnemonic;
14687 const char *shift_op;
14688 unsigned int lane_count = 0;
14689 char element_char;
14690
14691 struct simd_immediate_info info;
14692
14693 /* This will return true to show const_vector is legal for use as either
14694 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14695 It will also update INFO to show how the immediate should be generated.
14696 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
14697 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
14698 gcc_assert (is_valid);
14699
14700 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14701 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
14702
14703 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14704 {
14705 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
14706 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
14707 move immediate path. */
14708 if (aarch64_float_const_zero_rtx_p (info.value))
14709 info.value = GEN_INT (0);
14710 else
14711 {
14712 const unsigned int buf_size = 20;
14713 char float_buf[buf_size] = {'\0'};
14714 real_to_decimal_for_mode (float_buf,
14715 CONST_DOUBLE_REAL_VALUE (info.value),
14716 buf_size, buf_size, 1, info.elt_mode);
14717
14718 if (lane_count == 1)
14719 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
14720 else
14721 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
14722 lane_count, element_char, float_buf);
14723 return templ;
14724 }
14725 }
14726
14727 gcc_assert (CONST_INT_P (info.value));
14728
14729 if (which == AARCH64_CHECK_MOV)
14730 {
14731 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
14732 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
14733 if (lane_count == 1)
14734 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
14735 mnemonic, UINTVAL (info.value));
14736 else if (info.shift)
14737 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14738 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
14739 element_char, UINTVAL (info.value), shift_op, info.shift);
14740 else
14741 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14742 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
14743 element_char, UINTVAL (info.value));
14744 }
14745 else
14746 {
14747 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
14748 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
14749 if (info.shift)
14750 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14751 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
14752 element_char, UINTVAL (info.value), "lsl", info.shift);
14753 else
14754 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14755 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
14756 element_char, UINTVAL (info.value));
14757 }
14758 return templ;
14759 }
14760
14761 char*
14762 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
14763 {
14764
14765 /* If a floating point number was passed and we desire to use it in an
14766 integer mode, do the conversion to integer. */
14767 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
14768 {
14769 unsigned HOST_WIDE_INT ival;
14770 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
14771 gcc_unreachable ();
14772 immediate = gen_int_mode (ival, mode);
14773 }
14774
14775 machine_mode vmode;
14776 /* Use a 64-bit mode for everything except for DI/DF mode, where we use
14777 a 128-bit vector mode. */
14778 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
14779
14780 vmode = aarch64_simd_container_mode (mode, width);
14781 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
14782 return aarch64_output_simd_mov_immediate (v_op, width);
14783 }
14784
14785 /* Return the output string to use for moving immediate CONST_VECTOR
14786 into an SVE register. */
14787
14788 char *
14789 aarch64_output_sve_mov_immediate (rtx const_vector)
14790 {
14791 static char templ[40];
14792 struct simd_immediate_info info;
14793 char element_char;
14794
14795 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
14796 gcc_assert (is_valid);
14797
14798 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14799
14800 if (info.step)
14801 {
14802 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
14803 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
14804 element_char, INTVAL (info.value), INTVAL (info.step));
14805 return templ;
14806 }
14807
14808 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14809 {
14810 if (aarch64_float_const_zero_rtx_p (info.value))
14811 info.value = GEN_INT (0);
14812 else
14813 {
14814 const int buf_size = 20;
14815 char float_buf[buf_size] = {};
14816 real_to_decimal_for_mode (float_buf,
14817 CONST_DOUBLE_REAL_VALUE (info.value),
14818 buf_size, buf_size, 1, info.elt_mode);
14819
14820 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
14821 element_char, float_buf);
14822 return templ;
14823 }
14824 }
14825
14826 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
14827 element_char, INTVAL (info.value));
14828 return templ;
14829 }
14830
14831 /* Return the asm format for a PTRUE instruction whose destination has
14832 mode MODE. SUFFIX is the element size suffix. */
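/* For example (illustrative only, the suffix is chosen by the caller):
   a VNx4BImode destination with a known 128-bit vector length gives
   "ptrue\t%0.s, vl4", while a variable-length destination gives
   "ptrue\t%0.s, all".  */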
14833
14834 char *
14835 aarch64_output_ptrue (machine_mode mode, char suffix)
14836 {
14837 unsigned int nunits;
14838 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
14839 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
14840 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
14841 else
14842 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
14843 return buf;
14844 }
14845
14846 /* Split operands into moves from op[1] + op[2] into op[0]. */
14847
14848 void
14849 aarch64_split_combinev16qi (rtx operands[3])
14850 {
14851 unsigned int dest = REGNO (operands[0]);
14852 unsigned int src1 = REGNO (operands[1]);
14853 unsigned int src2 = REGNO (operands[2]);
14854 machine_mode halfmode = GET_MODE (operands[1]);
14855 unsigned int halfregs = REG_NREGS (operands[1]);
14856 rtx destlo, desthi;
14857
14858 gcc_assert (halfmode == V16QImode);
14859
14860 if (src1 == dest && src2 == dest + halfregs)
14861 {
14862 /* No-op move. Can't split to nothing; emit something. */
14863 emit_note (NOTE_INSN_DELETED);
14864 return;
14865 }
14866
14867 /* Preserve register attributes for variable tracking. */
14868 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
14869 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
14870 GET_MODE_SIZE (halfmode));
14871
14872 /* Special case of reversed high/low parts: swap them with three XORs, avoiding a scratch register. */
14873 if (reg_overlap_mentioned_p (operands[2], destlo)
14874 && reg_overlap_mentioned_p (operands[1], desthi))
14875 {
14876 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
14877 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
14878 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
14879 }
14880 else if (!reg_overlap_mentioned_p (operands[2], destlo))
14881 {
14882 /* Try to avoid unnecessary moves if part of the result
14883 is in the right place already. */
14884 if (src1 != dest)
14885 emit_move_insn (destlo, operands[1]);
14886 if (src2 != dest + halfregs)
14887 emit_move_insn (desthi, operands[2]);
14888 }
14889 else
14890 {
14891 if (src2 != dest + halfregs)
14892 emit_move_insn (desthi, operands[2]);
14893 if (src1 != dest)
14894 emit_move_insn (destlo, operands[1]);
14895 }
14896 }
14897
14898 /* vec_perm support. */
14899
14900 struct expand_vec_perm_d
14901 {
14902 rtx target, op0, op1;
14903 vec_perm_indices perm;
14904 machine_mode vmode;
14905 unsigned int vec_flags;
14906 bool one_vector_p;
14907 bool testing_p;
14908 };
14909
14910 /* Generate a variable permutation. */
14911
14912 static void
14913 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
14914 {
14915 machine_mode vmode = GET_MODE (target);
14916 bool one_vector_p = rtx_equal_p (op0, op1);
14917
14918 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
14919 gcc_checking_assert (GET_MODE (op0) == vmode);
14920 gcc_checking_assert (GET_MODE (op1) == vmode);
14921 gcc_checking_assert (GET_MODE (sel) == vmode);
14922 gcc_checking_assert (TARGET_SIMD);
14923
14924 if (one_vector_p)
14925 {
14926 if (vmode == V8QImode)
14927 {
14928 /* Expand the argument to a V16QI mode by duplicating it. */
14929 rtx pair = gen_reg_rtx (V16QImode);
14930 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
14931 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
14932 }
14933 else
14934 {
14935 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
14936 }
14937 }
14938 else
14939 {
14940 rtx pair;
14941
14942 if (vmode == V8QImode)
14943 {
14944 pair = gen_reg_rtx (V16QImode);
14945 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
14946 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
14947 }
14948 else
14949 {
14950 pair = gen_reg_rtx (OImode);
14951 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
14952 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
14953 }
14954 }
14955 }
14956
14957 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
14958 NELT is the number of elements in the vector. */
14959
14960 void
14961 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
14962 unsigned int nelt)
14963 {
14964 machine_mode vmode = GET_MODE (target);
14965 bool one_vector_p = rtx_equal_p (op0, op1);
14966 rtx mask;
14967
14968 /* The TBL instruction does not use a modulo index, so we must take care
14969 of that ourselves. */
14970 mask = aarch64_simd_gen_const_vector_dup (vmode,
14971 one_vector_p ? nelt - 1 : 2 * nelt - 1);
14972 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
14973
14974 /* For big-endian, we also need to reverse the index within the vector
14975 (but not which vector). */
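/* For example, with NELT == 8 and two input vectors, XORing with 7 maps
   index 3 to 4 and index 10 to 13: the position within each input is
   mirrored while bit 3, which selects between the inputs, is preserved.  */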
14976 if (BYTES_BIG_ENDIAN)
14977 {
14978 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
14979 if (!one_vector_p)
14980 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
14981 sel = expand_simple_binop (vmode, XOR, sel, mask,
14982 NULL, 0, OPTAB_LIB_WIDEN);
14983 }
14984 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
14985 }
14986
14987 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
14988
14989 static void
14990 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
14991 {
14992 emit_insn (gen_rtx_SET (target,
14993 gen_rtx_UNSPEC (GET_MODE (target),
14994 gen_rtvec (2, op0, op1), code)));
14995 }
14996
14997 /* Expand an SVE vec_perm with the given operands. */
14998
14999 void
15000 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15001 {
15002 machine_mode data_mode = GET_MODE (target);
15003 machine_mode sel_mode = GET_MODE (sel);
15004 /* Enforced by the pattern condition. */
15005 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15006
15007 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15008 size of the two value vectors, i.e. the upper bits of the indices
15009 are effectively ignored. SVE TBL instead produces 0 for any
15010 out-of-range indices, so we need to modulo all the vec_perm indices
15011 to ensure they are all in range. */
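/* As a worked example, assume NUNITS == 4 and that index 5 (element 1
   of OP1) is requested.  The first TBL sees 5, which is out of range
   for OP0, and yields 0; the second TBL sees 5 - 4 == 1 and yields
   OP1's element 1; ORing the two partial results gives the element we
   want.  An index such as 2 works the other way around: the first TBL
   supplies OP0's element 2 and the second one yields 0.  */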
15012 rtx sel_reg = force_reg (sel_mode, sel);
15013
15014 /* Check if the sel only references the first values vector. */
15015 if (GET_CODE (sel) == CONST_VECTOR
15016 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15017 {
15018 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15019 return;
15020 }
15021
15022 /* Check if the two values vectors are the same. */
15023 if (rtx_equal_p (op0, op1))
15024 {
15025 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15026 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15027 NULL, 0, OPTAB_DIRECT);
15028 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15029 return;
15030 }
15031
15032 /* Run TBL for each value vector and combine the results. */
15033
15034 rtx res0 = gen_reg_rtx (data_mode);
15035 rtx res1 = gen_reg_rtx (data_mode);
15036 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15037 if (GET_CODE (sel) != CONST_VECTOR
15038 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15039 {
15040 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15041 2 * nunits - 1);
15042 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15043 NULL, 0, OPTAB_DIRECT);
15044 }
15045 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15046 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15047 NULL, 0, OPTAB_DIRECT);
15048 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15049 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15050 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15051 else
15052 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
15053 }
15054
15055 /* Recognize patterns suitable for the TRN instructions. */
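/* For example, for V4SImode on little-endian the index vector
   { 0, 4, 2, 6 } matches TRN1 and { 1, 5, 3, 7 } matches TRN2.  */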
15056 static bool
15057 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15058 {
15059 HOST_WIDE_INT odd;
15060 poly_uint64 nelt = d->perm.length ();
15061 rtx out, in0, in1, x;
15062 machine_mode vmode = d->vmode;
15063
15064 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15065 return false;
15066
15067 /* Note that these are little-endian tests.
15068 We correct for big-endian later. */
15069 if (!d->perm[0].is_constant (&odd)
15070 || (odd != 0 && odd != 1)
15071 || !d->perm.series_p (0, 2, odd, 2)
15072 || !d->perm.series_p (1, 2, nelt + odd, 2))
15073 return false;
15074
15075 /* Success! */
15076 if (d->testing_p)
15077 return true;
15078
15079 in0 = d->op0;
15080 in1 = d->op1;
15081 /* We don't need a big-endian lane correction for SVE; see the comment
15082 at the head of aarch64-sve.md for details. */
15083 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15084 {
15085 x = in0, in0 = in1, in1 = x;
15086 odd = !odd;
15087 }
15088 out = d->target;
15089
15090 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15091 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
15092 return true;
15093 }
15094
15095 /* Recognize patterns suitable for the UZP instructions. */
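/* For example, for V4SImode on little-endian the index vector
   { 0, 2, 4, 6 } matches UZP1 and { 1, 3, 5, 7 } matches UZP2.  */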
15096 static bool
15097 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
15098 {
15099 HOST_WIDE_INT odd;
15100 rtx out, in0, in1, x;
15101 machine_mode vmode = d->vmode;
15102
15103 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15104 return false;
15105
15106 /* Note that these are little-endian tests.
15107 We correct for big-endian later. */
15108 if (!d->perm[0].is_constant (&odd)
15109 || (odd != 0 && odd != 1)
15110 || !d->perm.series_p (0, 1, odd, 2))
15111 return false;
15112
15113 /* Success! */
15114 if (d->testing_p)
15115 return true;
15116
15117 in0 = d->op0;
15118 in1 = d->op1;
15119 /* We don't need a big-endian lane correction for SVE; see the comment
15120 at the head of aarch64-sve.md for details. */
15121 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15122 {
15123 x = in0, in0 = in1, in1 = x;
15124 odd = !odd;
15125 }
15126 out = d->target;
15127
15128 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15129 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
15130 return true;
15131 }
15132
15133 /* Recognize patterns suitable for the ZIP instructions. */
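/* For example, for V4SImode on little-endian the index vector
   { 0, 4, 1, 5 } matches ZIP1 and { 2, 6, 3, 7 } matches ZIP2.  */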
15134 static bool
15135 aarch64_evpc_zip (struct expand_vec_perm_d *d)
15136 {
15137 unsigned int high;
15138 poly_uint64 nelt = d->perm.length ();
15139 rtx out, in0, in1, x;
15140 machine_mode vmode = d->vmode;
15141
15142 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15143 return false;
15144
15145 /* Note that these are little-endian tests.
15146 We correct for big-endian later. */
15147 poly_uint64 first = d->perm[0];
15148 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
15149 || !d->perm.series_p (0, 2, first, 1)
15150 || !d->perm.series_p (1, 2, first + nelt, 1))
15151 return false;
15152 high = maybe_ne (first, 0U);
15153
15154 /* Success! */
15155 if (d->testing_p)
15156 return true;
15157
15158 in0 = d->op0;
15159 in1 = d->op1;
15160 /* We don't need a big-endian lane correction for SVE; see the comment
15161 at the head of aarch64-sve.md for details. */
15162 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15163 {
15164 x = in0, in0 = in1, in1 = x;
15165 high = !high;
15166 }
15167 out = d->target;
15168
15169 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15170 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
15171 return true;
15172 }
15173
15174 /* Recognize patterns for the EXT insn. */
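/* For example, for V4SImode the index vector { 1, 2, 3, 4 } selects the
   last three elements of the first input followed by the first element
   of the second, which is what EXT with an element offset of 1 does.  */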
15175
15176 static bool
15177 aarch64_evpc_ext (struct expand_vec_perm_d *d)
15178 {
15179 HOST_WIDE_INT location;
15180 rtx offset;
15181
15182 /* The first element always refers to the first vector.
15183 Check if the extracted indices are increasing by one. */
15184 if (d->vec_flags == VEC_SVE_PRED
15185 || !d->perm[0].is_constant (&location)
15186 || !d->perm.series_p (0, 1, location, 1))
15187 return false;
15188
15189 /* Success! */
15190 if (d->testing_p)
15191 return true;
15192
15193 /* The case where (location == 0) is a no-op for both big- and little-endian,
15194 and is removed by the mid-end at optimization levels -O1 and higher.
15195
15196 We don't need a big-endian lane correction for SVE; see the comment
15197 at the head of aarch64-sve.md for details. */
15198 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
15199 {
15200 /* After setup, we want the high elements of the first vector (stored
15201 at the LSB end of the register), and the low elements of the second
15202 vector (stored at the MSB end of the register). So swap. */
15203 std::swap (d->op0, d->op1);
15204 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15205 to_constant () is safe since this is restricted to Advanced SIMD
15206 vectors. */
15207 location = d->perm.length ().to_constant () - location;
15208 }
15209
15210 offset = GEN_INT (location);
15211 emit_set_insn (d->target,
15212 gen_rtx_UNSPEC (d->vmode,
15213 gen_rtvec (3, d->op0, d->op1, offset),
15214 UNSPEC_EXT));
15215 return true;
15216 }
15217
15218 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15219 within each 64-bit, 32-bit or 16-bit granule. */
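/* For example, for V8HImode the index vector { 3, 2, 1, 0, 7, 6, 5, 4 }
   reverses the four 16-bit elements within each 64-bit granule and so
   matches REV64.  */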
15220
15221 static bool
15222 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
15223 {
15224 HOST_WIDE_INT diff;
15225 unsigned int i, size, unspec;
15226 machine_mode pred_mode;
15227
15228 if (d->vec_flags == VEC_SVE_PRED
15229 || !d->one_vector_p
15230 || !d->perm[0].is_constant (&diff))
15231 return false;
15232
15233 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
15234 if (size == 8)
15235 {
15236 unspec = UNSPEC_REV64;
15237 pred_mode = VNx2BImode;
15238 }
15239 else if (size == 4)
15240 {
15241 unspec = UNSPEC_REV32;
15242 pred_mode = VNx4BImode;
15243 }
15244 else if (size == 2)
15245 {
15246 unspec = UNSPEC_REV16;
15247 pred_mode = VNx8BImode;
15248 }
15249 else
15250 return false;
15251
15252 unsigned int step = diff + 1;
15253 for (i = 0; i < step; ++i)
15254 if (!d->perm.series_p (i, step, diff - i, step))
15255 return false;
15256
15257 /* Success! */
15258 if (d->testing_p)
15259 return true;
15260
15261 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
15262 if (d->vec_flags == VEC_SVE_DATA)
15263 {
15264 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15265 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
15266 UNSPEC_MERGE_PTRUE);
15267 }
15268 emit_set_insn (d->target, src);
15269 return true;
15270 }
15271
15272 /* Recognize patterns for the REV insn, which reverses elements within
15273 a full vector. */
15274
15275 static bool
15276 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
15277 {
15278 poly_uint64 nelt = d->perm.length ();
15279
15280 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
15281 return false;
15282
15283 if (!d->perm.series_p (0, 1, nelt - 1, -1))
15284 return false;
15285
15286 /* Success! */
15287 if (d->testing_p)
15288 return true;
15289
15290 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
15291 emit_set_insn (d->target, src);
15292 return true;
15293 }
15294
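/* Recognize patterns suitable for the DUP (element) instruction, e.g.
   { 2, 2, 2, 2 } for V4SImode, which broadcasts a single element of the
   first input.  */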
15295 static bool
15296 aarch64_evpc_dup (struct expand_vec_perm_d *d)
15297 {
15298 rtx out = d->target;
15299 rtx in0;
15300 HOST_WIDE_INT elt;
15301 machine_mode vmode = d->vmode;
15302 rtx lane;
15303
15304 if (d->vec_flags == VEC_SVE_PRED
15305 || d->perm.encoding ().encoded_nelts () != 1
15306 || !d->perm[0].is_constant (&elt))
15307 return false;
15308
15309 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
15310 return false;
15311
15312 /* Success! */
15313 if (d->testing_p)
15314 return true;
15315
15316 /* The generic preparation in aarch64_expand_vec_perm_const_1
15317 swaps the operand order and the permute indices if it finds
15318 d->perm[0] to be in the second operand. Thus, we can always
15319 use d->op0 and need not do any extra arithmetic to get the
15320 correct lane number. */
15321 in0 = d->op0;
15322 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
15323
15324 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
15325 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
15326 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
15327 return true;
15328 }
15329
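/* Fall back to a TBL-based permutation for Advanced SIMD vectors when the
   indices are compile-time constants but no single-instruction pattern
   above applies.  */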
15330 static bool
15331 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
15332 {
15333 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
15334 machine_mode vmode = d->vmode;
15335
15336 /* Make sure that the indices are constant. */
15337 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
15338 for (unsigned int i = 0; i < encoded_nelts; ++i)
15339 if (!d->perm[i].is_constant ())
15340 return false;
15341
15342 if (d->testing_p)
15343 return true;
15344
15345 /* Generic code will try constant permutation twice: once with the
15346 original mode and again with the elements lowered to QImode.
15347 So wait and don't do the selector expansion ourselves. */
15348 if (vmode != V8QImode && vmode != V16QImode)
15349 return false;
15350
15351 /* to_constant is safe since this routine is specific to Advanced SIMD
15352 vectors. */
15353 unsigned int nelt = d->perm.length ().to_constant ();
15354 for (unsigned int i = 0; i < nelt; ++i)
15355 /* If big-endian and two vectors we end up with a weird mixed-endian
15356 mode on NEON. Reverse the index within each word but not the word
15357 itself. to_constant is safe because we checked is_constant above. */
15358 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
15359 ? d->perm[i].to_constant () ^ (nelt - 1)
15360 : d->perm[i].to_constant ());
15361
15362 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
15363 sel = force_reg (vmode, sel);
15364
15365 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
15366 return true;
15367 }
15368
15369 /* Try to implement D using an SVE TBL instruction. */
15370
15371 static bool
15372 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
15373 {
15374 unsigned HOST_WIDE_INT nelt;
15375
15376 /* Permuting two variable-length vectors could overflow the
15377 index range. */
15378 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
15379 return false;
15380
15381 if (d->testing_p)
15382 return true;
15383
15384 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
15385 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
15386 if (d->one_vector_p)
15387 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
15388 else
15389 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
15390 return true;
15391 }
15392
15393 static bool
15394 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
15395 {
15396 /* The pattern matching functions above are written to look for a small
15397 number to begin the sequence (0, 1, N/2). If we begin with an index
15398 from the second operand, we can swap the operands. */
15399 poly_int64 nelt = d->perm.length ();
15400 if (known_ge (d->perm[0], nelt))
15401 {
15402 d->perm.rotate_inputs (1);
15403 std::swap (d->op0, d->op1);
15404 }
15405
15406 if ((d->vec_flags == VEC_ADVSIMD
15407 || d->vec_flags == VEC_SVE_DATA
15408 || d->vec_flags == VEC_SVE_PRED)
15409 && known_gt (nelt, 1))
15410 {
15411 if (aarch64_evpc_rev_local (d))
15412 return true;
15413 else if (aarch64_evpc_rev_global (d))
15414 return true;
15415 else if (aarch64_evpc_ext (d))
15416 return true;
15417 else if (aarch64_evpc_dup (d))
15418 return true;
15419 else if (aarch64_evpc_zip (d))
15420 return true;
15421 else if (aarch64_evpc_uzp (d))
15422 return true;
15423 else if (aarch64_evpc_trn (d))
15424 return true;
15425 if (d->vec_flags == VEC_SVE_DATA)
15426 return aarch64_evpc_sve_tbl (d);
15427 else if (d->vec_flags == VEC_ADVSIMD)
15428 return aarch64_evpc_tbl (d);
15429 }
15430 return false;
15431 }
15432
15433 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
15434
15435 static bool
15436 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
15437 rtx op1, const vec_perm_indices &sel)
15438 {
15439 struct expand_vec_perm_d d;
15440
15441 /* Check whether the mask can be applied to a single vector. */
15442 if (sel.ninputs () == 1
15443 || (op0 && rtx_equal_p (op0, op1)))
15444 d.one_vector_p = true;
15445 else if (sel.all_from_input_p (0))
15446 {
15447 d.one_vector_p = true;
15448 op1 = op0;
15449 }
15450 else if (sel.all_from_input_p (1))
15451 {
15452 d.one_vector_p = true;
15453 op0 = op1;
15454 }
15455 else
15456 d.one_vector_p = false;
15457
15458 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
15459 sel.nelts_per_input ());
15460 d.vmode = vmode;
15461 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
15462 d.target = target;
15463 d.op0 = op0;
15464 d.op1 = op1;
15465 d.testing_p = !target;
15466
15467 if (!d.testing_p)
15468 return aarch64_expand_vec_perm_const_1 (&d);
15469
15470 rtx_insn *last = get_last_insn ();
15471 bool ret = aarch64_expand_vec_perm_const_1 (&d);
15472 gcc_assert (last == get_last_insn ());
15473
15474 return ret;
15475 }
15476
15477 /* Generate a byte permute mask for a register of mode MODE,
15478 which has NUNITS units. */
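/* For example, for V4SImode (NUNITS == 4, 4-byte units) the generated
   byte indices are { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 },
   i.e. the bytes are reversed within each element.  */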
15479
15480 rtx
15481 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
15482 {
15483 /* We have to reverse each vector because we don't have
15484 a permuted load that can reverse-load according to ABI rules. */
15485 rtx mask;
15486 rtvec v = rtvec_alloc (16);
15487 unsigned int i, j;
15488 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
15489
15490 gcc_assert (BYTES_BIG_ENDIAN);
15491 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
15492
15493 for (i = 0; i < nunits; i++)
15494 for (j = 0; j < usize; j++)
15495 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
15496 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
15497 return force_reg (V16QImode, mask);
15498 }
15499
15500 /* Return true if X is a valid second operand for the SVE instruction
15501 that implements integer comparison OP_CODE. */
15502
15503 static bool
15504 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
15505 {
15506 if (register_operand (x, VOIDmode))
15507 return true;
15508
15509 switch (op_code)
15510 {
15511 case LTU:
15512 case LEU:
15513 case GEU:
15514 case GTU:
15515 return aarch64_sve_cmp_immediate_p (x, false);
15516 case LT:
15517 case LE:
15518 case GE:
15519 case GT:
15520 case NE:
15521 case EQ:
15522 return aarch64_sve_cmp_immediate_p (x, true);
15523 default:
15524 gcc_unreachable ();
15525 }
15526 }
15527
15528 /* Use predicated SVE instructions to implement the equivalent of:
15529
15530 (set TARGET OP)
15531
15532 given that PTRUE is an all-true predicate of the appropriate mode. */
15533
15534 static void
15535 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
15536 {
15537 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15538 gen_rtvec (2, ptrue, op),
15539 UNSPEC_MERGE_PTRUE);
15540 rtx_insn *insn = emit_set_insn (target, unspec);
15541 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15542 }
15543
15544 /* Likewise, but also clobber the condition codes. */
15545
15546 static void
15547 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
15548 {
15549 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15550 gen_rtvec (2, ptrue, op),
15551 UNSPEC_MERGE_PTRUE);
15552 rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
15553 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15554 }
15555
15556 /* Return the UNSPEC_COND_* code for comparison CODE. */
15557
15558 static unsigned int
15559 aarch64_unspec_cond_code (rtx_code code)
15560 {
15561 switch (code)
15562 {
15563 case NE:
15564 return UNSPEC_COND_NE;
15565 case EQ:
15566 return UNSPEC_COND_EQ;
15567 case LT:
15568 return UNSPEC_COND_LT;
15569 case GT:
15570 return UNSPEC_COND_GT;
15571 case LE:
15572 return UNSPEC_COND_LE;
15573 case GE:
15574 return UNSPEC_COND_GE;
15575 default:
15576 gcc_unreachable ();
15577 }
15578 }
15579
15580 /* Emit:
15581
15582 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
15583
15584 where <X> is the operation associated with comparison CODE. This form
15585 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
15586 semantics, such as when PRED might not be all-true and when comparing
15587 inactive lanes could have side effects. */
15588
15589 static void
15590 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
15591 rtx pred, rtx op0, rtx op1)
15592 {
15593 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
15594 gen_rtvec (3, pred, op0, op1),
15595 aarch64_unspec_cond_code (code));
15596 emit_set_insn (target, unspec);
15597 }
15598
15599 /* Expand an SVE integer comparison using the SVE equivalent of:
15600
15601 (set TARGET (CODE OP0 OP1)). */
15602
15603 void
15604 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
15605 {
15606 machine_mode pred_mode = GET_MODE (target);
15607 machine_mode data_mode = GET_MODE (op0);
15608
15609 if (!aarch64_sve_cmp_operand_p (code, op1))
15610 op1 = force_reg (data_mode, op1);
15611
15612 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15613 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15614 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
15615 }
15616
15617 /* Emit the SVE equivalent of:
15618
15619 (set TMP1 (CODE1 OP0 OP1))
15620 (set TMP2 (CODE2 OP0 OP1))
15621 (set TARGET (ior:PRED_MODE TMP1 TMP2))
15622
15623 PTRUE is an all-true predicate with the same mode as TARGET. */
15624
15625 static void
15626 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
15627 rtx ptrue, rtx op0, rtx op1)
15628 {
15629 machine_mode pred_mode = GET_MODE (ptrue);
15630 rtx tmp1 = gen_reg_rtx (pred_mode);
15631 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
15632 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
15633 rtx tmp2 = gen_reg_rtx (pred_mode);
15634 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
15635 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
15636 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
15637 }
15638
15639 /* Emit the SVE equivalent of:
15640
15641 (set TMP (CODE OP0 OP1))
15642 (set TARGET (not TMP))
15643
15644 PTRUE is an all-true predicate with the same mode as TARGET. */
15645
15646 static void
15647 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
15648 rtx op0, rtx op1)
15649 {
15650 machine_mode pred_mode = GET_MODE (ptrue);
15651 rtx tmp = gen_reg_rtx (pred_mode);
15652 aarch64_emit_sve_ptrue_op (tmp, ptrue,
15653 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
15654 aarch64_emit_unop (target, one_cmpl_optab, tmp);
15655 }
15656
15657 /* Expand an SVE floating-point comparison using the SVE equivalent of:
15658
15659 (set TARGET (CODE OP0 OP1))
15660
15661 If CAN_INVERT_P is true, the caller can also handle inverted results;
15662 return true if the result is in fact inverted. */
15663
15664 bool
15665 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
15666 rtx op0, rtx op1, bool can_invert_p)
15667 {
15668 machine_mode pred_mode = GET_MODE (target);
15669 machine_mode data_mode = GET_MODE (op0);
15670
15671 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15672 switch (code)
15673 {
15674 case UNORDERED:
15675 /* UNORDERED has no immediate form. */
15676 op1 = force_reg (data_mode, op1);
15677 /* fall through */
15678 case LT:
15679 case LE:
15680 case GT:
15681 case GE:
15682 case EQ:
15683 case NE:
15684 {
15685 /* There is native support for the comparison. */
15686 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15687 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
15688 return false;
15689 }
15690
15691 case LTGT:
15692 /* This is a trapping operation (LT or GT). */
15693 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
15694 return false;
15695
15696 case UNEQ:
15697 if (!flag_trapping_math)
15698 {
15699 /* This would trap for signaling NaNs. */
15700 op1 = force_reg (data_mode, op1);
15701 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
15702 return false;
15703 }
15704 /* fall through */
15705 case UNLT:
15706 case UNLE:
15707 case UNGT:
15708 case UNGE:
15709 if (flag_trapping_math)
15710 {
15711 /* Work out which elements are ordered. */
15712 rtx ordered = gen_reg_rtx (pred_mode);
15713 op1 = force_reg (data_mode, op1);
15714 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
15715
15716 /* Test the opposite condition for the ordered elements,
15717 then invert the result. */
15718 if (code == UNEQ)
15719 code = NE;
15720 else
15721 code = reverse_condition_maybe_unordered (code);
15722 if (can_invert_p)
15723 {
15724 aarch64_emit_sve_predicated_cond (target, code,
15725 ordered, op0, op1);
15726 return true;
15727 }
15728 rtx tmp = gen_reg_rtx (pred_mode);
15729 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
15730 aarch64_emit_unop (target, one_cmpl_optab, tmp);
15731 return false;
15732 }
15733 break;
15734
15735 case ORDERED:
15736 /* ORDERED has no immediate form. */
15737 op1 = force_reg (data_mode, op1);
15738 break;
15739
15740 default:
15741 gcc_unreachable ();
15742 }
15743
15744 /* There is native support for the inverse comparison. */
15745 code = reverse_condition_maybe_unordered (code);
15746 if (can_invert_p)
15747 {
15748 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15749 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
15750 return true;
15751 }
15752 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
15753 return false;
15754 }
15755
15756 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
15757 of the data being selected and CMP_MODE is the mode of the values being
15758 compared. */
15759
15760 void
15761 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
15762 rtx *ops)
15763 {
15764 machine_mode pred_mode
15765 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
15766 GET_MODE_SIZE (cmp_mode)).require ();
15767 rtx pred = gen_reg_rtx (pred_mode);
15768 if (FLOAT_MODE_P (cmp_mode))
15769 {
15770 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
15771 ops[4], ops[5], true))
15772 std::swap (ops[1], ops[2]);
15773 }
15774 else
15775 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
15776
15777 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
15778 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
15779 }
15780
15781 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
15782 true. However due to issues with register allocation it is preferable
15783 to avoid tying integer scalar and FP scalar modes. Executing integer
15784 operations in general registers is better than treating them as scalar
15785 vector operations. This reduces latency and avoids redundant int<->FP
15786 moves. So tie modes if they are either the same class, or vector modes
15787 with other vector modes, vector structs or any scalar mode. */
15788
15789 static bool
15790 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
15791 {
15792 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
15793 return true;
15794
15795 /* We specifically want to allow elements of "structure" modes to
15796 be tieable to the structure. This more general condition allows
15797 other rarer situations too. The reason we don't extend this to
15798 predicate modes is that there are no predicate structure modes
15799 nor any specific instructions for extracting part of a predicate
15800 register. */
15801 if (aarch64_vector_data_mode_p (mode1)
15802 && aarch64_vector_data_mode_p (mode2))
15803 return true;
15804
15805 /* Also allow any scalar modes with vectors. */
15806 if (aarch64_vector_mode_supported_p (mode1)
15807 || aarch64_vector_mode_supported_p (mode2))
15808 return true;
15809
15810 return false;
15811 }
15812
15813 /* Return a new RTX holding the result of moving POINTER forward by
15814 AMOUNT bytes. */
15815
15816 static rtx
15817 aarch64_move_pointer (rtx pointer, poly_int64 amount)
15818 {
15819 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
15820
15821 return adjust_automodify_address (pointer, GET_MODE (pointer),
15822 next, amount);
15823 }
15824
15825 /* Return a new RTX holding the result of moving POINTER forward by the
15826 size of the mode it points to. */
15827
15828 static rtx
15829 aarch64_progress_pointer (rtx pointer)
15830 {
15831 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
15832 }
15833
15834 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
15835 MODE bytes. */
15836
15837 static void
15838 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
15839 machine_mode mode)
15840 {
15841 rtx reg = gen_reg_rtx (mode);
15842
15843 /* "Cast" the pointers to the correct mode. */
15844 *src = adjust_address (*src, mode, 0);
15845 *dst = adjust_address (*dst, mode, 0);
15846 /* Emit the memcpy. */
15847 emit_move_insn (reg, *src);
15848 emit_move_insn (*dst, reg);
15849 /* Move the pointers forward. */
15850 *src = aarch64_progress_pointer (*src);
15851 *dst = aarch64_progress_pointer (*dst);
15852 }
15853
15854 /* Expand movmem, as if from a __builtin_memcpy. Return true if
15855 we succeed, otherwise return false. */
15856
15857 bool
15858 aarch64_expand_movmem (rtx *operands)
15859 {
15860 int n, mode_bits;
15861 rtx dst = operands[0];
15862 rtx src = operands[1];
15863 rtx base;
15864 machine_mode cur_mode = BLKmode, next_mode;
15865 bool speed_p = !optimize_function_for_size_p (cfun);
15866
15867 /* When optimizing for size, give a better estimate of the length of a
15868 memcpy call, but use the default otherwise. Moves larger than 8 bytes
15869 will always require an even number of instructions to perform. And each
15870 operation requires both a load and a store, so divide the max number by 2. */
15871 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
15872
15873 /* We can't do anything smart if the amount to copy is not constant. */
15874 if (!CONST_INT_P (operands[2]))
15875 return false;
15876
15877 n = INTVAL (operands[2]);
15878
15879 /* Try to keep the number of instructions low. For all cases we will do at
15880 most two moves for the residual amount, since we'll always overlap the
15881 remainder. */
15882 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
15883 return false;
15884
15885 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
15886 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
15887
15888 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
15889 src = adjust_automodify_address (src, VOIDmode, base, 0);
15890
15891 /* Convert n to bits to make the rest of the code simpler. */
15892 n = n * BITS_PER_UNIT;
15893
15894 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
15895 larger than TImode, but we should not use them for loads/stores here. */
15896 const int copy_limit = GET_MODE_BITSIZE (TImode);
15897
15898 while (n > 0)
15899 {
15900 /* Find the largest mode in which to do the copy without over-reading
15901 or over-writing. */
15902 opt_scalar_int_mode mode_iter;
15903 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
15904 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
15905 cur_mode = mode_iter.require ();
15906
15907 gcc_assert (cur_mode != BLKmode);
15908
15909 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
15910 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
15911
15912 n -= mode_bits;
15913
15914 /* Do certain trailing copies as overlapping if it's going to be
15915 cheaper, i.e. fewer instructions to do so. For instance, for a 15
15916 byte copy it's more efficient to do two overlapping 8 byte copies than
15917 8 + 6 + 1. */
15918 if (n > 0 && n <= 8 * BITS_PER_UNIT)
15919 {
15920 next_mode = smallest_mode_for_size (n, MODE_INT);
15921 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
15922 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
15923 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
15924 n = n_bits;
15925 }
15926 }
15927
15928 return true;
15929 }
15930
15931 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
15932 SImode stores. Handle the case when the constant has identical
15933 bottom and top halves. This is beneficial when the two stores can be
15934 merged into an STP and we avoid synthesising potentially expensive
15935 immediates twice. Return true if such a split is possible. */
15936
15937 bool
15938 aarch64_split_dimode_const_store (rtx dst, rtx src)
15939 {
15940 rtx lo = gen_lowpart (SImode, src);
15941 rtx hi = gen_highpart_mode (SImode, DImode, src);
15942
15943 bool size_p = optimize_function_for_size_p (cfun);
15944
15945 if (!rtx_equal_p (lo, hi))
15946 return false;
15947
15948 unsigned int orig_cost
15949 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
15950 unsigned int lo_cost
15951 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
15952
15953 /* We want to transform:
15954 MOV x1, 49370
15955 MOVK x1, 0x140, lsl 16
15956 MOVK x1, 0xc0da, lsl 32
15957 MOVK x1, 0x140, lsl 48
15958 STR x1, [x0]
15959 into:
15960 MOV w1, 49370
15961 MOVK w1, 0x140, lsl 16
15962 STP w1, w1, [x0]
15963 So we want to perform this only when we save two instructions
15964 or more. When optimizing for size, however, accept any code size
15965 savings we can. */
15966 if (size_p && orig_cost <= lo_cost)
15967 return false;
15968
15969 if (!size_p
15970 && (orig_cost <= lo_cost + 1))
15971 return false;
15972
15973 rtx mem_lo = adjust_address (dst, SImode, 0);
15974 if (!aarch64_mem_pair_operand (mem_lo, SImode))
15975 return false;
15976
15977 rtx tmp_reg = gen_reg_rtx (SImode);
15978 aarch64_expand_mov_immediate (tmp_reg, lo);
15979 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
15980 /* Don't emit an explicit store pair as this may not always be profitable.
15981 Let the sched-fusion logic decide whether to merge them. */
15982 emit_move_insn (mem_lo, tmp_reg);
15983 emit_move_insn (mem_hi, tmp_reg);
15984
15985 return true;
15986 }
15987
15988 /* Generate RTL for a conditional branch with rtx comparison CODE in
15989 mode CC_MODE. The destination of the unlikely conditional branch
15990 is LABEL_REF. */
15991
15992 void
15993 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
15994 rtx label_ref)
15995 {
15996 rtx x;
15997 x = gen_rtx_fmt_ee (code, VOIDmode,
15998 gen_rtx_REG (cc_mode, CC_REGNUM),
15999 const0_rtx);
16000
16001 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16002 gen_rtx_LABEL_REF (VOIDmode, label_ref),
16003 pc_rtx);
16004 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16005 }
16006
16007 /* Generate DImode scratch registers for 128-bit (TImode) addition.
16008
16009 OP1 represents the TImode source operand 1
16010 OP2 represents the TImode source operand 2
16011 LOW_DEST represents the low half (DImode) of TImode operand 0
16012 LOW_IN1 represents the low half (DImode) of TImode operand 1
16013 LOW_IN2 represents the low half (DImode) of TImode operand 2
16014 HIGH_DEST represents the high half (DImode) of TImode operand 0
16015 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16016 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16017
16018 void
16019 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
16020 rtx *low_in1, rtx *low_in2,
16021 rtx *high_dest, rtx *high_in1,
16022 rtx *high_in2)
16023 {
16024 *low_dest = gen_reg_rtx (DImode);
16025 *low_in1 = gen_lowpart (DImode, op1);
16026 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
16027 subreg_lowpart_offset (DImode, TImode));
16028 *high_dest = gen_reg_rtx (DImode);
16029 *high_in1 = gen_highpart (DImode, op1);
16030 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
16031 subreg_highpart_offset (DImode, TImode));
16032 }
16033
16034 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
16035
16036 This function differs from 'aarch64_addti_scratch_regs' in that
16037 OP1 can be an immediate constant (zero). We must call
16038 subreg_highpart_offset with DImode and TImode arguments, otherwise
16039 VOIDmode will be used for the const_int, which generates an internal
16040 error from subreg_size_highpart_offset, which does not expect a size of zero.
16041
16042 OP1 represents the TImode source operand 1
16043 OP2 represents the TImode source operand 2
16044 LOW_DEST represents the low half (DImode) of TImode operand 0
16045 LOW_IN1 represents the low half (DImode) of TImode operand 1
16046 LOW_IN2 represents the low half (DImode) of TImode operand 2
16047 HIGH_DEST represents the high half (DImode) of TImode operand 0
16048 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16049 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16050
16051
16052 void
16053 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
16054 rtx *low_in1, rtx *low_in2,
16055 rtx *high_dest, rtx *high_in1,
16056 rtx *high_in2)
16057 {
16058 *low_dest = gen_reg_rtx (DImode);
16059 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
16060 subreg_lowpart_offset (DImode, TImode));
16061
16062 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
16063 subreg_lowpart_offset (DImode, TImode));
16064 *high_dest = gen_reg_rtx (DImode);
16065
16066 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
16067 subreg_highpart_offset (DImode, TImode));
16068 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
16069 subreg_highpart_offset (DImode, TImode));
16070 }
16071
16072 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
16073
16074 OP0 represents the TImode destination operand 0
16075 LOW_DEST represents the low half (DImode) of TImode operand 0
16076 LOW_IN1 represents the low half (DImode) of TImode operand 1
16077 LOW_IN2 represents the low half (DImode) of TImode operand 2
16078 HIGH_DEST represents the high half (DImode) of TImode operand 0
16079 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16080 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16081
16082 void
16083 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
16084 rtx low_in2, rtx high_dest, rtx high_in1,
16085 rtx high_in2)
16086 {
16087 if (low_in2 == const0_rtx)
16088 {
16089 low_dest = low_in1;
16090 emit_insn (gen_subdi3_compare1 (high_dest, high_in1,
16091 force_reg (DImode, high_in2)));
16092 }
16093 else
16094 {
16095 if (CONST_INT_P (low_in2))
16096 {
16097 low_in2 = force_reg (DImode, GEN_INT (-UINTVAL (low_in2)));
16098 high_in2 = force_reg (DImode, high_in2);
16099 emit_insn (gen_adddi3_compareC (low_dest, low_in1, low_in2));
16100 }
16101 else
16102 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
16103 emit_insn (gen_subdi3_carryinCV (high_dest,
16104 force_reg (DImode, high_in1),
16105 high_in2));
16106 }
16107
16108 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
16109 emit_move_insn (gen_highpart (DImode, op0), high_dest);
16110
16111 }
16112
16113 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
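/* A sketch of the resulting mapping, assuming the usual ASan shadow
   granularity of 8 bytes (a shift of 3):

      shadow_address = (address >> 3) + (1 << 36).  */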
16114
16115 static unsigned HOST_WIDE_INT
16116 aarch64_asan_shadow_offset (void)
16117 {
16118 return (HOST_WIDE_INT_1 << 36);
16119 }
16120
16121 static rtx
16122 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
16123 int code, tree treeop0, tree treeop1)
16124 {
16125 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16126 rtx op0, op1;
16127 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16128 insn_code icode;
16129 struct expand_operand ops[4];
16130
16131 start_sequence ();
16132 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16133
16134 op_mode = GET_MODE (op0);
16135 if (op_mode == VOIDmode)
16136 op_mode = GET_MODE (op1);
16137
16138 switch (op_mode)
16139 {
16140 case E_QImode:
16141 case E_HImode:
16142 case E_SImode:
16143 cmp_mode = SImode;
16144 icode = CODE_FOR_cmpsi;
16145 break;
16146
16147 case E_DImode:
16148 cmp_mode = DImode;
16149 icode = CODE_FOR_cmpdi;
16150 break;
16151
16152 case E_SFmode:
16153 cmp_mode = SFmode;
16154 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16155 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
16156 break;
16157
16158 case E_DFmode:
16159 cmp_mode = DFmode;
16160 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16161 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
16162 break;
16163
16164 default:
16165 end_sequence ();
16166 return NULL_RTX;
16167 }
16168
16169 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
16170 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
16171 if (!op0 || !op1)
16172 {
16173 end_sequence ();
16174 return NULL_RTX;
16175 }
16176 *prep_seq = get_insns ();
16177 end_sequence ();
16178
16179 create_fixed_operand (&ops[0], op0);
16180 create_fixed_operand (&ops[1], op1);
16181
16182 start_sequence ();
16183 if (!maybe_expand_insn (icode, 2, ops))
16184 {
16185 end_sequence ();
16186 return NULL_RTX;
16187 }
16188 *gen_seq = get_insns ();
16189 end_sequence ();
16190
16191 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
16192 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
16193 }
16194
16195 static rtx
16196 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
16197 int cmp_code, tree treeop0, tree treeop1, int bit_code)
16198 {
16199 rtx op0, op1, target;
16200 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16201 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16202 insn_code icode;
16203 struct expand_operand ops[6];
16204 int aarch64_cond;
16205
16206 push_to_sequence (*prep_seq);
16207 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16208
16209 op_mode = GET_MODE (op0);
16210 if (op_mode == VOIDmode)
16211 op_mode = GET_MODE (op1);
16212
16213 switch (op_mode)
16214 {
16215 case E_QImode:
16216 case E_HImode:
16217 case E_SImode:
16218 cmp_mode = SImode;
16219 icode = CODE_FOR_ccmpsi;
16220 break;
16221
16222 case E_DImode:
16223 cmp_mode = DImode;
16224 icode = CODE_FOR_ccmpdi;
16225 break;
16226
16227 case E_SFmode:
16228 cmp_mode = SFmode;
16229 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16230 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
16231 break;
16232
16233 case E_DFmode:
16234 cmp_mode = DFmode;
16235 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16236 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
16237 break;
16238
16239 default:
16240 end_sequence ();
16241 return NULL_RTX;
16242 }
16243
16244 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
16245 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
16246 if (!op0 || !op1)
16247 {
16248 end_sequence ();
16249 return NULL_RTX;
16250 }
16251 *prep_seq = get_insns ();
16252 end_sequence ();
16253
16254 target = gen_rtx_REG (cc_mode, CC_REGNUM);
16255 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
16256
16257 if (bit_code != AND)
16258 {
16259 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
16260 GET_MODE (XEXP (prev, 0))),
16261 VOIDmode, XEXP (prev, 0), const0_rtx);
16262 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
16263 }
16264
16265 create_fixed_operand (&ops[0], XEXP (prev, 0));
16266 create_fixed_operand (&ops[1], target);
16267 create_fixed_operand (&ops[2], op0);
16268 create_fixed_operand (&ops[3], op1);
16269 create_fixed_operand (&ops[4], prev);
16270 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
16271
16272 push_to_sequence (*gen_seq);
16273 if (!maybe_expand_insn (icode, 6, ops))
16274 {
16275 end_sequence ();
16276 return NULL_RTX;
16277 }
16278
16279 *gen_seq = get_insns ();
16280 end_sequence ();
16281
16282 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
16283 }
16284
16285 #undef TARGET_GEN_CCMP_FIRST
16286 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16287
16288 #undef TARGET_GEN_CCMP_NEXT
16289 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16290
16291 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if the target supports
16292 instruction fusion of some sort. */
16293
16294 static bool
16295 aarch64_macro_fusion_p (void)
16296 {
16297 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
16298 }
16299
16300
16301 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
16302 should be kept together during scheduling. */
16303
16304 static bool
16305 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
16306 {
16307 rtx set_dest;
16308 rtx prev_set = single_set (prev);
16309 rtx curr_set = single_set (curr);
16310 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
16311 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
16312
16313 if (!aarch64_macro_fusion_p ())
16314 return false;
16315
16316 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
16317 {
16318 /* We are trying to match:
16319 prev (mov) == (set (reg r0) (const_int imm16))
16320 curr (movk) == (set (zero_extract (reg r0)
16321 (const_int 16)
16322 (const_int 16))
16323 (const_int imm16_1)) */
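/* E.g. (illustrative, hypothetical operands), the fused pair in assembly form:
   mov  w0, #0x1234
   movk w0, #0x5678, lsl #16  */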
16324
16325 set_dest = SET_DEST (curr_set);
16326
16327 if (GET_CODE (set_dest) == ZERO_EXTRACT
16328 && CONST_INT_P (SET_SRC (curr_set))
16329 && CONST_INT_P (SET_SRC (prev_set))
16330 && CONST_INT_P (XEXP (set_dest, 2))
16331 && INTVAL (XEXP (set_dest, 2)) == 16
16332 && REG_P (XEXP (set_dest, 0))
16333 && REG_P (SET_DEST (prev_set))
16334 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
16335 {
16336 return true;
16337 }
16338 }
16339
16340 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
16341 {
16342
16343 /* We're trying to match:
16344 prev (adrp) == (set (reg r1)
16345 (high (symbol_ref ("SYM"))))
16346 curr (add) == (set (reg r0)
16347 (lo_sum (reg r1)
16348 (symbol_ref ("SYM"))))
16349 Note that r0 need not necessarily be the same as r1, especially
16350 during pre-regalloc scheduling. */
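/* E.g. (illustrative):
   adrp x1, SYM
   add  x0, x1, :lo12:SYM  */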
16351
16352 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16353 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16354 {
16355 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
16356 && REG_P (XEXP (SET_SRC (curr_set), 0))
16357 && REGNO (XEXP (SET_SRC (curr_set), 0))
16358 == REGNO (SET_DEST (prev_set))
16359 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
16360 XEXP (SET_SRC (curr_set), 1)))
16361 return true;
16362 }
16363 }
16364
16365 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
16366 {
16367
16368 /* We're trying to match:
16369 prev (movk) == (set (zero_extract (reg r0)
16370 (const_int 16)
16371 (const_int 32))
16372 (const_int imm16_1))
16373 curr (movk) == (set (zero_extract (reg r0)
16374 (const_int 16)
16375 (const_int 48))
16376 (const_int imm16_2)) */
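/* E.g. (illustrative, hypothetical immediates):
   movk x0, #0x1234, lsl #32
   movk x0, #0x5678, lsl #48  */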
16377
16378 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
16379 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
16380 && REG_P (XEXP (SET_DEST (prev_set), 0))
16381 && REG_P (XEXP (SET_DEST (curr_set), 0))
16382 && REGNO (XEXP (SET_DEST (prev_set), 0))
16383 == REGNO (XEXP (SET_DEST (curr_set), 0))
16384 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
16385 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
16386 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
16387 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
16388 && CONST_INT_P (SET_SRC (prev_set))
16389 && CONST_INT_P (SET_SRC (curr_set)))
16390 return true;
16391
16392 }
16393 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
16394 {
16395 /* We're trying to match:
16396 prev (adrp) == (set (reg r0)
16397 (high (symbol_ref ("SYM"))))
16398 curr (ldr) == (set (reg r1)
16399 (mem (lo_sum (reg r0)
16400 (symbol_ref ("SYM")))))
16401 or
16402 curr (ldr) == (set (reg r1)
16403 (zero_extend (mem
16404 (lo_sum (reg r0)
16405 (symbol_ref ("SYM")))))) */
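/* E.g. (illustrative):
   adrp x0, SYM
   ldr  x1, [x0, :lo12:SYM]  */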
16406 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16407 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16408 {
16409 rtx curr_src = SET_SRC (curr_set);
16410
16411 if (GET_CODE (curr_src) == ZERO_EXTEND)
16412 curr_src = XEXP (curr_src, 0);
16413
16414 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
16415 && REG_P (XEXP (XEXP (curr_src, 0), 0))
16416 && REGNO (XEXP (XEXP (curr_src, 0), 0))
16417 == REGNO (SET_DEST (prev_set))
16418 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
16419 XEXP (SET_SRC (prev_set), 0)))
16420 return true;
16421 }
16422 }
16423
16424 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
16425 && aarch_crypto_can_dual_issue (prev, curr))
16426 return true;
16427
16428 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
16429 && any_condjump_p (curr))
16430 {
16431 enum attr_type prev_type = get_attr_type (prev);
16432
16433 unsigned int condreg1, condreg2;
16434 rtx cc_reg_1;
16435 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
16436 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
16437
16438 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
16439 && prev
16440 && modified_in_p (cc_reg_1, prev))
16441 {
16442 /* FIXME: this misses some instructions that ThunderX considers simple
16443 arithmetic instructions. Simple shifts are missed here. */
16444 if (prev_type == TYPE_ALUS_SREG
16445 || prev_type == TYPE_ALUS_IMM
16446 || prev_type == TYPE_LOGICS_REG
16447 || prev_type == TYPE_LOGICS_IMM)
16448 return true;
16449 }
16450 }
16451
16452 if (prev_set
16453 && curr_set
16454 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
16455 && any_condjump_p (curr))
16456 {
16457 /* We're trying to match:
16458 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16459 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
16460 (const_int 0))
16461 (label_ref ("SYM"))
16462 (pc)) */
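/* E.g. (illustrative, hypothetical operands):
   add x0, x0, #1
   cbz x0, .Ltarget  */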
16463 if (SET_DEST (curr_set) == (pc_rtx)
16464 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
16465 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
16466 && REG_P (SET_DEST (prev_set))
16467 && REGNO (SET_DEST (prev_set))
16468 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
16469 {
16470 /* Fuse ALU operations followed by conditional branch instruction. */
16471 switch (get_attr_type (prev))
16472 {
16473 case TYPE_ALU_IMM:
16474 case TYPE_ALU_SREG:
16475 case TYPE_ADC_REG:
16476 case TYPE_ADC_IMM:
16477 case TYPE_ADCS_REG:
16478 case TYPE_ADCS_IMM:
16479 case TYPE_LOGIC_REG:
16480 case TYPE_LOGIC_IMM:
16481 case TYPE_CSEL:
16482 case TYPE_ADR:
16483 case TYPE_MOV_IMM:
16484 case TYPE_SHIFT_REG:
16485 case TYPE_SHIFT_IMM:
16486 case TYPE_BFM:
16487 case TYPE_RBIT:
16488 case TYPE_REV:
16489 case TYPE_EXTEND:
16490 return true;
16491
16492 default:;
16493 }
16494 }
16495 }
16496
16497 return false;
16498 }
16499
16500 /* Return true iff the instruction fusion described by OP is enabled. */
16501
16502 bool
16503 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
16504 {
16505 return (aarch64_tune_params.fusible_ops & op) != 0;
16506 }
16507
16508 /* If MEM is in the form of [base+offset], extract the two parts of the
16509 address, store them in BASE and OFFSET, and return true; otherwise
16510 return false after clearing BASE and OFFSET. */
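/* For example (illustrative), (mem (plus (reg x1) (const_int 16))) gives
BASE = x1 and OFFSET = 16, while a plain (mem (reg x1)) gives BASE = x1
and OFFSET = 0.  */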
16511
16512 bool
16513 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
16514 {
16515 rtx addr;
16516
16517 gcc_assert (MEM_P (mem));
16518
16519 addr = XEXP (mem, 0);
16520
16521 if (REG_P (addr))
16522 {
16523 *base = addr;
16524 *offset = const0_rtx;
16525 return true;
16526 }
16527
16528 if (GET_CODE (addr) == PLUS
16529 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
16530 {
16531 *base = XEXP (addr, 0);
16532 *offset = XEXP (addr, 1);
16533 return true;
16534 }
16535
16536 *base = NULL_RTX;
16537 *offset = NULL_RTX;
16538
16539 return false;
16540 }
16541
16542 /* Types for scheduling fusion. */
16543 enum sched_fusion_type
16544 {
16545 SCHED_FUSION_NONE = 0,
16546 SCHED_FUSION_LD_SIGN_EXTEND,
16547 SCHED_FUSION_LD_ZERO_EXTEND,
16548 SCHED_FUSION_LD,
16549 SCHED_FUSION_ST,
16550 SCHED_FUSION_NUM
16551 };
16552
16553 /* If INSN is a load or store whose address is in the form of [base+offset],
16554 extract the two parts into BASE and OFFSET. Return the scheduling
16555 fusion type of this INSN. */
16556
16557 static enum sched_fusion_type
16558 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
16559 {
16560 rtx x, dest, src;
16561 enum sched_fusion_type fusion = SCHED_FUSION_LD;
16562
16563 gcc_assert (INSN_P (insn));
16564 x = PATTERN (insn);
16565 if (GET_CODE (x) != SET)
16566 return SCHED_FUSION_NONE;
16567
16568 src = SET_SRC (x);
16569 dest = SET_DEST (x);
16570
16571 machine_mode dest_mode = GET_MODE (dest);
16572
16573 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
16574 return SCHED_FUSION_NONE;
16575
16576 if (GET_CODE (src) == SIGN_EXTEND)
16577 {
16578 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
16579 src = XEXP (src, 0);
16580 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16581 return SCHED_FUSION_NONE;
16582 }
16583 else if (GET_CODE (src) == ZERO_EXTEND)
16584 {
16585 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
16586 src = XEXP (src, 0);
16587 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16588 return SCHED_FUSION_NONE;
16589 }
16590
16591 if (GET_CODE (src) == MEM && REG_P (dest))
16592 extract_base_offset_in_addr (src, base, offset);
16593 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
16594 {
16595 fusion = SCHED_FUSION_ST;
16596 extract_base_offset_in_addr (dest, base, offset);
16597 }
16598 else
16599 return SCHED_FUSION_NONE;
16600
16601 if (*base == NULL_RTX || *offset == NULL_RTX)
16602 fusion = SCHED_FUSION_NONE;
16603
16604 return fusion;
16605 }
16606
16607 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16608
16609 Currently we only support fusing ldr and str instructions, so FUSION_PRI
16610 and PRI are only calculated for these instructions. For other instructions,
16611 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
16612 types of instruction fusion can be added by returning different priorities.
16613
16614 It's important that irrelevant instructions get the largest FUSION_PRI. */
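/* As an illustrative sketch: two loads such as ldr w0, [x1, 8] and
ldr w2, [x1, 12] are both classified as SCHED_FUSION_LD with base x1, so
they receive the same FUSION_PRI, while their PRIs differ only by their
offsets; the scheduler therefore tends to place them next to each other,
where the ldp peephole can pair them.  */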
16615
16616 static void
16617 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
16618 int *fusion_pri, int *pri)
16619 {
16620 int tmp, off_val;
16621 rtx base, offset;
16622 enum sched_fusion_type fusion;
16623
16624 gcc_assert (INSN_P (insn));
16625
16626 tmp = max_pri - 1;
16627 fusion = fusion_load_store (insn, &base, &offset);
16628 if (fusion == SCHED_FUSION_NONE)
16629 {
16630 *pri = tmp;
16631 *fusion_pri = tmp;
16632 return;
16633 }
16634
16635 /* Set FUSION_PRI according to fusion type and base register. */
16636 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
16637
16638 /* Calculate PRI. */
16639 tmp /= 2;
16640
16641 /* INSN with smaller offset goes first. */
16642 off_val = (int)(INTVAL (offset));
16643 if (off_val >= 0)
16644 tmp -= (off_val & 0xfffff);
16645 else
16646 tmp += ((- off_val) & 0xfffff);
16647
16648 *pri = tmp;
16649 return;
16650 }
16651
16652 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16653 Adjust priority of sha1h instructions so they are scheduled before
16654 other SHA1 instructions. */
16655
16656 static int
16657 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
16658 {
16659 rtx x = PATTERN (insn);
16660
16661 if (GET_CODE (x) == SET)
16662 {
16663 x = SET_SRC (x);
16664
16665 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
16666 return priority + 10;
16667 }
16668
16669 return priority;
16670 }
16671
16672 /* Given OPERANDS of consecutive load/store, check if we can merge
16673 them into ldp/stp. LOAD is true if they are load instructions.
16674 MODE is the mode of memory operands. */
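/* For example (illustrative), the consecutive loads
   ldr w0, [x2]
   ldr w1, [x2, 4]
can be merged into
   ldp w0, w1, [x2]
provided the checks below succeed.  */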
16675
16676 bool
16677 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
16678 machine_mode mode)
16679 {
16680 HOST_WIDE_INT offval_1, offval_2, msize;
16681 enum reg_class rclass_1, rclass_2;
16682 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
16683
16684 if (load)
16685 {
16686 mem_1 = operands[1];
16687 mem_2 = operands[3];
16688 reg_1 = operands[0];
16689 reg_2 = operands[2];
16690 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
16691 if (REGNO (reg_1) == REGNO (reg_2))
16692 return false;
16693 }
16694 else
16695 {
16696 mem_1 = operands[0];
16697 mem_2 = operands[2];
16698 reg_1 = operands[1];
16699 reg_2 = operands[3];
16700 }
16701
16702 /* The mems cannot be volatile. */
16703 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
16704 return false;
16705
16706 /* If we have SImode and slow unaligned ldp,
16707 check that the alignment is at least 8 bytes. */
16708 if (mode == SImode
16709 && (aarch64_tune_params.extra_tuning_flags
16710 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16711 && !optimize_size
16712 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16713 return false;
16714
16715 /* Check if the addresses are in the form of [base+offset]. */
16716 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16717 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16718 return false;
16719 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16720 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16721 return false;
16722
16723 /* Check if the bases are the same. */
16724 if (!rtx_equal_p (base_1, base_2))
16725 return false;
16726
16727 /* The operands must be of the same size. */
16728 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
16729 GET_MODE_SIZE (GET_MODE (mem_2))));
16730
16731 offval_1 = INTVAL (offset_1);
16732 offval_2 = INTVAL (offset_2);
16733 /* We should only be trying this for fixed-sized modes. There is no
16734 SVE LDP/STP instruction. */
16735 msize = GET_MODE_SIZE (mode).to_constant ();
16736 /* Check if the offsets are consecutive. */
16737 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
16738 return false;
16739
16740 /* Check if the addresses are clobbered by load. */
16741 if (load)
16742 {
16743 if (reg_mentioned_p (reg_1, mem_1))
16744 return false;
16745
16746 /* In increasing order, the last load can clobber the address. */
16747 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
16748 return false;
16749 }
16750
16751 /* One of the memory accesses must be a mempair operand.
16752 If it is not the first one, they need to be swapped by the
16753 peephole. */
16754 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
16755 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
16756 return false;
16757
16758 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16759 rclass_1 = FP_REGS;
16760 else
16761 rclass_1 = GENERAL_REGS;
16762
16763 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16764 rclass_2 = FP_REGS;
16765 else
16766 rclass_2 = GENERAL_REGS;
16767
16768 /* Check if the registers are of the same class. */
16769 if (rclass_1 != rclass_2)
16770 return false;
16771
16772 return true;
16773 }
16774
16775 /* Given OPERANDS of consecutive load/store that can be merged,
16776 swap them if they are not in ascending order. */
16777 void
16778 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
16779 {
16780 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
16781 HOST_WIDE_INT offval_1, offval_2;
16782
16783 if (load)
16784 {
16785 mem_1 = operands[1];
16786 mem_2 = operands[3];
16787 }
16788 else
16789 {
16790 mem_1 = operands[0];
16791 mem_2 = operands[2];
16792 }
16793
16794 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16795 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16796
16797 offval_1 = INTVAL (offset_1);
16798 offval_2 = INTVAL (offset_2);
16799
16800 if (offval_1 > offval_2)
16801 {
16802 /* Irrespective of whether this is a load or a store,
16803 we do the same swap. */
16804 std::swap (operands[0], operands[2]);
16805 std::swap (operands[1], operands[3]);
16806 }
16807 }
16808
16809 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
16810 comparison between the two. */
16811 int
16812 aarch64_host_wide_int_compare (const void *x, const void *y)
16813 {
16814 return wi::cmps (* ((const HOST_WIDE_INT *) x),
16815 * ((const HOST_WIDE_INT *) y));
16816 }
16817
16818 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
16819 other pointing to a REG rtx containing an offset, compare the offsets
16820 of the two pairs.
16821
16822 Return:
16823
16824 1 iff offset (X) > offset (Y)
16825 0 iff offset (X) == offset (Y)
16826 -1 iff offset (X) < offset (Y) */
16827 int
16828 aarch64_ldrstr_offset_compare (const void *x, const void *y)
16829 {
16830 const rtx * operands_1 = (const rtx *) x;
16831 const rtx * operands_2 = (const rtx *) y;
16832 rtx mem_1, mem_2, base, offset_1, offset_2;
16833
16834 if (MEM_P (operands_1[0]))
16835 mem_1 = operands_1[0];
16836 else
16837 mem_1 = operands_1[1];
16838
16839 if (MEM_P (operands_2[0]))
16840 mem_2 = operands_2[0];
16841 else
16842 mem_2 = operands_2[1];
16843
16844 /* Extract the offsets. */
16845 extract_base_offset_in_addr (mem_1, &base, &offset_1);
16846 extract_base_offset_in_addr (mem_2, &base, &offset_2);
16847
16848 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
16849
16850 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
16851 }
16852
16853 /* Given OPERANDS of consecutive load/store, check if we can merge
16854 them into ldp/stp by adjusting the offset. LOAD is true if they
16855 are load instructions. MODE is the mode of memory operands.
16856
16857 Given below consecutive stores:
16858
16859 str w1, [xb, 0x100]
16860 str w1, [xb, 0x104]
16861 str w1, [xb, 0x108]
16862 str w1, [xb, 0x10c]
16863
16864 Though the offsets are out of the range supported by stp, we can
16865 still pair them after adjusting the offset, like:
16866
16867 add scratch, xb, 0x100
16868 stp w1, w1, [scratch]
16869 stp w1, w1, [scratch, 0x8]
16870
16871 The peephole patterns detecting this opportunity should guarantee
16872 the scratch register is available. */
16873
16874 bool
16875 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
16876 scalar_mode mode)
16877 {
16878 const int num_insns = 4;
16879 enum reg_class rclass;
16880 HOST_WIDE_INT offvals[num_insns], msize;
16881 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
16882
16883 if (load)
16884 {
16885 for (int i = 0; i < num_insns; i++)
16886 {
16887 reg[i] = operands[2 * i];
16888 mem[i] = operands[2 * i + 1];
16889
16890 gcc_assert (REG_P (reg[i]));
16891 }
16892
16893 /* Do not attempt to merge the loads if the loads clobber each other. */
16894 for (int i = 0; i < 8; i += 2)
16895 for (int j = i + 2; j < 8; j += 2)
16896 if (reg_overlap_mentioned_p (operands[i], operands[j]))
16897 return false;
16898 }
16899 else
16900 for (int i = 0; i < num_insns; i++)
16901 {
16902 mem[i] = operands[2 * i];
16903 reg[i] = operands[2 * i + 1];
16904 }
16905
16906 /* Skip if memory operand is by itself valid for ldp/stp. */
16907 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
16908 return false;
16909
16910 for (int i = 0; i < num_insns; i++)
16911 {
16912 /* The mems cannot be volatile. */
16913 if (MEM_VOLATILE_P (mem[i]))
16914 return false;
16915
16916 /* Check if the addresses are in the form of [base+offset]. */
16917 extract_base_offset_in_addr (mem[i], base + i, offset + i);
16918 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
16919 return false;
16920 }
16921
16922 /* Check if the registers are of the same class. */
16923 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
16924 ? FP_REGS : GENERAL_REGS;
16925
16926 for (int i = 1; i < num_insns; i++)
16927 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
16928 {
16929 if (rclass != FP_REGS)
16930 return false;
16931 }
16932 else
16933 {
16934 if (rclass != GENERAL_REGS)
16935 return false;
16936 }
16937
16938 /* Only the last register in the order in which they occur
16939 may be clobbered by the load. */
16940 if (rclass == GENERAL_REGS && load)
16941 for (int i = 0; i < num_insns - 1; i++)
16942 if (reg_mentioned_p (reg[i], mem[i]))
16943 return false;
16944
16945 /* Check if the bases are the same. */
16946 for (int i = 0; i < num_insns - 1; i++)
16947 if (!rtx_equal_p (base[i], base[i + 1]))
16948 return false;
16949
16950 for (int i = 0; i < num_insns; i++)
16951 offvals[i] = INTVAL (offset[i]);
16952
16953 msize = GET_MODE_SIZE (mode);
16954
16955 /* Check if the offsets can be put in the right order to do a ldp/stp. */
16956 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
16957 aarch64_host_wide_int_compare);
16958
16959 if (!(offvals[1] == offvals[0] + msize
16960 && offvals[3] == offvals[2] + msize))
16961 return false;
16962
16963 /* Check that offsets are within range of each other. The ldp/stp
16964 instructions have 7-bit immediate offsets, so use 0x80. */
16965 if (offvals[2] - offvals[0] >= msize * 0x80)
16966 return false;
16967
16968 /* The offsets must be aligned with respect to each other. */
16969 if (offvals[0] % msize != offvals[2] % msize)
16970 return false;
16971
16972 /* If we have SImode and slow unaligned ldp,
16973 check that the alignment is at least 8 bytes. */
16974 if (mode == SImode
16975 && (aarch64_tune_params.extra_tuning_flags
16976 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16977 && !optimize_size
16978 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
16979 return false;
16980
16981 return true;
16982 }
16983
16984 /* Given OPERANDS of consecutive load/store, this function pairs them
16985 into LDP/STP after adjusting the offset. It depends on the fact
16986 that the operands can be sorted so the offsets are correct for STP.
16987 MODE is the mode of memory operands. CODE is the rtl operator
16988 which should be applied to all memory operands, it's SIGN_EXTEND,
16989 ZERO_EXTEND or UNKNOWN. */
16990
16991 bool
16992 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
16993 scalar_mode mode, RTX_CODE code)
16994 {
16995 rtx base, offset_1, offset_3, t1, t2;
16996 rtx mem_1, mem_2, mem_3, mem_4;
16997 rtx temp_operands[8];
16998 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
16999 stp_off_upper_limit, stp_off_lower_limit, msize;
17000
17001 /* We make changes on a copy as we may still bail out. */
17002 for (int i = 0; i < 8; i ++)
17003 temp_operands[i] = operands[i];
17004
17005 /* Sort the operands. */
17006 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
17007
17008 if (load)
17009 {
17010 mem_1 = temp_operands[1];
17011 mem_2 = temp_operands[3];
17012 mem_3 = temp_operands[5];
17013 mem_4 = temp_operands[7];
17014 }
17015 else
17016 {
17017 mem_1 = temp_operands[0];
17018 mem_2 = temp_operands[2];
17019 mem_3 = temp_operands[4];
17020 mem_4 = temp_operands[6];
17021 gcc_assert (code == UNKNOWN);
17022 }
17023
17024 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17025 extract_base_offset_in_addr (mem_3, &base, &offset_3);
17026 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
17027 && offset_3 != NULL_RTX);
17028
17029 /* Adjust offset so it can fit in LDP/STP instruction. */
17030 msize = GET_MODE_SIZE (mode);
17031 stp_off_upper_limit = msize * (0x40 - 1);
17032 stp_off_lower_limit = - msize * 0x40;
17033
17034 off_val_1 = INTVAL (offset_1);
17035 off_val_3 = INTVAL (offset_3);
17036
17037 /* The base offset is optimally halfway between the two STP/LDP offsets. */
17038 if (msize <= 4)
17039 base_off = (off_val_1 + off_val_3) / 2;
17040 else
17041 /* However, due to issues with negative LDP/STP offset generation for
17042 larger modes (DF, DI and vector modes), we must not use negative
17043 addresses smaller than 9 signed unadjusted bits can store. This
17044 provides the most range in this case. */
17045 base_off = off_val_1;
17046
17047 /* Adjust the base so that it is aligned with the addresses but still
17048 optimal. */
17049 if (base_off % msize != off_val_1 % msize)
17050 /* Fix the offset, bearing in mind we want to make it bigger not
17051 smaller. */
17052 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17053 else if (msize <= 4)
17054 /* The negative range of LDP/STP is one larger than the positive range. */
17055 base_off += msize;
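/* A rough worked example (illustrative), using the SImode stores from the
comment above aarch64_operands_adjust_ok_for_ldpstp (msize == 4,
off_val_1 == 0x100, off_val_3 == 0x108): base_off starts as
(0x100 + 0x108) / 2 == 0x104, which already agrees with off_val_1 modulo
msize, so it is bumped by msize to 0x108, giving new offsets of -8 and 0,
both well inside the [-0x100, 0xfc] range of stp for 4-byte accesses.  */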
17056
17057 /* Check if base offset is too big or too small. We can attempt to resolve
17058 this issue by setting it to the maximum value and seeing if the offsets
17059 still fit. */
17060 if (base_off >= 0x1000)
17061 {
17062 base_off = 0x1000 - 1;
17063 /* We must still make sure that the base offset is aligned with respect
17064 to the address, but it may not be made any bigger. */
17065 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17066 }
17067
17068 /* Likewise for the case where the base is too small. */
17069 if (base_off <= -0x1000)
17070 {
17071 base_off = -0x1000 + 1;
17072 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17073 }
17074
17075 /* Offset of the first STP/LDP. */
17076 new_off_1 = off_val_1 - base_off;
17077
17078 /* Offset of the second STP/LDP. */
17079 new_off_3 = off_val_3 - base_off;
17080
17081 /* The offsets must be within the range of the LDP/STP instructions. */
17082 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
17083 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
17084 return false;
17085
17086 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
17087 new_off_1), true);
17088 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
17089 new_off_1 + msize), true);
17090 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
17091 new_off_3), true);
17092 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
17093 new_off_3 + msize), true);
17094
17095 if (!aarch64_mem_pair_operand (mem_1, mode)
17096 || !aarch64_mem_pair_operand (mem_3, mode))
17097 return false;
17098
17099 if (code == ZERO_EXTEND)
17100 {
17101 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
17102 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
17103 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
17104 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
17105 }
17106 else if (code == SIGN_EXTEND)
17107 {
17108 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
17109 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
17110 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
17111 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
17112 }
17113
17114 if (load)
17115 {
17116 operands[0] = temp_operands[0];
17117 operands[1] = mem_1;
17118 operands[2] = temp_operands[2];
17119 operands[3] = mem_2;
17120 operands[4] = temp_operands[4];
17121 operands[5] = mem_3;
17122 operands[6] = temp_operands[6];
17123 operands[7] = mem_4;
17124 }
17125 else
17126 {
17127 operands[0] = mem_1;
17128 operands[1] = temp_operands[1];
17129 operands[2] = mem_2;
17130 operands[3] = temp_operands[3];
17131 operands[4] = mem_3;
17132 operands[5] = temp_operands[5];
17133 operands[6] = mem_4;
17134 operands[7] = temp_operands[7];
17135 }
17136
17137 /* Emit adjusting instruction. */
17138 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
17139 /* Emit ldp/stp instructions. */
17140 t1 = gen_rtx_SET (operands[0], operands[1]);
17141 t2 = gen_rtx_SET (operands[2], operands[3]);
17142 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17143 t1 = gen_rtx_SET (operands[4], operands[5]);
17144 t2 = gen_rtx_SET (operands[6], operands[7]);
17145 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17146 return true;
17147 }
17148
17149 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
17150 it isn't worth branching around empty masked ops (including masked
17151 stores). */
17152
17153 static bool
17154 aarch64_empty_mask_is_expensive (unsigned)
17155 {
17156 return false;
17157 }
17158
17159 /* Return true if a pseudo register should be created and used to hold
17160 the GOT address for PIC code. */
17161
17162 bool
17163 aarch64_use_pseudo_pic_reg (void)
17164 {
17165 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
17166 }
17167
17168 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
17169
17170 static int
17171 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
17172 {
17173 switch (XINT (x, 1))
17174 {
17175 case UNSPEC_GOTSMALLPIC:
17176 case UNSPEC_GOTSMALLPIC28K:
17177 case UNSPEC_GOTTINYPIC:
17178 return 0;
17179 default:
17180 break;
17181 }
17182
17183 return default_unspec_may_trap_p (x, flags);
17184 }
17185
17186
17187 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
17188 return the log2 of that value. Otherwise return -1. */
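/* For instance (illustrative): 1.0 -> 0, 8.0 -> 3, while 0.5, 3.0 and -4.0
all yield -1.  */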
17189
17190 int
17191 aarch64_fpconst_pow_of_2 (rtx x)
17192 {
17193 const REAL_VALUE_TYPE *r;
17194
17195 if (!CONST_DOUBLE_P (x))
17196 return -1;
17197
17198 r = CONST_DOUBLE_REAL_VALUE (x);
17199
17200 if (REAL_VALUE_NEGATIVE (*r)
17201 || REAL_VALUE_ISNAN (*r)
17202 || REAL_VALUE_ISINF (*r)
17203 || !real_isinteger (r, DFmode))
17204 return -1;
17205
17206 return exact_log2 (real_to_integer (r));
17207 }
17208
17209 /* If X is a vector of equal CONST_DOUBLE values and that value is
17210 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
17211
17212 int
17213 aarch64_vec_fpconst_pow_of_2 (rtx x)
17214 {
17215 int nelts;
17216 if (GET_CODE (x) != CONST_VECTOR
17217 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
17218 return -1;
17219
17220 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
17221 return -1;
17222
17223 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
17224 if (firstval <= 0)
17225 return -1;
17226
17227 for (int i = 1; i < nelts; i++)
17228 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
17229 return -1;
17230
17231 return firstval;
17232 }
17233
17234 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
17235 to float.
17236
17237 __fp16 always promotes through this hook.
17238 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
17239 through the generic excess precision logic rather than here. */
17240
17241 static tree
17242 aarch64_promoted_type (const_tree t)
17243 {
17244 if (SCALAR_FLOAT_TYPE_P (t)
17245 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
17246 return float_type_node;
17247
17248 return NULL_TREE;
17249 }
17250
17251 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
17252
17253 static bool
17254 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
17255 optimization_type opt_type)
17256 {
17257 switch (op)
17258 {
17259 case rsqrt_optab:
17260 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
17261
17262 default:
17263 return true;
17264 }
17265 }
17266
17267 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
17268
17269 static unsigned int
17270 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
17271 int *offset)
17272 {
17273 /* Polynomial invariant 1 == (VG / 2) - 1. */
17274 gcc_assert (i == 1);
17275 *factor = 2;
17276 *offset = 1;
17277 return AARCH64_DWARF_VG;
17278 }
17279
17280 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
17281 if MODE is HFmode, and punt to the generic implementation otherwise. */
17282
17283 static bool
17284 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
17285 {
17286 return (mode == HFmode
17287 ? true
17288 : default_libgcc_floating_mode_supported_p (mode));
17289 }
17290
17291 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
17292 if MODE is HFmode, and punt to the generic implementation otherwise. */
17293
17294 static bool
17295 aarch64_scalar_mode_supported_p (scalar_mode mode)
17296 {
17297 return (mode == HFmode
17298 ? true
17299 : default_scalar_mode_supported_p (mode));
17300 }
17301
17302 /* Set the value of FLT_EVAL_METHOD.
17303 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
17304
17305 0: evaluate all operations and constants, whose semantic type has at
17306 most the range and precision of type float, to the range and
17307 precision of float; evaluate all other operations and constants to
17308 the range and precision of the semantic type;
17309
17310 N, where _FloatN is a supported interchange floating type:
17311 evaluate all operations and constants, whose semantic type has at
17312 most the range and precision of _FloatN type, to the range and
17313 precision of the _FloatN type; evaluate all other operations and
17314 constants to the range and precision of the semantic type;
17315
17316 If we have the ARMv8.2-A extensions then we support _Float16 in native
17317 precision, so we should set this to 16. Otherwise, we support the type,
17318 but want to evaluate expressions in float precision, so set this to
17319 0. */
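/* As an illustrative example, given
     _Float16 a, b, c, d;
     d = a * b + c;
with the ARMv8.2-A FP16 instructions available the arithmetic is performed
directly in _Float16 (FLT_EVAL_METHOD == 16), whereas without them the
operands are promoted and the computation is carried out in float.  */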
17320
17321 static enum flt_eval_method
17322 aarch64_excess_precision (enum excess_precision_type type)
17323 {
17324 switch (type)
17325 {
17326 case EXCESS_PRECISION_TYPE_FAST:
17327 case EXCESS_PRECISION_TYPE_STANDARD:
17328 /* We can calculate either in 16-bit range and precision or
17329 32-bit range and precision. Make that decision based on whether
17330 we have native support for the ARMv8.2-A 16-bit floating-point
17331 instructions or not. */
17332 return (TARGET_FP_F16INST
17333 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
17334 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
17335 case EXCESS_PRECISION_TYPE_IMPLICIT:
17336 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
17337 default:
17338 gcc_unreachable ();
17339 }
17340 return FLT_EVAL_METHOD_UNPREDICTABLE;
17341 }
17342
17343 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
17344 scheduled for speculative execution. Reject the long-running division
17345 and square-root instructions. */
17346
17347 static bool
17348 aarch64_sched_can_speculate_insn (rtx_insn *insn)
17349 {
17350 switch (get_attr_type (insn))
17351 {
17352 case TYPE_SDIV:
17353 case TYPE_UDIV:
17354 case TYPE_FDIVS:
17355 case TYPE_FDIVD:
17356 case TYPE_FSQRTS:
17357 case TYPE_FSQRTD:
17358 case TYPE_NEON_FP_SQRT_S:
17359 case TYPE_NEON_FP_SQRT_D:
17360 case TYPE_NEON_FP_SQRT_S_Q:
17361 case TYPE_NEON_FP_SQRT_D_Q:
17362 case TYPE_NEON_FP_DIV_S:
17363 case TYPE_NEON_FP_DIV_D:
17364 case TYPE_NEON_FP_DIV_S_Q:
17365 case TYPE_NEON_FP_DIV_D_Q:
17366 return false;
17367 default:
17368 return true;
17369 }
17370 }
17371
17372 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
17373
17374 static int
17375 aarch64_compute_pressure_classes (reg_class *classes)
17376 {
17377 int i = 0;
17378 classes[i++] = GENERAL_REGS;
17379 classes[i++] = FP_REGS;
17380 /* PR_REGS isn't a useful pressure class because many predicate pseudo
17381 registers need to go in PR_LO_REGS at some point during their
17382 lifetime. Splitting it into two halves has the effect of making
17383 all predicates count against PR_LO_REGS, so that we try whenever
17384 possible to restrict the number of live predicates to 8. This
17385 greatly reduces the amount of spilling in certain loops. */
17386 classes[i++] = PR_LO_REGS;
17387 classes[i++] = PR_HI_REGS;
17388 return i;
17389 }
17390
17391 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
17392
17393 static bool
17394 aarch64_can_change_mode_class (machine_mode from,
17395 machine_mode to, reg_class_t)
17396 {
17397 if (BYTES_BIG_ENDIAN)
17398 {
17399 bool from_sve_p = aarch64_sve_data_mode_p (from);
17400 bool to_sve_p = aarch64_sve_data_mode_p (to);
17401
17402 /* Don't allow changes between SVE data modes and non-SVE modes.
17403 See the comment at the head of aarch64-sve.md for details. */
17404 if (from_sve_p != to_sve_p)
17405 return false;
17406
17407 /* Don't allow changes in element size: lane 0 of the new vector
17408 would not then be lane 0 of the old vector. See the comment
17409 above aarch64_maybe_expand_sve_subreg_move for a more detailed
17410 description.
17411
17412 In the worst case, this forces a register to be spilled in
17413 one mode and reloaded in the other, which handles the
17414 endianness correctly. */
17415 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
17416 return false;
17417 }
17418 return true;
17419 }
17420
17421 /* Implement TARGET_EARLY_REMAT_MODES. */
17422
17423 static void
17424 aarch64_select_early_remat_modes (sbitmap modes)
17425 {
17426 /* SVE values are not normally live across a call, so it should be
17427 worth doing early rematerialization even in VL-specific mode. */
17428 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
17429 {
17430 machine_mode mode = (machine_mode) i;
17431 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17432 if (vec_flags & VEC_ANY_SVE)
17433 bitmap_set_bit (modes, i);
17434 }
17435 }
17436
17437 /* Override the default target speculation_safe_value. */
17438 static rtx
17439 aarch64_speculation_safe_value (machine_mode mode,
17440 rtx result, rtx val, rtx failval)
17441 {
17442 /* Maybe we should warn if falling back to hard barriers. They are
17443 likely to be noticeably more expensive than the alternative below. */
17444 if (!aarch64_track_speculation)
17445 return default_speculation_safe_value (mode, result, val, failval);
17446
17447 if (!REG_P (val))
17448 val = copy_to_mode_reg (mode, val);
17449
17450 if (!aarch64_reg_or_zero (failval, mode))
17451 failval = copy_to_mode_reg (mode, failval);
17452
17453 switch (mode)
17454 {
17455 case E_QImode:
17456 emit_insn (gen_despeculate_copyqi (result, val, failval));
17457 break;
17458 case E_HImode:
17459 emit_insn (gen_despeculate_copyhi (result, val, failval));
17460 break;
17461 case E_SImode:
17462 emit_insn (gen_despeculate_copysi (result, val, failval));
17463 break;
17464 case E_DImode:
17465 emit_insn (gen_despeculate_copydi (result, val, failval));
17466 break;
17467 case E_TImode:
17468 emit_insn (gen_despeculate_copyti (result, val, failval));
17469 break;
17470 default:
17471 gcc_unreachable ();
17472 }
17473 return result;
17474 }
17475
17476 /* Target-specific selftests. */
17477
17478 #if CHECKING_P
17479
17480 namespace selftest {
17481
17482 /* Selftest for the RTL loader.
17483 Verify that the RTL loader copes with a dump from
17484 print_rtx_function. This is essentially just a test that class
17485 function_reader can handle a real dump, but it also verifies
17486 that lookup_reg_by_dump_name correctly handles hard regs.
17487 The presence of hard reg names in the dump means that the test is
17488 target-specific, hence it is in this file. */
17489
17490 static void
17491 aarch64_test_loading_full_dump ()
17492 {
17493 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
17494
17495 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
17496
17497 rtx_insn *insn_1 = get_insn_by_uid (1);
17498 ASSERT_EQ (NOTE, GET_CODE (insn_1));
17499
17500 rtx_insn *insn_15 = get_insn_by_uid (15);
17501 ASSERT_EQ (INSN, GET_CODE (insn_15));
17502 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
17503
17504 /* Verify crtl->return_rtx. */
17505 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
17506 ASSERT_EQ (0, REGNO (crtl->return_rtx));
17507 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
17508 }
17509
17510 /* Run all target-specific selftests. */
17511
17512 static void
17513 aarch64_run_selftests (void)
17514 {
17515 aarch64_test_loading_full_dump ();
17516 }
17517
17518 } // namespace selftest
17519
17520 #endif /* #if CHECKING_P */
17521
17522 #undef TARGET_ADDRESS_COST
17523 #define TARGET_ADDRESS_COST aarch64_address_cost
17524
17525 /* This hook determines whether unnamed bitfields affect the alignment
17526 of the containing structure. The hook returns true if the structure
17527 should inherit the alignment requirements of an unnamed bitfield's
17528 type. */
17529 #undef TARGET_ALIGN_ANON_BITFIELD
17530 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
17531
17532 #undef TARGET_ASM_ALIGNED_DI_OP
17533 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
17534
17535 #undef TARGET_ASM_ALIGNED_HI_OP
17536 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
17537
17538 #undef TARGET_ASM_ALIGNED_SI_OP
17539 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
17540
17541 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
17542 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
17543 hook_bool_const_tree_hwi_hwi_const_tree_true
17544
17545 #undef TARGET_ASM_FILE_START
17546 #define TARGET_ASM_FILE_START aarch64_start_file
17547
17548 #undef TARGET_ASM_OUTPUT_MI_THUNK
17549 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
17550
17551 #undef TARGET_ASM_SELECT_RTX_SECTION
17552 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
17553
17554 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
17555 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
17556
17557 #undef TARGET_BUILD_BUILTIN_VA_LIST
17558 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
17559
17560 #undef TARGET_CALLEE_COPIES
17561 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
17562
17563 #undef TARGET_CAN_ELIMINATE
17564 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
17565
17566 #undef TARGET_CAN_INLINE_P
17567 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
17568
17569 #undef TARGET_CANNOT_FORCE_CONST_MEM
17570 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
17571
17572 #undef TARGET_CASE_VALUES_THRESHOLD
17573 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
17574
17575 #undef TARGET_CONDITIONAL_REGISTER_USAGE
17576 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
17577
17578 /* Only the least significant bit is used for initialization guard
17579 variables. */
17580 #undef TARGET_CXX_GUARD_MASK_BIT
17581 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
17582
17583 #undef TARGET_C_MODE_FOR_SUFFIX
17584 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
17585
17586 #ifdef TARGET_BIG_ENDIAN_DEFAULT
17587 #undef TARGET_DEFAULT_TARGET_FLAGS
17588 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
17589 #endif
17590
17591 #undef TARGET_CLASS_MAX_NREGS
17592 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
17593
17594 #undef TARGET_BUILTIN_DECL
17595 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
17596
17597 #undef TARGET_BUILTIN_RECIPROCAL
17598 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
17599
17600 #undef TARGET_C_EXCESS_PRECISION
17601 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
17602
17603 #undef TARGET_EXPAND_BUILTIN
17604 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
17605
17606 #undef TARGET_EXPAND_BUILTIN_VA_START
17607 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
17608
17609 #undef TARGET_FOLD_BUILTIN
17610 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
17611
17612 #undef TARGET_FUNCTION_ARG
17613 #define TARGET_FUNCTION_ARG aarch64_function_arg
17614
17615 #undef TARGET_FUNCTION_ARG_ADVANCE
17616 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
17617
17618 #undef TARGET_FUNCTION_ARG_BOUNDARY
17619 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
17620
17621 #undef TARGET_FUNCTION_ARG_PADDING
17622 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
17623
17624 #undef TARGET_GET_RAW_RESULT_MODE
17625 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
17626 #undef TARGET_GET_RAW_ARG_MODE
17627 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
17628
17629 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
17630 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
17631
17632 #undef TARGET_FUNCTION_VALUE
17633 #define TARGET_FUNCTION_VALUE aarch64_function_value
17634
17635 #undef TARGET_FUNCTION_VALUE_REGNO_P
17636 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
17637
17638 #undef TARGET_GIMPLE_FOLD_BUILTIN
17639 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
17640
17641 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
17642 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
17643
17644 #undef TARGET_INIT_BUILTINS
17645 #define TARGET_INIT_BUILTINS aarch64_init_builtins
17646
17647 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
17648 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
17649 aarch64_ira_change_pseudo_allocno_class
17650
17651 #undef TARGET_LEGITIMATE_ADDRESS_P
17652 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
17653
17654 #undef TARGET_LEGITIMATE_CONSTANT_P
17655 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
17656
17657 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
17658 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
17659 aarch64_legitimize_address_displacement
17660
17661 #undef TARGET_LIBGCC_CMP_RETURN_MODE
17662 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
17663
17664 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
17665 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
17666 aarch64_libgcc_floating_mode_supported_p
17667
17668 #undef TARGET_MANGLE_TYPE
17669 #define TARGET_MANGLE_TYPE aarch64_mangle_type
17670
17671 #undef TARGET_MEMORY_MOVE_COST
17672 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
17673
17674 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
17675 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
17676
17677 #undef TARGET_MUST_PASS_IN_STACK
17678 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
17679
17680 /* This target hook should return true if accesses to volatile bitfields
17681 should use the narrowest mode possible. It should return false if these
17682 accesses should use the bitfield container type. */
17683 #undef TARGET_NARROW_VOLATILE_BITFIELD
17684 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
17685
17686 #undef TARGET_OPTION_OVERRIDE
17687 #define TARGET_OPTION_OVERRIDE aarch64_override_options
17688
17689 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
17690 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
17691 aarch64_override_options_after_change
17692
17693 #undef TARGET_OPTION_SAVE
17694 #define TARGET_OPTION_SAVE aarch64_option_save
17695
17696 #undef TARGET_OPTION_RESTORE
17697 #define TARGET_OPTION_RESTORE aarch64_option_restore
17698
17699 #undef TARGET_OPTION_PRINT
17700 #define TARGET_OPTION_PRINT aarch64_option_print
17701
17702 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
17703 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
17704
17705 #undef TARGET_SET_CURRENT_FUNCTION
17706 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
17707
17708 #undef TARGET_PASS_BY_REFERENCE
17709 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
17710
17711 #undef TARGET_PREFERRED_RELOAD_CLASS
17712 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
17713
17714 #undef TARGET_SCHED_REASSOCIATION_WIDTH
17715 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
17716
17717 #undef TARGET_PROMOTED_TYPE
17718 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
17719
17720 #undef TARGET_SECONDARY_RELOAD
17721 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
17722
17723 #undef TARGET_SHIFT_TRUNCATION_MASK
17724 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
17725
17726 #undef TARGET_SETUP_INCOMING_VARARGS
17727 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
17728
17729 #undef TARGET_STRUCT_VALUE_RTX
17730 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
17731
17732 #undef TARGET_REGISTER_MOVE_COST
17733 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
17734
17735 #undef TARGET_RETURN_IN_MEMORY
17736 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
17737
17738 #undef TARGET_RETURN_IN_MSB
17739 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
17740
17741 #undef TARGET_RTX_COSTS
17742 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
17743
17744 #undef TARGET_SCALAR_MODE_SUPPORTED_P
17745 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
17746
17747 #undef TARGET_SCHED_ISSUE_RATE
17748 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
17749
17750 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
17751 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
17752 aarch64_sched_first_cycle_multipass_dfa_lookahead
17753
17754 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
17755 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
17756 aarch64_first_cycle_multipass_dfa_lookahead_guard
17757
17758 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
17759 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
17760 aarch64_get_separate_components
17761
17762 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
17763 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
17764 aarch64_components_for_bb
17765
17766 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
17767 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
17768 aarch64_disqualify_components
17769
17770 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
17771 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
17772 aarch64_emit_prologue_components
17773
17774 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
17775 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
17776 aarch64_emit_epilogue_components
17777
17778 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
17779 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
17780 aarch64_set_handled_components
17781
17782 #undef TARGET_TRAMPOLINE_INIT
17783 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
17784
17785 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
17786 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
17787
17788 #undef TARGET_VECTOR_MODE_SUPPORTED_P
17789 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
17790
17791 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
17792 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
17793 aarch64_builtin_support_vector_misalignment
17794
17795 #undef TARGET_ARRAY_MODE
17796 #define TARGET_ARRAY_MODE aarch64_array_mode
17797
17798 #undef TARGET_ARRAY_MODE_SUPPORTED_P
17799 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
17800
17801 #undef TARGET_VECTORIZE_ADD_STMT_COST
17802 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
17803
17804 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
17805 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
17806 aarch64_builtin_vectorization_cost
17807
17808 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
17809 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
17810
17811 #undef TARGET_VECTORIZE_BUILTINS
17812 #define TARGET_VECTORIZE_BUILTINS
17813
17814 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
17815 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
17816 aarch64_builtin_vectorized_function
17817
17818 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
17819 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
17820 aarch64_autovectorize_vector_sizes
17821
17822 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
17823 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
17824 aarch64_atomic_assign_expand_fenv
17825
17826 /* Section anchor support. */
17827
17828 #undef TARGET_MIN_ANCHOR_OFFSET
17829 #define TARGET_MIN_ANCHOR_OFFSET -256
17830
17831 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
17832 byte offset; we can do much more for larger data types, but have no way
17833 to determine the size of the access. We assume accesses are aligned. */
17834 #undef TARGET_MAX_ANCHOR_OFFSET
17835 #define TARGET_MAX_ANCHOR_OFFSET 4095
17836
17837 #undef TARGET_VECTOR_ALIGNMENT
17838 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
17839
17840 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
17841 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
17842 aarch64_vectorize_preferred_vector_alignment
17843 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
17844 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
17845 aarch64_simd_vector_alignment_reachable
17846
17847 /* vec_perm support. */
17848
17849 #undef TARGET_VECTORIZE_VEC_PERM_CONST
17850 #define TARGET_VECTORIZE_VEC_PERM_CONST \
17851 aarch64_vectorize_vec_perm_const
17852
17853 #undef TARGET_VECTORIZE_GET_MASK_MODE
17854 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
17855 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
17856 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
17857 aarch64_empty_mask_is_expensive
17858 #undef TARGET_PREFERRED_ELSE_VALUE
17859 #define TARGET_PREFERRED_ELSE_VALUE \
17860 aarch64_preferred_else_value
17861
17862 #undef TARGET_INIT_LIBFUNCS
17863 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
17864
17865 #undef TARGET_FIXED_CONDITION_CODE_REGS
17866 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
17867
17868 #undef TARGET_FLAGS_REGNUM
17869 #define TARGET_FLAGS_REGNUM CC_REGNUM
17870
17871 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
17872 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
17873
17874 #undef TARGET_ASAN_SHADOW_OFFSET
17875 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
17876
17877 #undef TARGET_LEGITIMIZE_ADDRESS
17878 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
17879
17880 #undef TARGET_SCHED_CAN_SPECULATE_INSN
17881 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
17882
17883 #undef TARGET_CAN_USE_DOLOOP_P
17884 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
17885
17886 #undef TARGET_SCHED_ADJUST_PRIORITY
17887 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
17888
17889 #undef TARGET_SCHED_MACRO_FUSION_P
17890 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
17891
17892 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
17893 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
17894
17895 #undef TARGET_SCHED_FUSION_PRIORITY
17896 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
17897
17898 #undef TARGET_UNSPEC_MAY_TRAP_P
17899 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
17900
17901 #undef TARGET_USE_PSEUDO_PIC_REG
17902 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
17903
17904 #undef TARGET_PRINT_OPERAND
17905 #define TARGET_PRINT_OPERAND aarch64_print_operand
17906
17907 #undef TARGET_PRINT_OPERAND_ADDRESS
17908 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
17909
17910 #undef TARGET_OPTAB_SUPPORTED_P
17911 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
17912
17913 #undef TARGET_OMIT_STRUCT_RETURN_REG
17914 #define TARGET_OMIT_STRUCT_RETURN_REG true
17915
17916 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
17917 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
17918 aarch64_dwarf_poly_indeterminate_value
17919
17920 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
17921 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
17922 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
17923
17924 #undef TARGET_HARD_REGNO_NREGS
17925 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
17926 #undef TARGET_HARD_REGNO_MODE_OK
17927 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
17928
17929 #undef TARGET_MODES_TIEABLE_P
17930 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
17931
17932 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
17933 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
17934 aarch64_hard_regno_call_part_clobbered
17935
17936 #undef TARGET_CONSTANT_ALIGNMENT
17937 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
17938
17939 #undef TARGET_COMPUTE_PRESSURE_CLASSES
17940 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
17941
17942 #undef TARGET_CAN_CHANGE_MODE_CLASS
17943 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
17944
17945 #undef TARGET_SELECT_EARLY_REMAT_MODES
17946 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
17947
17948 #undef TARGET_SPECULATION_SAFE_VALUE
17949 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
17950
17951 #if CHECKING_P
17952 #undef TARGET_RUN_TARGET_SELFTESTS
17953 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
17954 #endif /* #if CHECKING_P */
17955
17956 struct gcc_target targetm = TARGET_INITIALIZER;
17957
17958 #include "gt-aarch64.h"