1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
24 #define INCLUDE_STRING
26 #include "coretypes.h"
37 #include "stringpool.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
56 #include "langhooks.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
77 /* This file should be included last. */
78 #include "target-def.h"
80 /* Defined for convenience. */
81 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
83 /* Information about a legitimate vector immediate operand. */
84 struct simd_immediate_info
86 enum insn_type
{ MOV
, MVN
, INDEX
, PTRUE
};
87 enum modifier_type
{ LSL
, MSL
};
89 simd_immediate_info () {}
90 simd_immediate_info (scalar_float_mode
, rtx
);
91 simd_immediate_info (scalar_int_mode
, unsigned HOST_WIDE_INT
,
92 insn_type
= MOV
, modifier_type
= LSL
,
94 simd_immediate_info (scalar_mode
, rtx
, rtx
);
95 simd_immediate_info (scalar_int_mode
, aarch64_svpattern
);
97 /* The mode of the elements. */
100 /* The instruction to use to move the immediate into a vector. */
105 /* For MOV and MVN. */
108 /* The value of each element. */
111 /* The kind of shift modifier to use, and the number of bits to shift.
112 This is (LSL, 0) if no shift is needed. */
113 modifier_type modifier
;
120 /* The value of the first element and the step to be added for each
121 subsequent element. */
126 aarch64_svpattern pattern
;
130 /* Construct a floating-point immediate in which each element has mode
131 ELT_MODE_IN and value VALUE_IN. */
132 inline simd_immediate_info
133 ::simd_immediate_info (scalar_float_mode elt_mode_in
, rtx value_in
)
134 : elt_mode (elt_mode_in
), insn (MOV
)
136 u
.mov
.value
= value_in
;
137 u
.mov
.modifier
= LSL
;
141 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
142 and value VALUE_IN. The other parameters are as for the structure
144 inline simd_immediate_info
145 ::simd_immediate_info (scalar_int_mode elt_mode_in
,
146 unsigned HOST_WIDE_INT value_in
,
147 insn_type insn_in
, modifier_type modifier_in
,
148 unsigned int shift_in
)
149 : elt_mode (elt_mode_in
), insn (insn_in
)
151 u
.mov
.value
= gen_int_mode (value_in
, elt_mode_in
);
152 u
.mov
.modifier
= modifier_in
;
153 u
.mov
.shift
= shift_in
;
156 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
157 and where element I is equal to BASE_IN + I * STEP_IN. */
158 inline simd_immediate_info
159 ::simd_immediate_info (scalar_mode elt_mode_in
, rtx base_in
, rtx step_in
)
160 : elt_mode (elt_mode_in
), insn (INDEX
)
162 u
.index
.base
= base_in
;
163 u
.index
.step
= step_in
;
166 /* Construct a predicate that controls elements of mode ELT_MODE_IN
167 and has PTRUE pattern PATTERN_IN. */
168 inline simd_immediate_info
169 ::simd_immediate_info (scalar_int_mode elt_mode_in
,
170 aarch64_svpattern pattern_in
)
171 : elt_mode (elt_mode_in
), insn (PTRUE
)
173 u
.pattern
= pattern_in
;
176 /* The current code model. */
177 enum aarch64_code_model aarch64_cmodel
;
179 /* The number of 64-bit elements in an SVE vector. */
180 poly_uint16 aarch64_sve_vg
;
183 #undef TARGET_HAVE_TLS
184 #define TARGET_HAVE_TLS 1
187 static bool aarch64_composite_type_p (const_tree
, machine_mode
);
188 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode
,
190 machine_mode
*, int *,
192 static void aarch64_elf_asm_constructor (rtx
, int) ATTRIBUTE_UNUSED
;
193 static void aarch64_elf_asm_destructor (rtx
, int) ATTRIBUTE_UNUSED
;
194 static void aarch64_override_options_after_change (void);
195 static bool aarch64_vector_mode_supported_p (machine_mode
);
196 static int aarch64_address_cost (rtx
, machine_mode
, addr_space_t
, bool);
197 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode
,
201 static machine_mode
aarch64_simd_container_mode (scalar_mode
, poly_int64
);
202 static bool aarch64_print_address_internal (FILE*, machine_mode
, rtx
,
203 aarch64_addr_query_type
);
204 static HOST_WIDE_INT
aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val
);
206 /* Major revision number of the ARM Architecture implemented by the target. */
207 unsigned aarch64_architecture_version
;
209 /* The processor for which instructions should be scheduled. */
210 enum aarch64_processor aarch64_tune
= cortexa53
;
212 /* Mask to specify which instruction scheduling options should be used. */
213 uint64_t aarch64_tune_flags
= 0;
215 /* Global flag for PC relative loads. */
216 bool aarch64_pcrelative_literal_loads
;
218 /* Global flag for whether frame pointer is enabled. */
219 bool aarch64_use_frame_pointer
;
221 #define BRANCH_PROTECT_STR_MAX 255
222 char *accepted_branch_protection_string
= NULL
;
224 static enum aarch64_parse_opt_result
225 aarch64_parse_branch_protection (const char*, char**);
227 /* Support for command line parsing of boolean flags in the tuning
229 struct aarch64_flag_desc
235 #define AARCH64_FUSION_PAIR(name, internal_name) \
236 { name, AARCH64_FUSE_##internal_name },
237 static const struct aarch64_flag_desc aarch64_fusible_pairs
[] =
239 { "none", AARCH64_FUSE_NOTHING
},
240 #include "aarch64-fusion-pairs.def"
241 { "all", AARCH64_FUSE_ALL
},
242 { NULL
, AARCH64_FUSE_NOTHING
}
245 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
246 { name, AARCH64_EXTRA_TUNE_##internal_name },
247 static const struct aarch64_flag_desc aarch64_tuning_flags
[] =
249 { "none", AARCH64_EXTRA_TUNE_NONE
},
250 #include "aarch64-tuning-flags.def"
251 { "all", AARCH64_EXTRA_TUNE_ALL
},
252 { NULL
, AARCH64_EXTRA_TUNE_NONE
}
255 /* Tuning parameters. */
257 static const struct cpu_addrcost_table generic_addrcost_table
=
267 0, /* register_offset */
268 0, /* register_sextend */
269 0, /* register_zextend */
273 static const struct cpu_addrcost_table exynosm1_addrcost_table
=
283 1, /* register_offset */
284 1, /* register_sextend */
285 2, /* register_zextend */
289 static const struct cpu_addrcost_table xgene1_addrcost_table
=
299 0, /* register_offset */
300 1, /* register_sextend */
301 1, /* register_zextend */
305 static const struct cpu_addrcost_table thunderx2t99_addrcost_table
=
315 2, /* register_offset */
316 3, /* register_sextend */
317 3, /* register_zextend */
321 static const struct cpu_addrcost_table tsv110_addrcost_table
=
331 0, /* register_offset */
332 1, /* register_sextend */
333 1, /* register_zextend */
337 static const struct cpu_addrcost_table qdf24xx_addrcost_table
=
347 3, /* register_offset */
348 3, /* register_sextend */
349 3, /* register_zextend */
353 static const struct cpu_regmove_cost generic_regmove_cost
=
356 /* Avoid the use of slow int<->fp moves for spilling by setting
357 their cost higher than memmov_cost. */
363 static const struct cpu_regmove_cost cortexa57_regmove_cost
=
366 /* Avoid the use of slow int<->fp moves for spilling by setting
367 their cost higher than memmov_cost. */
373 static const struct cpu_regmove_cost cortexa53_regmove_cost
=
376 /* Avoid the use of slow int<->fp moves for spilling by setting
377 their cost higher than memmov_cost. */
383 static const struct cpu_regmove_cost exynosm1_regmove_cost
=
386 /* Avoid the use of slow int<->fp moves for spilling by setting
387 their cost higher than memmov_cost (actual, 4 and 9). */
393 static const struct cpu_regmove_cost thunderx_regmove_cost
=
401 static const struct cpu_regmove_cost xgene1_regmove_cost
=
404 /* Avoid the use of slow int<->fp moves for spilling by setting
405 their cost higher than memmov_cost. */
411 static const struct cpu_regmove_cost qdf24xx_regmove_cost
=
414 /* Avoid the use of int<->fp moves for spilling. */
420 static const struct cpu_regmove_cost thunderx2t99_regmove_cost
=
423 /* Avoid the use of int<->fp moves for spilling. */
429 static const struct cpu_regmove_cost tsv110_regmove_cost
=
432 /* Avoid the use of slow int<->fp moves for spilling by setting
433 their cost higher than memmov_cost. */
439 /* Generic costs for vector insn classes. */
440 static const struct cpu_vector_cost generic_vector_cost
=
442 1, /* scalar_int_stmt_cost */
443 1, /* scalar_fp_stmt_cost */
444 1, /* scalar_load_cost */
445 1, /* scalar_store_cost */
446 1, /* vec_int_stmt_cost */
447 1, /* vec_fp_stmt_cost */
448 2, /* vec_permute_cost */
449 1, /* vec_to_scalar_cost */
450 1, /* scalar_to_vec_cost */
451 1, /* vec_align_load_cost */
452 1, /* vec_unalign_load_cost */
453 1, /* vec_unalign_store_cost */
454 1, /* vec_store_cost */
455 3, /* cond_taken_branch_cost */
456 1 /* cond_not_taken_branch_cost */
459 /* QDF24XX costs for vector insn classes. */
460 static const struct cpu_vector_cost qdf24xx_vector_cost
=
462 1, /* scalar_int_stmt_cost */
463 1, /* scalar_fp_stmt_cost */
464 1, /* scalar_load_cost */
465 1, /* scalar_store_cost */
466 1, /* vec_int_stmt_cost */
467 3, /* vec_fp_stmt_cost */
468 2, /* vec_permute_cost */
469 1, /* vec_to_scalar_cost */
470 1, /* scalar_to_vec_cost */
471 1, /* vec_align_load_cost */
472 1, /* vec_unalign_load_cost */
473 1, /* vec_unalign_store_cost */
474 1, /* vec_store_cost */
475 3, /* cond_taken_branch_cost */
476 1 /* cond_not_taken_branch_cost */
479 /* ThunderX costs for vector insn classes. */
480 static const struct cpu_vector_cost thunderx_vector_cost
=
482 1, /* scalar_int_stmt_cost */
483 1, /* scalar_fp_stmt_cost */
484 3, /* scalar_load_cost */
485 1, /* scalar_store_cost */
486 4, /* vec_int_stmt_cost */
487 1, /* vec_fp_stmt_cost */
488 4, /* vec_permute_cost */
489 2, /* vec_to_scalar_cost */
490 2, /* scalar_to_vec_cost */
491 3, /* vec_align_load_cost */
492 5, /* vec_unalign_load_cost */
493 5, /* vec_unalign_store_cost */
494 1, /* vec_store_cost */
495 3, /* cond_taken_branch_cost */
496 3 /* cond_not_taken_branch_cost */
499 static const struct cpu_vector_cost tsv110_vector_cost
=
501 1, /* scalar_int_stmt_cost */
502 1, /* scalar_fp_stmt_cost */
503 5, /* scalar_load_cost */
504 1, /* scalar_store_cost */
505 2, /* vec_int_stmt_cost */
506 2, /* vec_fp_stmt_cost */
507 2, /* vec_permute_cost */
508 3, /* vec_to_scalar_cost */
509 2, /* scalar_to_vec_cost */
510 5, /* vec_align_load_cost */
511 5, /* vec_unalign_load_cost */
512 1, /* vec_unalign_store_cost */
513 1, /* vec_store_cost */
514 1, /* cond_taken_branch_cost */
515 1 /* cond_not_taken_branch_cost */
518 /* Generic costs for vector insn classes. */
519 static const struct cpu_vector_cost cortexa57_vector_cost
=
521 1, /* scalar_int_stmt_cost */
522 1, /* scalar_fp_stmt_cost */
523 4, /* scalar_load_cost */
524 1, /* scalar_store_cost */
525 2, /* vec_int_stmt_cost */
526 2, /* vec_fp_stmt_cost */
527 3, /* vec_permute_cost */
528 8, /* vec_to_scalar_cost */
529 8, /* scalar_to_vec_cost */
530 4, /* vec_align_load_cost */
531 4, /* vec_unalign_load_cost */
532 1, /* vec_unalign_store_cost */
533 1, /* vec_store_cost */
534 1, /* cond_taken_branch_cost */
535 1 /* cond_not_taken_branch_cost */
538 static const struct cpu_vector_cost exynosm1_vector_cost
=
540 1, /* scalar_int_stmt_cost */
541 1, /* scalar_fp_stmt_cost */
542 5, /* scalar_load_cost */
543 1, /* scalar_store_cost */
544 3, /* vec_int_stmt_cost */
545 3, /* vec_fp_stmt_cost */
546 3, /* vec_permute_cost */
547 3, /* vec_to_scalar_cost */
548 3, /* scalar_to_vec_cost */
549 5, /* vec_align_load_cost */
550 5, /* vec_unalign_load_cost */
551 1, /* vec_unalign_store_cost */
552 1, /* vec_store_cost */
553 1, /* cond_taken_branch_cost */
554 1 /* cond_not_taken_branch_cost */
557 /* Generic costs for vector insn classes. */
558 static const struct cpu_vector_cost xgene1_vector_cost
=
560 1, /* scalar_int_stmt_cost */
561 1, /* scalar_fp_stmt_cost */
562 5, /* scalar_load_cost */
563 1, /* scalar_store_cost */
564 2, /* vec_int_stmt_cost */
565 2, /* vec_fp_stmt_cost */
566 2, /* vec_permute_cost */
567 4, /* vec_to_scalar_cost */
568 4, /* scalar_to_vec_cost */
569 10, /* vec_align_load_cost */
570 10, /* vec_unalign_load_cost */
571 2, /* vec_unalign_store_cost */
572 2, /* vec_store_cost */
573 2, /* cond_taken_branch_cost */
574 1 /* cond_not_taken_branch_cost */
577 /* Costs for vector insn classes for Vulcan. */
578 static const struct cpu_vector_cost thunderx2t99_vector_cost
=
580 1, /* scalar_int_stmt_cost */
581 6, /* scalar_fp_stmt_cost */
582 4, /* scalar_load_cost */
583 1, /* scalar_store_cost */
584 5, /* vec_int_stmt_cost */
585 6, /* vec_fp_stmt_cost */
586 3, /* vec_permute_cost */
587 6, /* vec_to_scalar_cost */
588 5, /* scalar_to_vec_cost */
589 8, /* vec_align_load_cost */
590 8, /* vec_unalign_load_cost */
591 4, /* vec_unalign_store_cost */
592 4, /* vec_store_cost */
593 2, /* cond_taken_branch_cost */
594 1 /* cond_not_taken_branch_cost */
597 /* Generic costs for branch instructions. */
598 static const struct cpu_branch_cost generic_branch_cost
=
600 1, /* Predictable. */
601 3 /* Unpredictable. */
604 /* Generic approximation modes. */
605 static const cpu_approx_modes generic_approx_modes
=
607 AARCH64_APPROX_NONE
, /* division */
608 AARCH64_APPROX_NONE
, /* sqrt */
609 AARCH64_APPROX_NONE
/* recip_sqrt */
612 /* Approximation modes for Exynos M1. */
613 static const cpu_approx_modes exynosm1_approx_modes
=
615 AARCH64_APPROX_NONE
, /* division */
616 AARCH64_APPROX_ALL
, /* sqrt */
617 AARCH64_APPROX_ALL
/* recip_sqrt */
620 /* Approximation modes for X-Gene 1. */
621 static const cpu_approx_modes xgene1_approx_modes
=
623 AARCH64_APPROX_NONE
, /* division */
624 AARCH64_APPROX_NONE
, /* sqrt */
625 AARCH64_APPROX_ALL
/* recip_sqrt */
628 /* Generic prefetch settings (which disable prefetch). */
629 static const cpu_prefetch_tune generic_prefetch_tune
=
632 -1, /* l1_cache_size */
633 -1, /* l1_cache_line_size */
634 -1, /* l2_cache_size */
635 true, /* prefetch_dynamic_strides */
636 -1, /* minimum_stride */
637 -1 /* default_opt_level */
640 static const cpu_prefetch_tune exynosm1_prefetch_tune
=
643 -1, /* l1_cache_size */
644 64, /* l1_cache_line_size */
645 -1, /* l2_cache_size */
646 true, /* prefetch_dynamic_strides */
647 -1, /* minimum_stride */
648 -1 /* default_opt_level */
651 static const cpu_prefetch_tune qdf24xx_prefetch_tune
=
654 32, /* l1_cache_size */
655 64, /* l1_cache_line_size */
656 512, /* l2_cache_size */
657 false, /* prefetch_dynamic_strides */
658 2048, /* minimum_stride */
659 3 /* default_opt_level */
662 static const cpu_prefetch_tune thunderxt88_prefetch_tune
=
665 32, /* l1_cache_size */
666 128, /* l1_cache_line_size */
667 16*1024, /* l2_cache_size */
668 true, /* prefetch_dynamic_strides */
669 -1, /* minimum_stride */
670 3 /* default_opt_level */
673 static const cpu_prefetch_tune thunderx_prefetch_tune
=
676 32, /* l1_cache_size */
677 128, /* l1_cache_line_size */
678 -1, /* l2_cache_size */
679 true, /* prefetch_dynamic_strides */
680 -1, /* minimum_stride */
681 -1 /* default_opt_level */
684 static const cpu_prefetch_tune thunderx2t99_prefetch_tune
=
687 32, /* l1_cache_size */
688 64, /* l1_cache_line_size */
689 256, /* l2_cache_size */
690 true, /* prefetch_dynamic_strides */
691 -1, /* minimum_stride */
692 -1 /* default_opt_level */
695 static const cpu_prefetch_tune tsv110_prefetch_tune
=
698 64, /* l1_cache_size */
699 64, /* l1_cache_line_size */
700 512, /* l2_cache_size */
701 true, /* prefetch_dynamic_strides */
702 -1, /* minimum_stride */
703 -1 /* default_opt_level */
706 static const cpu_prefetch_tune xgene1_prefetch_tune
=
709 32, /* l1_cache_size */
710 64, /* l1_cache_line_size */
711 256, /* l2_cache_size */
712 true, /* prefetch_dynamic_strides */
713 -1, /* minimum_stride */
714 -1 /* default_opt_level */
717 static const struct tune_params generic_tunings
=
719 &cortexa57_extra_costs
,
720 &generic_addrcost_table
,
721 &generic_regmove_cost
,
722 &generic_vector_cost
,
723 &generic_branch_cost
,
724 &generic_approx_modes
,
725 SVE_NOT_IMPLEMENTED
, /* sve_width */
728 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
729 "16:12", /* function_align. */
730 "4", /* jump_align. */
731 "8", /* loop_align. */
732 2, /* int_reassoc_width. */
733 4, /* fp_reassoc_width. */
734 1, /* vec_reassoc_width. */
735 2, /* min_div_recip_mul_sf. */
736 2, /* min_div_recip_mul_df. */
737 0, /* max_case_values. */
738 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
739 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
740 &generic_prefetch_tune
743 static const struct tune_params cortexa35_tunings
=
745 &cortexa53_extra_costs
,
746 &generic_addrcost_table
,
747 &cortexa53_regmove_cost
,
748 &generic_vector_cost
,
749 &generic_branch_cost
,
750 &generic_approx_modes
,
751 SVE_NOT_IMPLEMENTED
, /* sve_width */
754 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
755 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
756 "16", /* function_align. */
757 "4", /* jump_align. */
758 "8", /* loop_align. */
759 2, /* int_reassoc_width. */
760 4, /* fp_reassoc_width. */
761 1, /* vec_reassoc_width. */
762 2, /* min_div_recip_mul_sf. */
763 2, /* min_div_recip_mul_df. */
764 0, /* max_case_values. */
765 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
766 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
767 &generic_prefetch_tune
770 static const struct tune_params cortexa53_tunings
=
772 &cortexa53_extra_costs
,
773 &generic_addrcost_table
,
774 &cortexa53_regmove_cost
,
775 &generic_vector_cost
,
776 &generic_branch_cost
,
777 &generic_approx_modes
,
778 SVE_NOT_IMPLEMENTED
, /* sve_width */
781 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
782 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
783 "16", /* function_align. */
784 "4", /* jump_align. */
785 "8", /* loop_align. */
786 2, /* int_reassoc_width. */
787 4, /* fp_reassoc_width. */
788 1, /* vec_reassoc_width. */
789 2, /* min_div_recip_mul_sf. */
790 2, /* min_div_recip_mul_df. */
791 0, /* max_case_values. */
792 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
793 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
794 &generic_prefetch_tune
797 static const struct tune_params cortexa57_tunings
=
799 &cortexa57_extra_costs
,
800 &generic_addrcost_table
,
801 &cortexa57_regmove_cost
,
802 &cortexa57_vector_cost
,
803 &generic_branch_cost
,
804 &generic_approx_modes
,
805 SVE_NOT_IMPLEMENTED
, /* sve_width */
808 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
809 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
810 "16", /* function_align. */
811 "4", /* jump_align. */
812 "8", /* loop_align. */
813 2, /* int_reassoc_width. */
814 4, /* fp_reassoc_width. */
815 1, /* vec_reassoc_width. */
816 2, /* min_div_recip_mul_sf. */
817 2, /* min_div_recip_mul_df. */
818 0, /* max_case_values. */
819 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
820 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
), /* tune_flags. */
821 &generic_prefetch_tune
824 static const struct tune_params cortexa72_tunings
=
826 &cortexa57_extra_costs
,
827 &generic_addrcost_table
,
828 &cortexa57_regmove_cost
,
829 &cortexa57_vector_cost
,
830 &generic_branch_cost
,
831 &generic_approx_modes
,
832 SVE_NOT_IMPLEMENTED
, /* sve_width */
835 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
836 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
837 "16", /* function_align. */
838 "4", /* jump_align. */
839 "8", /* loop_align. */
840 2, /* int_reassoc_width. */
841 4, /* fp_reassoc_width. */
842 1, /* vec_reassoc_width. */
843 2, /* min_div_recip_mul_sf. */
844 2, /* min_div_recip_mul_df. */
845 0, /* max_case_values. */
846 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
847 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
848 &generic_prefetch_tune
851 static const struct tune_params cortexa73_tunings
=
853 &cortexa57_extra_costs
,
854 &generic_addrcost_table
,
855 &cortexa57_regmove_cost
,
856 &cortexa57_vector_cost
,
857 &generic_branch_cost
,
858 &generic_approx_modes
,
859 SVE_NOT_IMPLEMENTED
, /* sve_width */
860 4, /* memmov_cost. */
862 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
863 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
864 "16", /* function_align. */
865 "4", /* jump_align. */
866 "8", /* loop_align. */
867 2, /* int_reassoc_width. */
868 4, /* fp_reassoc_width. */
869 1, /* vec_reassoc_width. */
870 2, /* min_div_recip_mul_sf. */
871 2, /* min_div_recip_mul_df. */
872 0, /* max_case_values. */
873 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
874 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
875 &generic_prefetch_tune
880 static const struct tune_params exynosm1_tunings
=
882 &exynosm1_extra_costs
,
883 &exynosm1_addrcost_table
,
884 &exynosm1_regmove_cost
,
885 &exynosm1_vector_cost
,
886 &generic_branch_cost
,
887 &exynosm1_approx_modes
,
888 SVE_NOT_IMPLEMENTED
, /* sve_width */
891 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
892 "4", /* function_align. */
893 "4", /* jump_align. */
894 "4", /* loop_align. */
895 2, /* int_reassoc_width. */
896 4, /* fp_reassoc_width. */
897 1, /* vec_reassoc_width. */
898 2, /* min_div_recip_mul_sf. */
899 2, /* min_div_recip_mul_df. */
900 48, /* max_case_values. */
901 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
902 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
903 &exynosm1_prefetch_tune
906 static const struct tune_params thunderxt88_tunings
=
908 &thunderx_extra_costs
,
909 &generic_addrcost_table
,
910 &thunderx_regmove_cost
,
911 &thunderx_vector_cost
,
912 &generic_branch_cost
,
913 &generic_approx_modes
,
914 SVE_NOT_IMPLEMENTED
, /* sve_width */
917 AARCH64_FUSE_CMP_BRANCH
, /* fusible_ops */
918 "8", /* function_align. */
919 "8", /* jump_align. */
920 "8", /* loop_align. */
921 2, /* int_reassoc_width. */
922 4, /* fp_reassoc_width. */
923 1, /* vec_reassoc_width. */
924 2, /* min_div_recip_mul_sf. */
925 2, /* min_div_recip_mul_df. */
926 0, /* max_case_values. */
927 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
928 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
), /* tune_flags. */
929 &thunderxt88_prefetch_tune
932 static const struct tune_params thunderx_tunings
=
934 &thunderx_extra_costs
,
935 &generic_addrcost_table
,
936 &thunderx_regmove_cost
,
937 &thunderx_vector_cost
,
938 &generic_branch_cost
,
939 &generic_approx_modes
,
940 SVE_NOT_IMPLEMENTED
, /* sve_width */
943 AARCH64_FUSE_CMP_BRANCH
, /* fusible_ops */
944 "8", /* function_align. */
945 "8", /* jump_align. */
946 "8", /* loop_align. */
947 2, /* int_reassoc_width. */
948 4, /* fp_reassoc_width. */
949 1, /* vec_reassoc_width. */
950 2, /* min_div_recip_mul_sf. */
951 2, /* min_div_recip_mul_df. */
952 0, /* max_case_values. */
953 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
954 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
955 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
), /* tune_flags. */
956 &thunderx_prefetch_tune
959 static const struct tune_params tsv110_tunings
=
962 &tsv110_addrcost_table
,
963 &tsv110_regmove_cost
,
965 &generic_branch_cost
,
966 &generic_approx_modes
,
967 SVE_NOT_IMPLEMENTED
, /* sve_width */
970 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_CMP_BRANCH
971 | AARCH64_FUSE_ALU_BRANCH
), /* fusible_ops */
972 "16", /* function_align. */
973 "4", /* jump_align. */
974 "8", /* loop_align. */
975 2, /* int_reassoc_width. */
976 4, /* fp_reassoc_width. */
977 1, /* vec_reassoc_width. */
978 2, /* min_div_recip_mul_sf. */
979 2, /* min_div_recip_mul_df. */
980 0, /* max_case_values. */
981 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
982 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
983 &tsv110_prefetch_tune
986 static const struct tune_params xgene1_tunings
=
989 &xgene1_addrcost_table
,
990 &xgene1_regmove_cost
,
992 &generic_branch_cost
,
993 &xgene1_approx_modes
,
994 SVE_NOT_IMPLEMENTED
, /* sve_width */
997 AARCH64_FUSE_NOTHING
, /* fusible_ops */
998 "16", /* function_align. */
999 "16", /* jump_align. */
1000 "16", /* loop_align. */
1001 2, /* int_reassoc_width. */
1002 4, /* fp_reassoc_width. */
1003 1, /* vec_reassoc_width. */
1004 2, /* min_div_recip_mul_sf. */
1005 2, /* min_div_recip_mul_df. */
1006 17, /* max_case_values. */
1007 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
1008 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
), /* tune_flags. */
1009 &xgene1_prefetch_tune
1012 static const struct tune_params emag_tunings
=
1014 &xgene1_extra_costs
,
1015 &xgene1_addrcost_table
,
1016 &xgene1_regmove_cost
,
1017 &xgene1_vector_cost
,
1018 &generic_branch_cost
,
1019 &xgene1_approx_modes
,
1020 SVE_NOT_IMPLEMENTED
,
1021 6, /* memmov_cost */
1023 AARCH64_FUSE_NOTHING
, /* fusible_ops */
1024 "16", /* function_align. */
1025 "16", /* jump_align. */
1026 "16", /* loop_align. */
1027 2, /* int_reassoc_width. */
1028 4, /* fp_reassoc_width. */
1029 1, /* vec_reassoc_width. */
1030 2, /* min_div_recip_mul_sf. */
1031 2, /* min_div_recip_mul_df. */
1032 17, /* max_case_values. */
1033 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
1034 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
), /* tune_flags. */
1035 &xgene1_prefetch_tune
1038 static const struct tune_params qdf24xx_tunings
=
1040 &qdf24xx_extra_costs
,
1041 &qdf24xx_addrcost_table
,
1042 &qdf24xx_regmove_cost
,
1043 &qdf24xx_vector_cost
,
1044 &generic_branch_cost
,
1045 &generic_approx_modes
,
1046 SVE_NOT_IMPLEMENTED
, /* sve_width */
1047 4, /* memmov_cost */
1049 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
1050 | AARCH64_FUSE_MOVK_MOVK
), /* fuseable_ops */
1051 "16", /* function_align. */
1052 "8", /* jump_align. */
1053 "16", /* loop_align. */
1054 2, /* int_reassoc_width. */
1055 4, /* fp_reassoc_width. */
1056 1, /* vec_reassoc_width. */
1057 2, /* min_div_recip_mul_sf. */
1058 2, /* min_div_recip_mul_df. */
1059 0, /* max_case_values. */
1060 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1061 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS
, /* tune_flags. */
1062 &qdf24xx_prefetch_tune
1065 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1067 static const struct tune_params saphira_tunings
=
1069 &generic_extra_costs
,
1070 &generic_addrcost_table
,
1071 &generic_regmove_cost
,
1072 &generic_vector_cost
,
1073 &generic_branch_cost
,
1074 &generic_approx_modes
,
1075 SVE_NOT_IMPLEMENTED
, /* sve_width */
1076 4, /* memmov_cost */
1078 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
1079 | AARCH64_FUSE_MOVK_MOVK
), /* fuseable_ops */
1080 "16", /* function_align. */
1081 "8", /* jump_align. */
1082 "16", /* loop_align. */
1083 2, /* int_reassoc_width. */
1084 4, /* fp_reassoc_width. */
1085 1, /* vec_reassoc_width. */
1086 2, /* min_div_recip_mul_sf. */
1087 2, /* min_div_recip_mul_df. */
1088 0, /* max_case_values. */
1089 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1090 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1091 &generic_prefetch_tune
1094 static const struct tune_params thunderx2t99_tunings
=
1096 &thunderx2t99_extra_costs
,
1097 &thunderx2t99_addrcost_table
,
1098 &thunderx2t99_regmove_cost
,
1099 &thunderx2t99_vector_cost
,
1100 &generic_branch_cost
,
1101 &generic_approx_modes
,
1102 SVE_NOT_IMPLEMENTED
, /* sve_width */
1103 4, /* memmov_cost. */
1104 4, /* issue_rate. */
1105 (AARCH64_FUSE_CMP_BRANCH
| AARCH64_FUSE_AES_AESMC
1106 | AARCH64_FUSE_ALU_BRANCH
), /* fusible_ops */
1107 "16", /* function_align. */
1108 "8", /* jump_align. */
1109 "16", /* loop_align. */
1110 3, /* int_reassoc_width. */
1111 2, /* fp_reassoc_width. */
1112 2, /* vec_reassoc_width. */
1113 2, /* min_div_recip_mul_sf. */
1114 2, /* min_div_recip_mul_df. */
1115 0, /* max_case_values. */
1116 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1117 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1118 &thunderx2t99_prefetch_tune
1121 static const struct tune_params neoversen1_tunings
=
1123 &cortexa57_extra_costs
,
1124 &generic_addrcost_table
,
1125 &generic_regmove_cost
,
1126 &cortexa57_vector_cost
,
1127 &generic_branch_cost
,
1128 &generic_approx_modes
,
1129 SVE_NOT_IMPLEMENTED
, /* sve_width */
1130 4, /* memmov_cost */
1132 AARCH64_FUSE_AES_AESMC
, /* fusible_ops */
1133 "32:16", /* function_align. */
1134 "32:16", /* jump_align. */
1135 "32:16", /* loop_align. */
1136 2, /* int_reassoc_width. */
1137 4, /* fp_reassoc_width. */
1138 2, /* vec_reassoc_width. */
1139 2, /* min_div_recip_mul_sf. */
1140 2, /* min_div_recip_mul_df. */
1141 0, /* max_case_values. */
1142 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1143 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1144 &generic_prefetch_tune
1147 /* Support for fine-grained override of the tuning structures. */
1148 struct aarch64_tuning_override_function
1151 void (*parse_override
)(const char*, struct tune_params
*);
1154 static void aarch64_parse_fuse_string (const char*, struct tune_params
*);
1155 static void aarch64_parse_tune_string (const char*, struct tune_params
*);
1156 static void aarch64_parse_sve_width_string (const char*, struct tune_params
*);
1158 static const struct aarch64_tuning_override_function
1159 aarch64_tuning_override_functions
[] =
1161 { "fuse", aarch64_parse_fuse_string
},
1162 { "tune", aarch64_parse_tune_string
},
1163 { "sve_width", aarch64_parse_sve_width_string
},
1167 /* A processor implementing AArch64. */
1170 const char *const name
;
1171 enum aarch64_processor ident
;
1172 enum aarch64_processor sched_core
;
1173 enum aarch64_arch arch
;
1174 unsigned architecture_version
;
1175 const uint64_t flags
;
1176 const struct tune_params
*const tune
;
1179 /* Architectures implementing AArch64. */
1180 static const struct processor all_architectures
[] =
1182 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1183 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1184 #include "aarch64-arches.def"
1185 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
1188 /* Processor cores implementing AArch64. */
1189 static const struct processor all_cores
[] =
1191 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1192 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1193 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1194 FLAGS, &COSTS##_tunings},
1195 #include "aarch64-cores.def"
1196 {"generic", generic
, cortexa53
, AARCH64_ARCH_8A
, 8,
1197 AARCH64_FL_FOR_ARCH8
, &generic_tunings
},
1198 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
1202 /* Target specification. These are populated by the -march, -mtune, -mcpu
1203 handling code or by target attributes. */
1204 static const struct processor
*selected_arch
;
1205 static const struct processor
*selected_cpu
;
1206 static const struct processor
*selected_tune
;
1208 enum aarch64_key_type aarch64_ra_sign_key
= AARCH64_KEY_A
;
1210 /* The current tuning set. */
1211 struct tune_params aarch64_tune_params
= generic_tunings
;
1213 /* Table of machine attributes. */
1214 static const struct attribute_spec aarch64_attribute_table
[] =
1216 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1217 affects_type_identity, handler, exclude } */
1218 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL
, NULL
},
1219 { NULL
, 0, 0, false, false, false, false, NULL
, NULL
}
1222 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1224 /* An ISA extension in the co-processor and main instruction set space. */
1225 struct aarch64_option_extension
1227 const char *const name
;
1228 const unsigned long flags_on
;
1229 const unsigned long flags_off
;
1232 typedef enum aarch64_cond_code
1234 AARCH64_EQ
= 0, AARCH64_NE
, AARCH64_CS
, AARCH64_CC
, AARCH64_MI
, AARCH64_PL
,
1235 AARCH64_VS
, AARCH64_VC
, AARCH64_HI
, AARCH64_LS
, AARCH64_GE
, AARCH64_LT
,
1236 AARCH64_GT
, AARCH64_LE
, AARCH64_AL
, AARCH64_NV
1240 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1242 struct aarch64_branch_protect_type
1244 /* The type's name that the user passes to the branch-protection option
1247 /* Function to handle the protection type and set global variables.
1248 First argument is the string token corresponding with this type and the
1249 second argument is the next token in the option string.
1251 * AARCH64_PARSE_OK: Handling was sucessful.
1252 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1253 should print an error.
1254 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1256 enum aarch64_parse_opt_result (*handler
)(char*, char*);
1257 /* A list of types that can follow this type in the option string. */
1258 const aarch64_branch_protect_type
* subtypes
;
1259 unsigned int num_subtypes
;
1262 static enum aarch64_parse_opt_result
1263 aarch64_handle_no_branch_protection (char* str
, char* rest
)
1265 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NONE
;
1266 aarch64_enable_bti
= 0;
1269 error ("unexpected %<%s%> after %<%s%>", rest
, str
);
1270 return AARCH64_PARSE_INVALID_FEATURE
;
1272 return AARCH64_PARSE_OK
;
1275 static enum aarch64_parse_opt_result
1276 aarch64_handle_standard_branch_protection (char* str
, char* rest
)
1278 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NON_LEAF
;
1279 aarch64_ra_sign_key
= AARCH64_KEY_A
;
1280 aarch64_enable_bti
= 1;
1283 error ("unexpected %<%s%> after %<%s%>", rest
, str
);
1284 return AARCH64_PARSE_INVALID_FEATURE
;
1286 return AARCH64_PARSE_OK
;
1289 static enum aarch64_parse_opt_result
1290 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED
,
1291 char* rest ATTRIBUTE_UNUSED
)
1293 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NON_LEAF
;
1294 aarch64_ra_sign_key
= AARCH64_KEY_A
;
1295 return AARCH64_PARSE_OK
;
1298 static enum aarch64_parse_opt_result
1299 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED
,
1300 char* rest ATTRIBUTE_UNUSED
)
1302 aarch64_ra_sign_scope
= AARCH64_FUNCTION_ALL
;
1303 return AARCH64_PARSE_OK
;
1306 static enum aarch64_parse_opt_result
1307 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED
,
1308 char* rest ATTRIBUTE_UNUSED
)
1310 aarch64_ra_sign_key
= AARCH64_KEY_B
;
1311 return AARCH64_PARSE_OK
;
1314 static enum aarch64_parse_opt_result
1315 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED
,
1316 char* rest ATTRIBUTE_UNUSED
)
1318 aarch64_enable_bti
= 1;
1319 return AARCH64_PARSE_OK
;
1322 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes
[] = {
1323 { "leaf", aarch64_handle_pac_ret_leaf
, NULL
, 0 },
1324 { "b-key", aarch64_handle_pac_ret_b_key
, NULL
, 0 },
1325 { NULL
, NULL
, NULL
, 0 }
1328 static const struct aarch64_branch_protect_type aarch64_branch_protect_types
[] = {
1329 { "none", aarch64_handle_no_branch_protection
, NULL
, 0 },
1330 { "standard", aarch64_handle_standard_branch_protection
, NULL
, 0 },
1331 { "pac-ret", aarch64_handle_pac_ret_protection
, aarch64_pac_ret_subtypes
,
1332 ARRAY_SIZE (aarch64_pac_ret_subtypes
) },
1333 { "bti", aarch64_handle_bti_protection
, NULL
, 0 },
1334 { NULL
, NULL
, NULL
, 0 }
1337 /* The condition codes of the processor, and the inverse function. */
1338 static const char * const aarch64_condition_codes
[] =
1340 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1341 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1344 /* The preferred condition codes for SVE conditions. */
1345 static const char *const aarch64_sve_condition_codes
[] =
1347 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1348 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1351 /* Return the assembly token for svpattern value VALUE. */
1354 svpattern_token (enum aarch64_svpattern pattern
)
1358 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1359 AARCH64_FOR_SVPATTERN (CASE
)
1361 case AARCH64_NUM_SVPATTERNS
:
1367 /* Generate code to enable conditional branches in functions over 1 MiB. */
1369 aarch64_gen_far_branch (rtx
* operands
, int pos_label
, const char * dest
,
1370 const char * branch_format
)
1372 rtx_code_label
* tmp_label
= gen_label_rtx ();
1373 char label_buf
[256];
1375 ASM_GENERATE_INTERNAL_LABEL (label_buf
, dest
,
1376 CODE_LABEL_NUMBER (tmp_label
));
1377 const char *label_ptr
= targetm
.strip_name_encoding (label_buf
);
1378 rtx dest_label
= operands
[pos_label
];
1379 operands
[pos_label
] = tmp_label
;
1381 snprintf (buffer
, sizeof (buffer
), "%s%s", branch_format
, label_ptr
);
1382 output_asm_insn (buffer
, operands
);
1384 snprintf (buffer
, sizeof (buffer
), "b\t%%l%d\n%s:", pos_label
, label_ptr
);
1385 operands
[pos_label
] = dest_label
;
1386 output_asm_insn (buffer
, operands
);
1391 aarch64_err_no_fpadvsimd (machine_mode mode
)
1393 if (TARGET_GENERAL_REGS_ONLY
)
1394 if (FLOAT_MODE_P (mode
))
1395 error ("%qs is incompatible with the use of floating-point types",
1396 "-mgeneral-regs-only");
1398 error ("%qs is incompatible with the use of vector types",
1399 "-mgeneral-regs-only");
1401 if (FLOAT_MODE_P (mode
))
1402 error ("%qs feature modifier is incompatible with the use of"
1403 " floating-point types", "+nofp");
1405 error ("%qs feature modifier is incompatible with the use of"
1406 " vector types", "+nofp");
1409 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1410 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1411 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1412 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1413 and GENERAL_REGS is lower than the memory cost (in this case the best class
1414 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its
1415 cost results in bad allocations with many redundant int<->FP moves which
1416 are expensive on various cores.
1417 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1418 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1419 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1420 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1421 The result of this is that it is no longer inefficient to have a higher
1422 memory move cost than the register move cost.
1426 aarch64_ira_change_pseudo_allocno_class (int regno
, reg_class_t allocno_class
,
1427 reg_class_t best_class
)
1431 if (!reg_class_subset_p (GENERAL_REGS
, allocno_class
)
1432 || !reg_class_subset_p (FP_REGS
, allocno_class
))
1433 return allocno_class
;
1435 if (!reg_class_subset_p (GENERAL_REGS
, best_class
)
1436 || !reg_class_subset_p (FP_REGS
, best_class
))
1439 mode
= PSEUDO_REGNO_MODE (regno
);
1440 return FLOAT_MODE_P (mode
) || VECTOR_MODE_P (mode
) ? FP_REGS
: GENERAL_REGS
;
1444 aarch64_min_divisions_for_recip_mul (machine_mode mode
)
1446 if (GET_MODE_UNIT_SIZE (mode
) == 4)
1447 return aarch64_tune_params
.min_div_recip_mul_sf
;
1448 return aarch64_tune_params
.min_div_recip_mul_df
;
1451 /* Return the reassociation width of treeop OPC with mode MODE. */
1453 aarch64_reassociation_width (unsigned opc
, machine_mode mode
)
1455 if (VECTOR_MODE_P (mode
))
1456 return aarch64_tune_params
.vec_reassoc_width
;
1457 if (INTEGRAL_MODE_P (mode
))
1458 return aarch64_tune_params
.int_reassoc_width
;
1459 /* Avoid reassociating floating point addition so we emit more FMAs. */
1460 if (FLOAT_MODE_P (mode
) && opc
!= PLUS_EXPR
)
1461 return aarch64_tune_params
.fp_reassoc_width
;
1465 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1467 aarch64_dbx_register_number (unsigned regno
)
1469 if (GP_REGNUM_P (regno
))
1470 return AARCH64_DWARF_R0
+ regno
- R0_REGNUM
;
1471 else if (regno
== SP_REGNUM
)
1472 return AARCH64_DWARF_SP
;
1473 else if (FP_REGNUM_P (regno
))
1474 return AARCH64_DWARF_V0
+ regno
- V0_REGNUM
;
1475 else if (PR_REGNUM_P (regno
))
1476 return AARCH64_DWARF_P0
+ regno
- P0_REGNUM
;
1477 else if (regno
== VG_REGNUM
)
1478 return AARCH64_DWARF_VG
;
1480 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1481 equivalent DWARF register. */
1482 return DWARF_FRAME_REGISTERS
;
1485 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1487 aarch64_advsimd_struct_mode_p (machine_mode mode
)
1490 && (mode
== OImode
|| mode
== CImode
|| mode
== XImode
));
1493 /* Return true if MODE is an SVE predicate mode. */
1495 aarch64_sve_pred_mode_p (machine_mode mode
)
1498 && (mode
== VNx16BImode
1499 || mode
== VNx8BImode
1500 || mode
== VNx4BImode
1501 || mode
== VNx2BImode
));
1504 /* Three mutually-exclusive flags describing a vector or predicate type. */
1505 const unsigned int VEC_ADVSIMD
= 1;
1506 const unsigned int VEC_SVE_DATA
= 2;
1507 const unsigned int VEC_SVE_PRED
= 4;
1508 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1509 a structure of 2, 3 or 4 vectors. */
1510 const unsigned int VEC_STRUCT
= 8;
1511 /* Useful combinations of the above. */
1512 const unsigned int VEC_ANY_SVE
= VEC_SVE_DATA
| VEC_SVE_PRED
;
1513 const unsigned int VEC_ANY_DATA
= VEC_ADVSIMD
| VEC_SVE_DATA
;
1515 /* Return a set of flags describing the vector properties of mode MODE.
1516 Ignore modes that are not supported by the current target. */
1518 aarch64_classify_vector_mode (machine_mode mode
)
1520 if (aarch64_advsimd_struct_mode_p (mode
))
1521 return VEC_ADVSIMD
| VEC_STRUCT
;
1523 if (aarch64_sve_pred_mode_p (mode
))
1524 return VEC_SVE_PRED
;
1526 /* Make the decision based on the mode's enum value rather than its
1527 properties, so that we keep the correct classification regardless
1528 of -msve-vector-bits. */
1531 /* Single SVE vectors. */
1539 return TARGET_SVE
? VEC_SVE_DATA
: 0;
1541 /* x2 SVE vectors. */
1549 /* x3 SVE vectors. */
1557 /* x4 SVE vectors. */
1565 return TARGET_SVE
? VEC_SVE_DATA
| VEC_STRUCT
: 0;
1567 /* 64-bit Advanced SIMD vectors. */
1571 /* ...E_V1DImode doesn't exist. */
1575 /* 128-bit Advanced SIMD vectors. */
1583 return TARGET_SIMD
? VEC_ADVSIMD
: 0;
1590 /* Return true if MODE is any of the data vector modes, including
1593 aarch64_vector_data_mode_p (machine_mode mode
)
1595 return aarch64_classify_vector_mode (mode
) & VEC_ANY_DATA
;
1598 /* Return true if MODE is an SVE data vector mode; either a single vector
1599 or a structure of vectors. */
1601 aarch64_sve_data_mode_p (machine_mode mode
)
1603 return aarch64_classify_vector_mode (mode
) & VEC_SVE_DATA
;
1606 /* Implement target hook TARGET_ARRAY_MODE. */
1607 static opt_machine_mode
1608 aarch64_array_mode (machine_mode mode
, unsigned HOST_WIDE_INT nelems
)
1610 if (aarch64_classify_vector_mode (mode
) == VEC_SVE_DATA
1611 && IN_RANGE (nelems
, 2, 4))
1612 return mode_for_vector (GET_MODE_INNER (mode
),
1613 GET_MODE_NUNITS (mode
) * nelems
);
1615 return opt_machine_mode ();
1618 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1620 aarch64_array_mode_supported_p (machine_mode mode
,
1621 unsigned HOST_WIDE_INT nelems
)
1624 && (AARCH64_VALID_SIMD_QREG_MODE (mode
)
1625 || AARCH64_VALID_SIMD_DREG_MODE (mode
))
1626 && (nelems
>= 2 && nelems
<= 4))
1632 /* Return the SVE predicate mode to use for elements that have
1633 ELEM_NBYTES bytes, if such a mode exists. */
1636 aarch64_sve_pred_mode (unsigned int elem_nbytes
)
1640 if (elem_nbytes
== 1)
1642 if (elem_nbytes
== 2)
1644 if (elem_nbytes
== 4)
1646 if (elem_nbytes
== 8)
1649 return opt_machine_mode ();
1652 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1654 static opt_machine_mode
1655 aarch64_get_mask_mode (poly_uint64 nunits
, poly_uint64 nbytes
)
1657 if (TARGET_SVE
&& known_eq (nbytes
, BYTES_PER_SVE_VECTOR
))
1659 unsigned int elem_nbytes
= vector_element_size (nbytes
, nunits
);
1660 machine_mode pred_mode
;
1661 if (aarch64_sve_pred_mode (elem_nbytes
).exists (&pred_mode
))
1665 return default_get_mask_mode (nunits
, nbytes
);
1668 /* Return the integer element mode associated with SVE mode MODE. */
1670 static scalar_int_mode
1671 aarch64_sve_element_int_mode (machine_mode mode
)
1673 unsigned int elt_bits
= vector_element_size (BITS_PER_SVE_VECTOR
,
1674 GET_MODE_NUNITS (mode
));
1675 return int_mode_for_size (elt_bits
, 0).require ();
1678 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1679 prefer to use the first arithmetic operand as the else value if
1680 the else value doesn't matter, since that exactly matches the SVE
1681 destructive merging form. For ternary operations we could either
1682 pick the first operand and use FMAD-like instructions or the last
1683 operand and use FMLA-like instructions; the latter seems more
1687 aarch64_preferred_else_value (unsigned, tree
, unsigned int nops
, tree
*ops
)
1689 return nops
== 3 ? ops
[2] : ops
[0];
1692 /* Implement TARGET_HARD_REGNO_NREGS. */
1695 aarch64_hard_regno_nregs (unsigned regno
, machine_mode mode
)
1697 /* ??? Logically we should only need to provide a value when
1698 HARD_REGNO_MODE_OK says that the combination is valid,
1699 but at the moment we need to handle all modes. Just ignore
1700 any runtime parts for registers that can't store them. */
1701 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
1702 switch (aarch64_regno_regclass (regno
))
1707 if (aarch64_sve_data_mode_p (mode
))
1708 return exact_div (GET_MODE_SIZE (mode
),
1709 BYTES_PER_SVE_VECTOR
).to_constant ();
1710 return CEIL (lowest_size
, UNITS_PER_VREG
);
1716 return CEIL (lowest_size
, UNITS_PER_WORD
);
1721 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1724 aarch64_hard_regno_mode_ok (unsigned regno
, machine_mode mode
)
1726 if (GET_MODE_CLASS (mode
) == MODE_CC
)
1727 return regno
== CC_REGNUM
;
1729 if (regno
== VG_REGNUM
)
1730 /* This must have the same size as _Unwind_Word. */
1731 return mode
== DImode
;
1733 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1734 if (vec_flags
& VEC_SVE_PRED
)
1735 return PR_REGNUM_P (regno
);
1737 if (PR_REGNUM_P (regno
))
1740 if (regno
== SP_REGNUM
)
1741 /* The purpose of comparing with ptr_mode is to support the
1742 global register variable associated with the stack pointer
1743 register via the syntax of asm ("wsp") in ILP32. */
1744 return mode
== Pmode
|| mode
== ptr_mode
;
1746 if (regno
== FRAME_POINTER_REGNUM
|| regno
== ARG_POINTER_REGNUM
)
1747 return mode
== Pmode
;
1749 if (GP_REGNUM_P (regno
))
1751 if (known_le (GET_MODE_SIZE (mode
), 8))
1753 else if (known_le (GET_MODE_SIZE (mode
), 16))
1754 return (regno
& 1) == 0;
1756 else if (FP_REGNUM_P (regno
))
1758 if (vec_flags
& VEC_STRUCT
)
1759 return end_hard_regno (mode
, regno
) - 1 <= V31_REGNUM
;
1761 return !VECTOR_MODE_P (mode
) || vec_flags
!= 0;
1767 /* Return true if this is a definition of a vectorized simd function. */
1770 aarch64_simd_decl_p (tree fndecl
)
1776 fntype
= TREE_TYPE (fndecl
);
1780 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1781 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype
)) != NULL
)
1787 /* Return the mode a register save/restore should use. DImode for integer
1788 registers, DFmode for FP registers in non-SIMD functions (they only save
1789 the bottom half of a 128 bit register), or TFmode for FP registers in
1793 aarch64_reg_save_mode (tree fndecl
, unsigned regno
)
1795 return GP_REGNUM_P (regno
)
1797 : (aarch64_simd_decl_p (fndecl
) ? E_TFmode
: E_DFmode
);
1800 /* Return true if the instruction is a call to a SIMD function, false
1801 if it is not a SIMD function or if we do not know anything about
1805 aarch64_simd_call_p (rtx_insn
*insn
)
1811 gcc_assert (CALL_P (insn
));
1812 call
= get_call_rtx_from (insn
);
1813 symbol
= XEXP (XEXP (call
, 0), 0);
1814 if (GET_CODE (symbol
) != SYMBOL_REF
)
1816 fndecl
= SYMBOL_REF_DECL (symbol
);
1820 return aarch64_simd_decl_p (fndecl
);
1823 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1824 a function that uses the SIMD ABI, take advantage of the extra
1825 call-preserved registers that the ABI provides. */
1828 aarch64_remove_extra_call_preserved_regs (rtx_insn
*insn
,
1829 HARD_REG_SET
*return_set
)
1831 if (aarch64_simd_call_p (insn
))
1833 for (int regno
= 0; regno
< FIRST_PSEUDO_REGISTER
; regno
++)
1834 if (FP_SIMD_SAVED_REGNUM_P (regno
))
1835 CLEAR_HARD_REG_BIT (*return_set
, regno
);
1839 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1840 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1841 clobbers the top 64 bits when restoring the bottom 64 bits. */
1844 aarch64_hard_regno_call_part_clobbered (rtx_insn
*insn
, unsigned int regno
,
1847 bool simd_p
= insn
&& CALL_P (insn
) && aarch64_simd_call_p (insn
);
1848 return FP_REGNUM_P (regno
)
1849 && maybe_gt (GET_MODE_SIZE (mode
), simd_p
? 16 : 8);
1852 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1855 aarch64_return_call_with_max_clobbers (rtx_insn
*call_1
, rtx_insn
*call_2
)
1857 gcc_assert (CALL_P (call_1
) && CALL_P (call_2
));
1859 if (!aarch64_simd_call_p (call_1
) || aarch64_simd_call_p (call_2
))
1865 /* Implement REGMODE_NATURAL_SIZE. */
1867 aarch64_regmode_natural_size (machine_mode mode
)
1869 /* The natural size for SVE data modes is one SVE data vector,
1870 and similarly for predicates. We can't independently modify
1871 anything smaller than that. */
1872 /* ??? For now, only do this for variable-width SVE registers.
1873 Doing it for constant-sized registers breaks lower-subreg.c. */
1874 /* ??? And once that's fixed, we should probably have similar
1875 code for Advanced SIMD. */
1876 if (!aarch64_sve_vg
.is_constant ())
1878 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1879 if (vec_flags
& VEC_SVE_PRED
)
1880 return BYTES_PER_SVE_PRED
;
1881 if (vec_flags
& VEC_SVE_DATA
)
1882 return BYTES_PER_SVE_VECTOR
;
1884 return UNITS_PER_WORD
;
1887 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1889 aarch64_hard_regno_caller_save_mode (unsigned regno
, unsigned,
1892 /* The predicate mode determines which bits are significant and
1893 which are "don't care". Decreasing the number of lanes would
1894 lose data while increasing the number of lanes would make bits
1895 unnecessarily significant. */
1896 if (PR_REGNUM_P (regno
))
1898 if (known_ge (GET_MODE_SIZE (mode
), 4))
1904 /* Return true if I's bits are consecutive ones from the MSB. */
1906 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i
)
1908 return exact_log2 (-i
) != HOST_WIDE_INT_M1
;
1911 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1912 that strcpy from constants will be faster. */
1914 static HOST_WIDE_INT
1915 aarch64_constant_alignment (const_tree exp
, HOST_WIDE_INT align
)
1917 if (TREE_CODE (exp
) == STRING_CST
&& !optimize_size
)
1918 return MAX (align
, BITS_PER_WORD
);
1922 /* Return true if calls to DECL should be treated as
1923 long-calls (ie called via a register). */
1925 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED
)
1930 /* Return true if calls to symbol-ref SYM should be treated as
1931 long-calls (ie called via a register). */
1933 aarch64_is_long_call_p (rtx sym
)
1935 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym
));
1938 /* Return true if calls to symbol-ref SYM should not go through
1942 aarch64_is_noplt_call_p (rtx sym
)
1944 const_tree decl
= SYMBOL_REF_DECL (sym
);
1949 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl
)))
1950 && !targetm
.binds_local_p (decl
))
1956 /* Return true if the offsets to a zero/sign-extract operation
1957 represent an expression that matches an extend operation. The
1958 operands represent the paramters from
1960 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1962 aarch64_is_extend_from_extract (scalar_int_mode mode
, rtx mult_imm
,
1965 HOST_WIDE_INT mult_val
, extract_val
;
1967 if (! CONST_INT_P (mult_imm
) || ! CONST_INT_P (extract_imm
))
1970 mult_val
= INTVAL (mult_imm
);
1971 extract_val
= INTVAL (extract_imm
);
1974 && extract_val
< GET_MODE_BITSIZE (mode
)
1975 && exact_log2 (extract_val
& ~7) > 0
1976 && (extract_val
& 7) <= 4
1977 && mult_val
== (1 << (extract_val
& 7)))
1983 /* Emit an insn that's a simple single-set. Both the operands must be
1984 known to be valid. */
1985 inline static rtx_insn
*
1986 emit_set_insn (rtx x
, rtx y
)
1988 return emit_insn (gen_rtx_SET (x
, y
));
1991 /* X and Y are two things to compare using CODE. Emit the compare insn and
1992 return the rtx for register 0 in the proper mode. */
1994 aarch64_gen_compare_reg (RTX_CODE code
, rtx x
, rtx y
)
1996 machine_mode mode
= SELECT_CC_MODE (code
, x
, y
);
1997 rtx cc_reg
= gen_rtx_REG (mode
, CC_REGNUM
);
1999 emit_set_insn (cc_reg
, gen_rtx_COMPARE (mode
, x
, y
));
2003 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2006 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code
, rtx x
, rtx y
,
2007 machine_mode y_mode
)
2009 if (y_mode
== E_QImode
|| y_mode
== E_HImode
)
2011 if (CONST_INT_P (y
))
2012 y
= GEN_INT (INTVAL (y
) & GET_MODE_MASK (y_mode
));
2016 machine_mode cc_mode
;
2018 t
= gen_rtx_ZERO_EXTEND (SImode
, y
);
2019 t
= gen_rtx_COMPARE (CC_SWPmode
, t
, x
);
2020 cc_mode
= CC_SWPmode
;
2021 cc_reg
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
2022 emit_set_insn (cc_reg
, t
);
2027 return aarch64_gen_compare_reg (code
, x
, y
);
2030 /* Build the SYMBOL_REF for __tls_get_addr. */
2032 static GTY(()) rtx tls_get_addr_libfunc
;
2035 aarch64_tls_get_addr (void)
2037 if (!tls_get_addr_libfunc
)
2038 tls_get_addr_libfunc
= init_one_libfunc ("__tls_get_addr");
2039 return tls_get_addr_libfunc
;
2042 /* Return the TLS model to use for ADDR. */
2044 static enum tls_model
2045 tls_symbolic_operand_type (rtx addr
)
2047 enum tls_model tls_kind
= TLS_MODEL_NONE
;
2048 if (GET_CODE (addr
) == CONST
)
2051 rtx sym
= strip_offset (addr
, &addend
);
2052 if (GET_CODE (sym
) == SYMBOL_REF
)
2053 tls_kind
= SYMBOL_REF_TLS_MODEL (sym
);
2055 else if (GET_CODE (addr
) == SYMBOL_REF
)
2056 tls_kind
= SYMBOL_REF_TLS_MODEL (addr
);
2061 /* We'll allow lo_sum's in addresses in our legitimate addresses
2062 so that combine would take care of combining addresses where
2063 necessary, but for generation purposes, we'll generate the address
2066 tmp = hi (symbol_ref); adrp x1, foo
2067 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2071 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2072 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2076 Load TLS symbol, depending on TLS mechanism and TLS access model.
2078 Global Dynamic - Traditional TLS:
2079 adrp tmp, :tlsgd:imm
2080 add dest, tmp, #:tlsgd_lo12:imm
2083 Global Dynamic - TLS Descriptors:
2084 adrp dest, :tlsdesc:imm
2085 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2086 add dest, dest, #:tlsdesc_lo12:imm
2093 adrp tmp, :gottprel:imm
2094 ldr dest, [tmp, #:gottprel_lo12:imm]
2099 add t0, tp, #:tprel_hi12:imm, lsl #12
2100 add t0, t0, #:tprel_lo12_nc:imm
aarch64_load_symref_appropriately (rtx dest, rtx imm,
                                   enum aarch64_symbol_type type)
    case SYMBOL_SMALL_ABSOLUTE:
        /* In ILP32, the mode of dest can be either SImode or DImode.  */
        machine_mode mode = GET_MODE (dest);
        gcc_assert (mode == Pmode || mode == ptr_mode);
        if (can_create_pseudo_p ())
          tmp_reg = gen_reg_rtx (mode);
        emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
        emit_insn (gen_add_losym (dest, tmp_reg, imm));

    case SYMBOL_TINY_ABSOLUTE:
      emit_insn (gen_rtx_SET (dest, imm));

    case SYMBOL_SMALL_GOT_28K:
        machine_mode mode = GET_MODE (dest);
        rtx gp_rtx = pic_offset_table_rtx;

        /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
           here before rtl expand.  Tree IVOPT will generate rtl pattern to
           decide rtx costs, in which case pic_offset_table_rtx is not
           initialized.  For that case no need to generate the first adrp
           instruction as the final cost for global variable access is
           one instruction.  */

        /* -fpic for -mcmodel=small allows a 32K GOT table size (but we are
           using the page base as GOT base, the first page may be wasted,
           in the worst scenario, there is only 28K space for GOT).

           The generated instruction sequence for accessing a global
           variable is:

             ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]

           Only one instruction is needed.  But we must initialize
           pic_offset_table_rtx properly.  We generate an initialization
           insn for every global access, and allow CSE to remove all
           redundant copies.

           The final instruction sequences will look like the following
           for multiple global variable accesses:

             adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_

             ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
             ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
             ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]  */

        rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
        crtl->uses_pic_offset_table = 1;
        emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));

        if (mode != GET_MODE (gp_rtx))
          gp_rtx = gen_lowpart (mode, gp_rtx);

        if (mode == ptr_mode)
            insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
            insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
            mem = XVECEXP (SET_SRC (insn), 0, 0);
            gcc_assert (mode == Pmode);
            insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
            mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);

        /* The operand is expected to be a MEM.  Whenever the related insn
           pattern changes, the code above which calculates MEM should be
           updated.  */
        gcc_assert (GET_CODE (mem) == MEM);
        MEM_READONLY_P (mem) = 1;
        MEM_NOTRAP_P (mem) = 1;

    case SYMBOL_SMALL_GOT_4G:
        /* In ILP32, the mode of dest can be either SImode or DImode,
           while the got entry is always of SImode size.  The mode of
           dest depends on how dest is used: if dest is assigned to a
           pointer (e.g. in the memory), it has SImode; it may have
           DImode if dest is dereferenced to access the memory.
           This is why we have to handle three different ldr_got_small
           patterns here (two patterns for ILP32).  */
        machine_mode mode = GET_MODE (dest);
        if (can_create_pseudo_p ())
          tmp_reg = gen_reg_rtx (mode);
        emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
        if (mode == ptr_mode)
            insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
            insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
            mem = XVECEXP (SET_SRC (insn), 0, 0);
            gcc_assert (mode == Pmode);
            insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
            mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
        gcc_assert (GET_CODE (mem) == MEM);
        MEM_READONLY_P (mem) = 1;
        MEM_NOTRAP_P (mem) = 1;

    case SYMBOL_SMALL_TLSGD:
        machine_mode mode = GET_MODE (dest);
        rtx result = gen_rtx_REG (mode, R0_REGNUM);
          aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
          aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
        insns = get_insns ();
        RTL_CONST_CALL_P (insns) = 1;
        emit_libcall_block (insns, dest, result, imm);

    case SYMBOL_SMALL_TLSDESC:
        machine_mode mode = GET_MODE (dest);
        rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
        gcc_assert (mode == Pmode || mode == ptr_mode);

        /* In ILP32, the got entry is always of SImode size.  Unlike
           small GOT, the dest is fixed at reg 0.  */
          emit_insn (gen_tlsdesc_small_si (imm));
          emit_insn (gen_tlsdesc_small_di (imm));
        tp = aarch64_load_tp (NULL);
          tp = gen_lowpart (mode, tp);
        emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
        set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);

    case SYMBOL_SMALL_TLSIE:
        /* In ILP32, the mode of dest can be either SImode or DImode,
           while the got entry is always of SImode size.  The mode of
           dest depends on how dest is used: if dest is assigned to a
           pointer (e.g. in the memory), it has SImode; it may have
           DImode if dest is dereferenced to access the memory.
           This is why we have to handle three different tlsie_small
           patterns here (two patterns for ILP32).  */
        machine_mode mode = GET_MODE (dest);
        rtx tmp_reg = gen_reg_rtx (mode);
        rtx tp = aarch64_load_tp (NULL);
        if (mode == ptr_mode)
              emit_insn (gen_tlsie_small_di (tmp_reg, imm));
              emit_insn (gen_tlsie_small_si (tmp_reg, imm));
              tp = gen_lowpart (mode, tp);
            gcc_assert (mode == Pmode);
            emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
        emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
        set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);

    case SYMBOL_TLSLE12:
    case SYMBOL_TLSLE24:
    case SYMBOL_TLSLE32:
    case SYMBOL_TLSLE48:
        machine_mode mode = GET_MODE (dest);
        rtx tp = aarch64_load_tp (NULL);
          tp = gen_lowpart (mode, tp);
          case SYMBOL_TLSLE12:
            emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
          case SYMBOL_TLSLE24:
            emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
          case SYMBOL_TLSLE32:
            emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
            emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
          case SYMBOL_TLSLE48:
            emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
            emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
        set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);

    case SYMBOL_TINY_GOT:
      emit_insn (gen_ldr_got_tiny (dest, imm));

    case SYMBOL_TINY_TLSIE:
        machine_mode mode = GET_MODE (dest);
        rtx tp = aarch64_load_tp (NULL);
        if (mode == ptr_mode)
              emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
              tp = gen_lowpart (mode, tp);
              emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
            gcc_assert (mode == Pmode);
            emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
        set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
/* Emit a move from SRC to DEST.  Assume that the move expanders can
   handle all moves if !can_create_pseudo_p ().  The distinction is
   important because, unlike emit_move_insn, the move expanders know
   how to force Pmode objects into the constant pool even when the
   constant pool address is not itself legitimate.  */
aarch64_emit_move (rtx dest, rtx src)
  return (can_create_pseudo_p ()
          ? emit_move_insn (dest, src)
          : emit_move_insn_1 (dest, src));

/* Apply UNOPTAB to OP and store the result in DEST.  */
aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
  rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
    emit_move_insn (dest, tmp);

/* Apply BINOPTAB to OP0 and OP1 and store the result in DEST.  */
aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
  rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
                          OPTAB_DIRECT);
    emit_move_insn (dest, tmp);
/* Split a 128-bit move operation into two 64-bit move operations,
   taking care to handle partial overlap of register to register
   copies.  Special cases are needed when moving between GP regs and
   FP regs.  SRC can be a register, constant or memory; DST a register
   or memory.  If either operand is memory it must not have any side
   effects.  */
aarch64_split_128bit_move (rtx dst, rtx src)
  machine_mode mode = GET_MODE (dst);

  gcc_assert (mode == TImode || mode == TFmode);
  gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
  gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);

  if (REG_P (dst) && REG_P (src))
      int src_regno = REGNO (src);
      int dst_regno = REGNO (dst);

      /* Handle FP <-> GP regs.  */
      if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
          src_lo = gen_lowpart (word_mode, src);
          src_hi = gen_highpart (word_mode, src);

          emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
          emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
      else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
          dst_lo = gen_lowpart (word_mode, dst);
          dst_hi = gen_highpart (word_mode, dst);

          emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
          emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));

  dst_lo = gen_lowpart (word_mode, dst);
  dst_hi = gen_highpart (word_mode, dst);
  src_lo = gen_lowpart (word_mode, src);
  src_hi = gen_highpart_mode (word_mode, mode, src);

  /* At most one pairing may overlap.  */
  if (reg_overlap_mentioned_p (dst_lo, src_hi))
      aarch64_emit_move (dst_hi, src_hi);
      aarch64_emit_move (dst_lo, src_lo);
  else
      aarch64_emit_move (dst_lo, src_lo);
      aarch64_emit_move (dst_hi, src_hi);

aarch64_split_128bit_move_p (rtx dst, rtx src)
  return (! REG_P (src)
          || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
/* Split a complex SIMD combine.  */
aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
  machine_mode src_mode = GET_MODE (src1);
  machine_mode dst_mode = GET_MODE (dst);

  gcc_assert (VECTOR_MODE_P (dst_mode));
  gcc_assert (register_operand (dst, dst_mode)
              && register_operand (src1, src_mode)
              && register_operand (src2, src_mode));

  emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));

/* Split a complex SIMD move.  */
aarch64_split_simd_move (rtx dst, rtx src)
  machine_mode src_mode = GET_MODE (src);
  machine_mode dst_mode = GET_MODE (dst);

  gcc_assert (VECTOR_MODE_P (dst_mode));

  if (REG_P (dst) && REG_P (src))
      gcc_assert (VECTOR_MODE_P (src_mode));
      emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
                              machine_mode ymode, rtx y)
  rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
  gcc_assert (r != NULL);
  return rtx_equal_p (x, r);

/* Return TARGET if it is nonnull and a register of mode MODE.
   Otherwise, return a fresh register of mode MODE if we can,
   or TARGET reinterpreted as MODE if we can't.  */
aarch64_target_reg (rtx target, machine_mode mode)
  if (target && REG_P (target) && GET_MODE (target) == mode)
    return target;
  if (!can_create_pseudo_p ())
      gcc_assert (target);
      return gen_lowpart (mode, target);
  return gen_reg_rtx (mode);

/* Return a register that contains the constant in BUILDER, given that
   the constant is a legitimate move operand.  Use TARGET as the register
   if it is nonnull and convenient.  */
aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
  rtx src = builder.build ();
  target = aarch64_target_reg (target, GET_MODE (src));
  emit_insn (gen_rtx_SET (target, src));

aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
  if (can_create_pseudo_p ())
    return force_reg (mode, value);
      aarch64_emit_move (x, value);
/* Return true if predicate value X is a constant in which every element
   is a CONST_INT.  When returning true, describe X in BUILDER as a VNx16BI
   value, i.e. as a predicate in which all bits are significant.  */
aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
  if (GET_CODE (x) != CONST_VECTOR)

  unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
                                             GET_MODE_NUNITS (GET_MODE (x)));
  unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
  unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
  builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);

  unsigned int nelts = const_vector_encoded_nelts (x);
  for (unsigned int i = 0; i < nelts; ++i)
      rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
      if (!CONST_INT_P (elt))
      builder.quick_push (elt);
      for (unsigned int j = 1; j < factor; ++j)
        builder.quick_push (const0_rtx);
  builder.finalize ();
/* BUILDER contains a predicate constant of mode VNx16BI.  Return the
   widest predicate element size it can have (that is, the largest size
   for which each element would still be 0 or 1).  */
aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
  /* Start with the most optimistic assumption: that we only need
     one bit per pattern.  This is what we will use if only the first
     bit in each pattern is ever set.  */
  unsigned int mask = GET_MODE_SIZE (DImode);
  mask |= builder.npatterns ();

  /* Look for set bits.  */
  unsigned int nelts = builder.encoded_nelts ();
  for (unsigned int i = 1; i < nelts; ++i)
    if (INTVAL (builder.elt (i)) != 0)

  return mask & -mask;
/* BUILDER is a predicate constant of mode VNx16BI.  Consider the value
   that the constant would have with predicate element size ELT_SIZE
   (ignoring the upper bits in each element) and return:

   * -1 if all bits are set
   * N if the predicate has N leading set bits followed by all clear bits
   * 0 if the predicate does not have any of these forms.  */
aarch64_partial_ptrue_length (rtx_vector_builder &builder,
                              unsigned int elt_size)
  /* If nelts_per_pattern is 3, we have set bits followed by clear bits
     followed by set bits.  */
  if (builder.nelts_per_pattern () == 3)

  /* Skip over leading set bits.  */
  unsigned int nelts = builder.encoded_nelts ();
  for (; i < nelts; i += elt_size)
    if (INTVAL (builder.elt (i)) == 0)
  unsigned int vl = i / elt_size;

  /* Check for the all-true case.  */

  /* If nelts_per_pattern is 1, then either VL is zero, or we have a
     repeating pattern of set bits followed by clear bits.  */
  if (builder.nelts_per_pattern () != 2)

  /* We have a "foreground" value and a duplicated "background" value.
     If the background might repeat and the last set bit belongs to it,
     we might have set bits followed by clear bits followed by set bits.  */
  if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))

  /* Make sure that the rest are all clear.  */
  for (; i < nelts; i += elt_size)
    if (INTVAL (builder.elt (i)) != 0)
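/* Illustrative example, not from the original source: with ELT_SIZE == 2
   (.H elements), a VNx16BI constant whose encoded bits begin
   { 1,0, 1,0, 1,0, 0,0, ... all clear } has three leading set .H elements
   followed by clear ones, so the function above would return 3; an
   all-set constant returns -1, and anything else (for instance a clear
   element followed by a set one) returns 0.  */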
/* See if there is an svpattern that encodes an SVE predicate of mode
   PRED_MODE in which the first VL bits are set and the rest are clear.
   Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
   A VL of -1 indicates an all-true vector.  */
aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
    return AARCH64_SV_ALL;

  if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
    return AARCH64_NUM_SVPATTERNS;

  if (vl >= 1 && vl <= 8)
    return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));

  if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
    return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));

  if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
      if (vl == (max_vl / 3) * 3)
        return AARCH64_SV_MUL3;
      /* These would only trigger for non-power-of-2 lengths.  */
      if (vl == (max_vl & -4))
        return AARCH64_SV_MUL4;
      if (vl == (1 << floor_log2 (max_vl)))
        return AARCH64_SV_POW2;
        return AARCH64_SV_ALL;
  return AARCH64_NUM_SVPATTERNS;
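/* Illustrative example, not from the original source: a VL of 7 maps to
   AARCH64_SV_VL7, a VL of 32 maps to AARCH64_SV_VL32 (a power of two in
   [16, 256]), and a VL of -1 maps to AARCH64_SV_ALL; a VL such as 11 that
   matches none of the patterns yields AARCH64_NUM_SVPATTERNS, in which
   case callers fall back to a WHILE-based sequence.  */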
/* Return a VNx16BImode constant in which every sequence of ELT_SIZE
   bits has the lowest bit set and the upper bits clear.  This is the
   VNx16BImode equivalent of a PTRUE for controlling elements of
   ELT_SIZE bytes.  However, because the constant is VNx16BImode,
   all bits are significant, even the upper zeros.  */
aarch64_ptrue_all (unsigned int elt_size)
  rtx_vector_builder builder (VNx16BImode, elt_size, 1);
  builder.quick_push (const1_rtx);
  for (unsigned int i = 1; i < elt_size; ++i)
    builder.quick_push (const0_rtx);
  return builder.build ();
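/* Illustrative example, not from the original source: aarch64_ptrue_all (4)
   builds the repeating VNx16BI pattern { 1, 0, 0, 0, 1, 0, 0, 0, ... },
   i.e. the value a PTRUE with a .S element size would leave in a predicate
   register, but with the upper bits of each element known to be zero
   rather than undefined.  */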
/* Return an all-true predicate register of mode MODE.  */
aarch64_ptrue_reg (machine_mode mode)
  gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
  rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
  return gen_lowpart (mode, reg);

/* Return an all-false predicate register of mode MODE.  */
aarch64_pfalse_reg (machine_mode mode)
  gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
  rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
  return gen_lowpart (mode, reg);
/* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
   true, or alternatively if we know that the operation predicated by
   PRED1[0] is safe to perform whenever PRED2 is true.  PRED1[1] is an
   aarch64_sve_gp_strictness operand that describes the operation
   predicated by PRED1[0].  */
aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
  machine_mode mode = GET_MODE (pred2);
  gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
              && mode == GET_MODE (pred1[0])
              && aarch64_sve_gp_strictness (pred1[1], SImode));
  return (pred1[0] == CONSTM1_RTX (mode)
          || INTVAL (pred1[1]) == SVE_RELAXED_GP
          || rtx_equal_p (pred1[0], pred2));

/* Use a comparison to convert integer vector SRC into MODE, which is
   the corresponding SVE predicate mode.  Use TARGET for the result
   if it's nonnull and convenient.  */
aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
  machine_mode src_mode = GET_MODE (src);
  insn_code icode = code_for_aarch64_pred_cmp (NE, src_mode);
  expand_operand ops[4];
  create_output_operand (&ops[0], target, mode);
  create_input_operand (&ops[1], CONSTM1_RTX (mode), mode);
  create_input_operand (&ops[2], src, src_mode);
  create_input_operand (&ops[3], CONST0_RTX (src_mode), src_mode);
  expand_insn (icode, 4, ops);
  return ops[0].value;
/* Return true if we can move VALUE into a register using a single
   CNT[BHWD] instruction.  */
aarch64_sve_cnt_immediate_p (poly_int64 value)
  HOST_WIDE_INT factor = value.coeffs[0];
  /* The coefficient must be [1, 16] * {2, 4, 8, 16}.  */
  return (value.coeffs[1] == factor
          && IN_RANGE (factor, 2, 16 * 16)
          && (factor & 1) == 0
          && factor <= 16 * (factor & -factor));
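/* Illustrative example, not from the original source: a poly_int64 of
   (16, 16) -- 16 per 128-bit granule, the value CNTB returns -- passes
   the test above, as does (6, 6) (CNTD with a multiplier of 3, since
   6 = 2 * 3), whereas (3, 3) fails the evenness check and (512, 512)
   falls outside the [2, 256] range.  */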
/* Likewise for rtx X.  */
aarch64_sve_cnt_immediate_p (rtx x)
  return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);

/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  FACTOR is the number of quadwords.
   NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
   If it is zero, we can use any element size.  */
aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
                                  unsigned int factor,
                                  unsigned int nelts_per_vq)
  static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];

  if (nelts_per_vq == 0)
    /* There is some overlap in the ranges of the four CNT instructions.
       Here we always use the smallest possible element size, so that the
       multiplier is 1 wherever possible.  */
    nelts_per_vq = factor & -factor;
  int shift = std::min (exact_log2 (nelts_per_vq), 4);
  gcc_assert (IN_RANGE (shift, 1, 4));
  char suffix = "dwhb"[shift - 1];

  unsigned int written;
    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
                        prefix, suffix, operands);
    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
                        prefix, suffix, operands, factor);
  gcc_assert (written < sizeof (buffer));
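/* Illustrative example, not from the original source: with PREFIX "cnt"
   and OPERANDS "%x0", a factor of 16 (one CNTB unit per granule) comes out
   as "cntb\t%x0", while a factor of 24 selects the .H element size and a
   multiplier of 3, giving "cnth\t%x0, all, mul #3".  The choice between
   the plain form and the ", all, mul #N" form depends on the multiplier
   computed above.  */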
/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  X is the value of the vector size operand,
   as a polynomial integer rtx.  */
aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
                                  rtx x)
  poly_int64 value = rtx_to_poly_int64 (x);
  gcc_assert (aarch64_sve_cnt_immediate_p (value));
  return aarch64_output_sve_cnt_immediate (prefix, operands,
                                           value.coeffs[1], 0);

/* Return true if we can add VALUE to a register using a single ADDVL
   or ADDPL instruction.  */
aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
  HOST_WIDE_INT factor = value.coeffs[0];
  if (factor == 0 || value.coeffs[1] != factor)
  /* FACTOR counts VG / 2, so a value of 2 is one predicate width
     and a value of 16 is one vector width.  */
  return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
          || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
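/* Illustrative example, not from the original source: since FACTOR counts
   units of VG / 2, a poly_int64 of (16, 16) is one full vector width
   ("addvl ..., #1"), (2, 2) is one predicate width ("addpl ..., #1") and
   (48, 48) would be "addvl ..., #3"; a factor of 520 is rejected because
   it lies outside both the ADDVL range of [-32, 31] vector widths and the
   ADDPL range of [-32, 31] predicate widths.  */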
/* Likewise for rtx X.  */
aarch64_sve_addvl_addpl_immediate_p (rtx x)
  return (poly_int_rtx_p (x, &value)
          && aarch64_sve_addvl_addpl_immediate_p (value));

/* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
   and storing the result in operand 0.  */
aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
  static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
  poly_int64 offset_value = rtx_to_poly_int64 (offset);
  gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));

  /* Use INC or DEC if possible.  */
  if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
      if (aarch64_sve_cnt_immediate_p (offset_value))
        return aarch64_output_sve_cnt_immediate ("inc", "%x0",
                                                 offset_value.coeffs[1], 0);
      if (aarch64_sve_cnt_immediate_p (-offset_value))
        return aarch64_output_sve_cnt_immediate ("dec", "%x0",
                                                 -offset_value.coeffs[1], 0);

  int factor = offset_value.coeffs[1];
  if ((factor & 15) == 0)
    snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
  else
    snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
/* Return true if X is a valid immediate for an SVE vector INC or DEC
   instruction.  If it is, store the number of elements in each vector
   quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
   factor in *FACTOR_OUT (if nonnull).  */
aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
                                 unsigned int *nelts_per_vq_out)
  if (!const_vec_duplicate_p (x, &elt)
      || !poly_int_rtx_p (elt, &value))

  unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
  if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
    /* There's no vector INCB.  */

  HOST_WIDE_INT factor = value.coeffs[0];
  if (value.coeffs[1] != factor)

  /* The coefficient must be [1, 16] * NELTS_PER_VQ.  */
  if ((factor % nelts_per_vq) != 0
      || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))

    *factor_out = factor;
  if (nelts_per_vq_out)
    *nelts_per_vq_out = nelts_per_vq;
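/* Illustrative example, not from the original source: for a VNx4SI vector
   (four 32-bit elements per 128-bit granule), a duplicated poly_int64
   element of (8, 8) passes the checks above with *FACTOR_OUT == 8 and
   *NELTS_PER_VQ_OUT == 4, corresponding to an "incw"/"decw" with a
   multiplier of 2; a duplicated value of (6, 6) is rejected because 6 is
   not a multiple of 4.  */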
/* Return true if X is a valid immediate for an SVE vector INC or DEC
   instruction.  */
aarch64_sve_inc_dec_immediate_p (rtx x)
  return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);

/* Return the asm template for an SVE vector INC or DEC instruction.
   OPERANDS gives the operands before the vector count and X is the
   value of the vector count operand itself.  */
aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
  unsigned int nelts_per_vq;
  if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
    return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
                                             nelts_per_vq);
    return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
                                             nelts_per_vq);
aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
                                scalar_int_mode mode)
  unsigned HOST_WIDE_INT val, val2, mask;
  int one_match, zero_match;

  if (aarch64_move_imm (val, mode))
      emit_insn (gen_rtx_SET (dest, imm));

  /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
     (with XXXX non-zero).  In that case check to see if the move can be
     done in a smaller mode.  */
  val2 = val & 0xffffffff;
      && aarch64_move_imm (val2, SImode)
      && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
        emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));

      /* Check if we have to emit a second instruction by checking to see
         if any of the upper 32 bits of the original DI mode value is set.  */

      i = (val >> 48) ? 48 : 32;
        emit_insn (gen_insv_immdi (dest, GEN_INT (i),
                                   GEN_INT ((val >> i) & 0xffff)));

  if ((val >> 32) == 0 || mode == SImode)
        emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
          emit_insn (gen_insv_immsi (dest, GEN_INT (16),
                                     GEN_INT ((val >> 16) & 0xffff)));
          emit_insn (gen_insv_immdi (dest, GEN_INT (16),
                                     GEN_INT ((val >> 16) & 0xffff)));

  /* Remaining cases are all for DImode.  */

  zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
    ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
  one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
    ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);

  if (zero_match != 2 && one_match != 2)
      /* Try emitting a bitmask immediate with a movk replacing 16 bits.
         For a 64-bit bitmask try whether changing 16 bits to all ones or
         zeroes creates a valid bitmask.  To check any repeated bitmask,
         try using 16 bits from the other 32-bit half of val.  */

      for (i = 0; i < 64; i += 16, mask <<= 16)
          if (val2 != val && aarch64_bitmask_imm (val2, mode))
          if (val2 != val && aarch64_bitmask_imm (val2, mode))
          val2 = val2 & ~mask;
          val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
          if (val2 != val && aarch64_bitmask_imm (val2, mode))
          emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
          emit_insn (gen_insv_immdi (dest, GEN_INT (i),
                                     GEN_INT ((val >> i) & 0xffff)));

  /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
     are emitted by the initial mov.  If one_match > zero_match, skip set
     bits, otherwise skip zero bits.  */

  val2 = one_match > zero_match ? ~val : val;
  i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;

    emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
                                           ? (val | ~(mask << i))
                                           : (val & (mask << i)))));
  for (i += 16; i < 64; i += 16)
      if ((val2 & (mask << i)) == 0)
        emit_insn (gen_insv_immdi (dest, GEN_INT (i),
                                   GEN_INT ((val >> i) & 0xffff)));
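/* Illustrative example, not from the original source: a value such as
   0x1234000000005678 has two zero 16-bit chunks, so the fall-through code
   above emits a MOV of 0x5678 followed by a single MOVK of 0x1234 into
   bits [63:48]; a value like 0xffff00001234ffff is instead handled as the
   inverted pattern, starting from a MOVN-style constant and patching the
   remaining chunks with MOVK.  */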
/* Return whether imm is a 128-bit immediate which is simple enough to
   expand inline.  */
aarch64_mov128_immediate (rtx imm)
  if (GET_CODE (imm) == CONST_INT)

  gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);

  rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
  rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));

  return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
         + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;

/* Return the number of temporary registers that aarch64_add_offset_1
   would need to add OFFSET to a register.  */
aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
  return abs_hwi (offset) < 0x1000000 ? 0 : 1;
/* A subroutine of aarch64_add_offset.  Set DEST to SRC + OFFSET for
   a non-polynomial OFFSET.  MODE is the mode of the addition.
   FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
   be set and CFA adjustments added to the generated instructions.

   TEMP1, if nonnull, is a register of mode MODE that can be used as a
   temporary if register allocation is already complete.  This temporary
   register may overlap DEST but must not overlap SRC.  If TEMP1 is known
   to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
   the immediate again.

   Since this function may be used to adjust the stack pointer, we must
   ensure that it cannot cause transient stack deallocation (for example
   by first incrementing SP and then decrementing when adjusting by a
   large immediate).  */
aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
                      rtx src, HOST_WIDE_INT offset, rtx temp1,
                      bool frame_related_p, bool emit_move_imm)
  gcc_assert (emit_move_imm || temp1 != NULL_RTX);
  gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));

  HOST_WIDE_INT moffset = abs_hwi (offset);

      if (!rtx_equal_p (dest, src))
          insn = emit_insn (gen_rtx_SET (dest, src));
          RTX_FRAME_RELATED_P (insn) = frame_related_p;

  /* Single instruction adjustment.  */
  if (aarch64_uimm12_shift (moffset))
      insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
      RTX_FRAME_RELATED_P (insn) = frame_related_p;

  /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
     and either:

     a) the offset cannot be loaded by a 16-bit move or
     b) there is no spare register into which we can move it.  */
  if (moffset < 0x1000000
      && ((!temp1 && !can_create_pseudo_p ())
          || !aarch64_move_imm (moffset, mode)))
      HOST_WIDE_INT low_off = moffset & 0xfff;

      low_off = offset < 0 ? -low_off : low_off;
      insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
      RTX_FRAME_RELATED_P (insn) = frame_related_p;
      insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
      RTX_FRAME_RELATED_P (insn) = frame_related_p;

  /* Emit a move immediate if required and an addition/subtraction.  */
      gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
      temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
  insn = emit_insn (offset < 0
                    ? gen_sub3_insn (dest, src, temp1)
                    : gen_add3_insn (dest, src, temp1));
  if (frame_related_p)
      RTX_FRAME_RELATED_P (insn) = frame_related_p;
      rtx adj = plus_constant (mode, src, offset);
      add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
/* Return the number of temporary registers that aarch64_add_offset
   would need to move OFFSET into a register or add OFFSET to a register;
   ADD_P is true if we want the latter rather than the former.  */
aarch64_offset_temporaries (bool add_p, poly_int64 offset)
  /* This follows the same structure as aarch64_add_offset.  */
  if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))

  unsigned int count = 0;
  HOST_WIDE_INT factor = offset.coeffs[1];
  HOST_WIDE_INT constant = offset.coeffs[0] - factor;
  poly_int64 poly_offset (factor, factor);
  if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
    /* Need one register for the ADDVL/ADDPL result.  */
  else if (factor != 0)
      factor = abs (factor);
      if (factor > 16 * (factor & -factor))
        /* Need one register for the CNT result and one for the multiplication
           factor.  If necessary, the second temporary can be reused for the
           constant part of the offset.  */
        /* Need one register for the CNT result (which might then ...).  */
  return count + aarch64_add_offset_1_temporaries (constant);

/* If X can be represented as a poly_int64, return the number
   of temporaries that are required to add it to a register.
   Return -1 otherwise.  */
aarch64_add_offset_temporaries (rtx x)
  if (!poly_int_rtx_p (x, &offset))
  return aarch64_offset_temporaries (true, offset);
/* Set DEST to SRC + OFFSET.  MODE is the mode of the addition.
   FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
   be set and CFA adjustments added to the generated instructions.

   TEMP1, if nonnull, is a register of mode MODE that can be used as a
   temporary if register allocation is already complete.  This temporary
   register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
   If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
   false to avoid emitting the immediate again.

   TEMP2, if nonnull, is a second temporary register that doesn't
   overlap either DEST or REG.

   Since this function may be used to adjust the stack pointer, we must
   ensure that it cannot cause transient stack deallocation (for example
   by first incrementing SP and then decrementing when adjusting by a
   large immediate).  */
aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
                    poly_int64 offset, rtx temp1, rtx temp2,
                    bool frame_related_p, bool emit_move_imm = true)
  gcc_assert (emit_move_imm || temp1 != NULL_RTX);
  gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
  gcc_assert (temp1 == NULL_RTX
              || !reg_overlap_mentioned_p (temp1, dest));
  gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));

  /* Try using ADDVL or ADDPL to add the whole value.  */
  if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
      rtx offset_rtx = gen_int_mode (offset, mode);
      rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
      RTX_FRAME_RELATED_P (insn) = frame_related_p;

  /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
     SVE vector register, over and above the minimum size of 128 bits.
     This is equivalent to half the value returned by CNTD with a
     vector shape of ALL.  */
  HOST_WIDE_INT factor = offset.coeffs[1];
  HOST_WIDE_INT constant = offset.coeffs[0] - factor;

  /* Try using ADDVL or ADDPL to add the VG-based part.  */
  poly_int64 poly_offset (factor, factor);
  if (src != const0_rtx
      && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
      rtx offset_rtx = gen_int_mode (poly_offset, mode);
      if (frame_related_p)
          rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
          RTX_FRAME_RELATED_P (insn) = true;
          rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
          src = aarch64_force_temporary (mode, temp1, addr);
  /* Otherwise use a CNT-based sequence.  */
  else if (factor != 0)
      /* Use a subtraction if we have a negative factor.  */
      rtx_code code = PLUS;

      /* Calculate CNTD * FACTOR / 2.  First try to fold the division
         into the multiplication.  */
          /* Use a right shift by 1.  */
      HOST_WIDE_INT low_bit = factor & -factor;
      if (factor <= 16 * low_bit)
          if (factor > 16 * 8)
              /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
                 the value with the minimum multiplier and shift it into
                 position.  */
              int extra_shift = exact_log2 (low_bit);
              shift += extra_shift;
              factor >>= extra_shift;
          val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
          /* Use CNTD, then multiply it by FACTOR.  */
          val = gen_int_mode (poly_int64 (2, 2), mode);
          val = aarch64_force_temporary (mode, temp1, val);

          /* Go back to using a negative multiplication factor if we have
             no register from which to subtract.  */
          if (code == MINUS && src == const0_rtx)
          rtx coeff1 = gen_int_mode (factor, mode);
          coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
          val = gen_rtx_MULT (mode, val, coeff1);

          /* Multiply by 1 << SHIFT.  */
          val = aarch64_force_temporary (mode, temp1, val);
          val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
      else if (shift == -1)
          val = aarch64_force_temporary (mode, temp1, val);
          val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);

      /* Calculate SRC +/- CNTD * FACTOR / 2.  */
      if (src != const0_rtx)
          val = aarch64_force_temporary (mode, temp1, val);
          val = gen_rtx_fmt_ee (code, mode, src, val);
      else if (code == MINUS)
          val = aarch64_force_temporary (mode, temp1, val);
          val = gen_rtx_NEG (mode, val);

      if (constant == 0 || frame_related_p)
          rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
          if (frame_related_p)
              RTX_FRAME_RELATED_P (insn) = true;
              add_reg_note (insn, REG_CFA_ADJUST_CFA,
                            gen_rtx_SET (dest, plus_constant (Pmode, src,
          src = aarch64_force_temporary (mode, temp1, val);
      emit_move_imm = true;

  aarch64_add_offset_1 (mode, dest, src, constant, temp1,
                        frame_related_p, emit_move_imm);
/* Like aarch64_add_offset, but the offset is given as an rtx rather
   than a poly_int64.  */
aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
                          rtx offset_rtx, rtx temp1, rtx temp2)
  aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
                      temp1, temp2, false);

/* Add DELTA to the stack pointer, marking the instructions frame-related.
   TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
   if TEMP1 already contains abs (DELTA).  */
aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
  aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
                      temp1, temp2, true, emit_move_imm);

/* Subtract DELTA from the stack pointer, marking the instructions
   frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
   if nonnull.  */
aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
                bool emit_move_imm = true)
  aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
                      temp1, temp2, frame_related_p, emit_move_imm);
/* Set DEST to (vec_series BASE STEP).  */
aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
  machine_mode mode = GET_MODE (dest);
  scalar_mode inner = GET_MODE_INNER (mode);

  /* Each operand can be a register or an immediate in the range [-16, 15].  */
  if (!aarch64_sve_index_immediate_p (base))
    base = force_reg (inner, base);
  if (!aarch64_sve_index_immediate_p (step))
    step = force_reg (inner, step);

  emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
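/* Illustrative example, not from the original source: expanding
   (vec_series 0 1) for VNx4SImode produces a single
   "index z0.s, #0, #1" instruction; if BASE or STEP were outside the
   [-16, 15] immediate range they would first be forced into registers
   and the register forms of INDEX used instead.  */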
/* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
   register of mode MODE.  Use TARGET for the result if it's nonnull
   and convenient.

   The two vector modes must have the same element mode.  The behavior
   is to duplicate architectural lane N of SRC into architectural lanes
   N + I * STEP of the result.  On big-endian targets, architectural
   lane 0 of an Advanced SIMD vector is the last element of the vector
   in memory layout, so for big-endian targets this operation has the
   effect of reversing SRC before duplicating it.  Callers need to
   account for this.  */
aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
  machine_mode src_mode = GET_MODE (src);
  gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
  insn_code icode = (BYTES_BIG_ENDIAN
                     ? code_for_aarch64_vec_duplicate_vq_be (mode)
                     : code_for_aarch64_vec_duplicate_vq_le (mode));

  expand_operand ops[3];
  create_output_operand (&ops[i++], target, mode);
  create_output_operand (&ops[i++], src, src_mode);
  if (BYTES_BIG_ENDIAN)
      /* Create a PARALLEL describing the reversal of SRC.  */
      unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
      rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
                                                  nelts_per_vq - 1, -1);
      create_fixed_operand (&ops[i++], sel);
  expand_insn (icode, i, ops);
  return ops[0].value;
/* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
   the memory image into DEST.  Return true on success.  */
aarch64_expand_sve_ld1rq (rtx dest, rtx src)
  src = force_const_mem (GET_MODE (src), src);

  /* Make sure that the address is legitimate.  */
  if (!aarch64_sve_ld1rq_operand_p (src))
      rtx addr = force_reg (Pmode, XEXP (src, 0));
      src = replace_equiv_address (src, addr);

  machine_mode mode = GET_MODE (dest);
  unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
  machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
  rtx ptrue = aarch64_ptrue_reg (pred_mode);
  emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
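/* Illustrative example, not from the original source: for a VNx2DI
   constant that repeats the 128-bit value { 10, 11 }, the helper above
   places the two doublewords in the constant pool and emits something
   along the lines of "ld1rqd z0.d, p0/z, [xN]", replicating the quadword
   across the whole SVE register.  */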
/* Return a register containing CONST_VECTOR SRC, given that SRC has an
   SVE data mode and isn't a legitimate constant.  Use TARGET for the
   result if convenient.

   The returned register can have whatever mode seems most natural
   given the contents of SRC.  */
aarch64_expand_sve_const_vector (rtx target, rtx src)
  machine_mode mode = GET_MODE (src);
  unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
  unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
  scalar_mode elt_mode = GET_MODE_INNER (mode);
  unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
  unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits;

  if (nelts_per_pattern == 1 && encoded_bits == 128)
      /* The constant is a duplicated quadword but can't be narrowed
         beyond a quadword.  Get the memory image of the first quadword
         as a 128-bit vector and try using LD1RQ to load it from memory.

         The effect for both endiannesses is to load memory lane N into
         architectural lanes N + I * STEP of the result.  On big-endian
         targets, the layout of the 128-bit vector in an Advanced SIMD
         register would be different from its layout in an SVE register,
         but this 128-bit vector is a memory value only.  */
      machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
      rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
      if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))

  if (nelts_per_pattern == 1 && encoded_bits < 128)
      /* The vector is a repeating sequence of 64 bits or fewer.
         See if we can load them using an Advanced SIMD move and then
         duplicate it to fill a vector.  This is better than using a GPR
         move because it keeps everything in the same register file.  */
      machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
      rtx_vector_builder builder (vq_mode, npatterns, 1);
      for (unsigned int i = 0; i < npatterns; ++i)
          /* We want memory lane N to go into architectural lane N,
             so reverse for big-endian targets.  The DUP .Q pattern
             has a compensating reverse built-in.  */
          unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
          builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
      rtx vq_src = builder.build ();
      if (aarch64_simd_valid_immediate (vq_src, NULL))
          vq_src = force_reg (vq_mode, vq_src);
          return aarch64_expand_sve_dupq (target, mode, vq_src);

      /* Get an integer representation of the repeating part of Advanced
         SIMD vector VQ_SRC.  This preserves the endianness of VQ_SRC,
         which for big-endian targets is lane-swapped wrt a normal
         Advanced SIMD vector.  This means that for both endiannesses,
         memory lane N of SVE vector SRC corresponds to architectural
         lane N of a register holding VQ_SRC.  This in turn means that
         memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
         as a single 128-bit value) and thus that memory lane 0 of SRC is
         in the lsb of the integer.  Duplicating the integer therefore
         ensures that memory lane N of SRC goes into architectural lane
         N + I * INDEX of the SVE register.  */
      scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
      rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);

          /* Pretend that we had a vector of INT_MODE to start with.  */
          elt_mode = int_mode;
          mode = aarch64_full_sve_mode (int_mode).require ();

          /* If the integer can be moved into a general register by a
             single instruction, do that and duplicate the result.  */
          if (CONST_INT_P (elt_value)
              && aarch64_move_imm (INTVAL (elt_value), elt_mode))
              elt_value = force_reg (elt_mode, elt_value);
              return expand_vector_broadcast (mode, elt_value);
      else if (npatterns == 1)
        /* We're duplicating a single value, but can't do better than
           force it to memory and load from there.  This handles things
           like symbolic constants.  */
        elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);

          /* Load the element from memory if we can, otherwise move it into
             a register and use a DUP.  */
          rtx op = force_const_mem (elt_mode, elt_value);
            op = force_reg (elt_mode, elt_value);
          return expand_vector_broadcast (mode, op);

  /* Try using INDEX.  */
  if (const_vec_series_p (src, &base, &step))
      aarch64_expand_vec_series (target, base, step);

  /* From here on, it's better to force the whole constant to memory
     if we can.  */
  if (GET_MODE_NUNITS (mode).is_constant ())

  /* Expand each pattern individually.  */
  gcc_assert (npatterns > 1);
  rtx_vector_builder builder;
  auto_vec<rtx, 16> vectors (npatterns);
  for (unsigned int i = 0; i < npatterns; ++i)
      builder.new_vector (mode, 1, nelts_per_pattern);
      for (unsigned int j = 0; j < nelts_per_pattern; ++j)
        builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
      vectors.quick_push (force_reg (mode, builder.build ()));

  /* Use permutes to interleave the separate vectors.  */
  while (npatterns > 1)
      for (unsigned int i = 0; i < npatterns; ++i)
          rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
          rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
          emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
  gcc_assert (vectors[0] == target);
/* Use WHILE to set a predicate register of mode MODE in which the first
   VL bits are set and the rest are clear.  Use TARGET for the register
   if it's nonnull and convenient.  */
aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
                                 unsigned int vl)
  rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
  target = aarch64_target_reg (target, mode);
  emit_insn (gen_while_ult (DImode, mode, target, const0_rtx, limit));

/* Subroutine of aarch64_expand_sve_const_pred.  Try to load the VNx16BI
   constant in BUILDER into an SVE predicate register.  Return the register
   on success, otherwise return null.  Use TARGET for the register if
   nonnull and convenient.  */
aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder)
  if (builder.encoded_nelts () == 1)
    /* A PFALSE or a PTRUE .B ALL.  */
    return aarch64_emit_set_immediate (target, builder);

  unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
  if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
      /* If we can load the constant using PTRUE, use it as-is.  */
      machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
      if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
        return aarch64_emit_set_immediate (target, builder);

      /* Otherwise use WHILE to set the first VL bits.  */
      return aarch64_sve_move_pred_via_while (target, mode, vl);
/* Return an SVE predicate register that contains the VNx16BImode
   constant in BUILDER, without going through the move expanders.

   The returned register can have whatever mode seems most natural
   given the contents of BUILDER.  Use TARGET for the result if
   it's nonnull and convenient.  */
aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
  /* Try loading the constant using pure predicate operations.  */
  if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder))

  /* Try forcing the constant to memory.  */
  if (builder.full_nelts ().is_constant ())
    if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
        target = aarch64_target_reg (target, VNx16BImode);
        emit_move_insn (target, mem);

  /* The last resort is to load the constant as an integer and then
     compare it against zero.  Use -1 for set bits in order to increase
     the chances of using SVE DUPM or an Advanced SIMD byte mask.  */
  rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
                                  builder.nelts_per_pattern ());
  for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
    int_builder.quick_push (INTVAL (builder.elt (i))
                            ? constm1_rtx : const0_rtx);
  return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
                                           int_builder.build ());
/* Set DEST to immediate IMM.  */
aarch64_expand_mov_immediate (rtx dest, rtx imm)
  machine_mode mode = GET_MODE (dest);

  /* Check on what type of symbol it is.  */
  scalar_int_mode int_mode;
  if ((GET_CODE (imm) == SYMBOL_REF
       || GET_CODE (imm) == LABEL_REF
       || GET_CODE (imm) == CONST
       || GET_CODE (imm) == CONST_POLY_INT)
      && is_a <scalar_int_mode> (mode, &int_mode))
      HOST_WIDE_INT const_offset;
      enum aarch64_symbol_type sty;

      /* If we have (const (plus symbol offset)), separate out the offset
         before we start classifying the symbol.  */
      rtx base = strip_offset (imm, &offset);

      /* We must always add an offset involving VL separately, rather than
         folding it into the relocation.  */
      if (!offset.is_constant (&const_offset))
          if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
              emit_insn (gen_rtx_SET (dest, imm));

          /* Do arithmetic on 32-bit values if the result is smaller
             than that.  */
          if (partial_subreg_p (int_mode, SImode))
              /* It is invalid to do symbol calculations in modes
                 narrower than SImode.  */
              gcc_assert (base == const0_rtx);
              dest = gen_lowpart (SImode, dest);
          if (base != const0_rtx)
              base = aarch64_force_temporary (int_mode, dest, base);
              aarch64_add_offset (int_mode, dest, base, offset,
                                  NULL_RTX, NULL_RTX, false);
            aarch64_add_offset (int_mode, dest, base, offset,
                                dest, NULL_RTX, false);

      sty = aarch64_classify_symbol (base, const_offset);
        case SYMBOL_FORCE_TO_MEM:
          if (const_offset != 0
              && targetm.cannot_force_const_mem (int_mode, imm))
              gcc_assert (can_create_pseudo_p ());
              base = aarch64_force_temporary (int_mode, dest, base);
              aarch64_add_offset (int_mode, dest, base, const_offset,
                                  NULL_RTX, NULL_RTX, false);

          mem = force_const_mem (ptr_mode, imm);

          /* If we aren't generating PC relative literals, then
             we need to expand the literal pool access carefully.
             This is something that needs to be done in a number
             of places, so could well live as a separate function.  */
          if (!aarch64_pcrelative_literal_loads)
              gcc_assert (can_create_pseudo_p ());
              base = gen_reg_rtx (ptr_mode);
              aarch64_expand_mov_immediate (base, XEXP (mem, 0));
              if (ptr_mode != Pmode)
                base = convert_memory_address (Pmode, base);
              mem = gen_rtx_MEM (ptr_mode, base);

          if (int_mode != ptr_mode)
            mem = gen_rtx_ZERO_EXTEND (int_mode, mem);

          emit_insn (gen_rtx_SET (dest, mem));

        case SYMBOL_SMALL_TLSGD:
        case SYMBOL_SMALL_TLSDESC:
        case SYMBOL_SMALL_TLSIE:
        case SYMBOL_SMALL_GOT_28K:
        case SYMBOL_SMALL_GOT_4G:
        case SYMBOL_TINY_GOT:
        case SYMBOL_TINY_TLSIE:
          if (const_offset != 0)
              gcc_assert (can_create_pseudo_p ());
              base = aarch64_force_temporary (int_mode, dest, base);
              aarch64_add_offset (int_mode, dest, base, const_offset,
                                  NULL_RTX, NULL_RTX, false);

        case SYMBOL_SMALL_ABSOLUTE:
        case SYMBOL_TINY_ABSOLUTE:
        case SYMBOL_TLSLE12:
        case SYMBOL_TLSLE24:
        case SYMBOL_TLSLE32:
        case SYMBOL_TLSLE48:
          aarch64_load_symref_appropriately (dest, imm, sty);

  if (!CONST_INT_P (imm))
      if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
          /* Only the low bit of each .H, .S and .D element is defined,
             so we can set the upper bits to whatever we like.  If the
             predicate is all-true in MODE, prefer to set all the undefined
             bits as well, so that we can share a single .B predicate for
             all modes.  */
          if (imm == CONSTM1_RTX (mode))
            imm = CONSTM1_RTX (VNx16BImode);

          /* All methods for constructing predicate modes wider than VNx16BI
             will set the upper bits of each element to zero.  Expose this
             by moving such constants as a VNx16BI, so that all bits are
             significant and so that constants for different modes can be
             shared.  The wider constant will still be available as a
             REG_EQUAL note.  */
          rtx_vector_builder builder;
          if (aarch64_get_sve_pred_bits (builder, imm))
              rtx res = aarch64_expand_sve_const_pred (dest, builder);
                emit_move_insn (dest, gen_lowpart (mode, res));

      if (GET_CODE (imm) == HIGH
          || aarch64_simd_valid_immediate (imm, NULL))
          emit_insn (gen_rtx_SET (dest, imm));

      if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
        if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
            emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));

      rtx mem = force_const_mem (mode, imm);
      emit_move_insn (dest, mem);

  aarch64_internal_mov_immediate (dest, imm, true,
                                  as_a <scalar_int_mode> (mode));
/* Emit an SVE predicated move from SRC to DEST.  PRED is a predicate
   that is known to contain PTRUE.  */
aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
  expand_operand ops[3];
  machine_mode mode = GET_MODE (dest);
  create_output_operand (&ops[0], dest, mode);
  create_input_operand (&ops[1], pred, GET_MODE (pred));
  create_input_operand (&ops[2], src, mode);
  temporary_volatile_ok v (true);
  expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);

/* Expand a pre-RA SVE data move from SRC to DEST in which at least one
   operand is in memory.  In this case we need to use the predicated LD1
   and ST1 instead of LDR and STR, both for correctness on big-endian
   targets and because LD1 and ST1 support a wider range of addressing modes.
   PRED_MODE is the mode of the predicate.

   See the comment at the head of aarch64-sve.md for details about the
   big-endian handling.  */
aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
  machine_mode mode = GET_MODE (dest);
  rtx ptrue = aarch64_ptrue_reg (pred_mode);
  if (!register_operand (src, mode)
      && !register_operand (dest, mode))
      rtx tmp = gen_reg_rtx (mode);
        aarch64_emit_sve_pred_move (tmp, ptrue, src);
        emit_move_insn (tmp, src);
  aarch64_emit_sve_pred_move (dest, ptrue, src);
/* Called only on big-endian targets.  See whether an SVE vector move
   from SRC to DEST is effectively a REV[BHW] instruction, because at
   least one operand is a subreg of an SVE vector that has wider or
   narrower elements.  Return true and emit the instruction if so.

   For example:

     (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))

   represents a VIEW_CONVERT between the following vectors, viewed
   in memory order:

     R2: { [0].high, [0].low,  [1].high, [1].low, ... }
     R1: { [0],      [1],      [2],      [3],     ... }

   The high part of lane X in R2 should therefore correspond to lane X*2
   of R1, but the register representations are:

     R2: ...... [1].high [1].low   [0].high [0].low
     R1: ...... [3]      [2]       [1]      [0]

   where the low part of lane X in R2 corresponds to lane X*2 in R1.
   We therefore need a reverse operation to swap the high and low values
   around.

   This is purely an optimization.  Without it we would spill the
   subreg operand to the stack in one mode and reload it in the
   other mode, which has the same effect as the REV.  */
aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
  gcc_assert (BYTES_BIG_ENDIAN);
  if (GET_CODE (dest) == SUBREG)
    dest = SUBREG_REG (dest);
  if (GET_CODE (src) == SUBREG)
    src = SUBREG_REG (src);

  /* The optimization handles two single SVE REGs with different element
     sizes.  */
      || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
      || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
      || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
          == GET_MODE_UNIT_SIZE (GET_MODE (src))))

  /* Generate *aarch64_sve_mov<mode>_subreg_be.  */
  rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
  rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
  emit_insn (gen_rtx_SET (dest, unspec));
4056 /* Return a copy of X with mode MODE, without changing its other
4057 attributes. Unlike gen_lowpart, this doesn't care whether the
4058 mode change is valid. */
4061 aarch64_replace_reg_mode (rtx x
, machine_mode mode
)
4063 if (GET_MODE (x
) == mode
)
4066 x
= shallow_copy_rtx (x
);
4067 set_mode_and_regno (x
, mode
, REGNO (x
));
/* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
   operands.  */

void
aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
{
  /* Decide which REV operation we need.  The mode with narrower elements
     determines the mode of the operands and the mode with the wider
     elements determines the reverse width.  */
  machine_mode mode_with_wider_elts = GET_MODE (dest);
  machine_mode mode_with_narrower_elts = GET_MODE (src);
  if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
      < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
    std::swap (mode_with_wider_elts, mode_with_narrower_elts);

  unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
  unsigned int unspec;
  if (wider_bytes == 8)
    unspec = UNSPEC_REV64;
  else if (wider_bytes == 4)
    unspec = UNSPEC_REV32;
  else if (wider_bytes == 2)
    unspec = UNSPEC_REV16;
  else
    gcc_unreachable ();
  machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();

  /* Emit:

       (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
                  UNSPEC_MERGE_PTRUE))

     with the appropriate modes.  */
  ptrue = gen_lowpart (pred_mode, ptrue);
  dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
  src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
  src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
  src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
                        UNSPEC_MERGE_PTRUE);
  emit_insn (gen_rtx_SET (dest, src));
}
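/* For example, for the big-endian subreg move

     (set (reg:VNx8HI z0) (subreg:VNx8HI (reg:VNx16QI z1) 0))

   the wider element size is 2 bytes, so the split above picks
   UNSPEC_REV16 and rewrites both operands in VNx16QI, i.e. a predicated
   byte reversal within each halfword.  (Illustrative sketch only; the
   concrete instruction is chosen by the aarch64-sve.md patterns.)  */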
/* Implement TARGET_FUNCTION_OK_FOR_SIBCALL.  */

static bool
aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
                                 tree exp ATTRIBUTE_UNUSED)
{
  if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
    return false;

  return true;
}
/* Implement TARGET_PASS_BY_REFERENCE.  */

static bool
aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
                           machine_mode mode,
                           const_tree type,
                           bool named ATTRIBUTE_UNUSED)
{
  HOST_WIDE_INT size;
  machine_mode dummymode;
  int nregs;

  /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
  if (mode == BLKmode && type)
    size = int_size_in_bytes (type);
  else
    /* No frontends can create types with variable-sized modes, so we
       shouldn't be asked to pass or return them.  */
    size = GET_MODE_SIZE (mode).to_constant ();

  /* Aggregates are passed by reference based on their size.  */
  if (type && AGGREGATE_TYPE_P (type))
    size = int_size_in_bytes (type);

  /* Variable sized arguments are always returned by reference.  */
  if (size < 0)
    return true;

  /* Can this be a candidate to be passed in fp/simd register(s)?  */
  if (aarch64_vfp_is_call_or_return_candidate (mode, type,
                                               &dummymode, &nregs, NULL))
    return false;

  /* Arguments which are variable sized or larger than 2 registers are
     passed by reference unless they are a homogeneous floating-point
     aggregate.  */
  return size > 2 * UNITS_PER_WORD;
}
/* Return TRUE if VALTYPE is padded to its least significant bits.  */

static bool
aarch64_return_in_msb (const_tree valtype)
{
  machine_mode dummy_mode;
  int dummy_int;

  /* Never happens in little-endian mode.  */
  if (!BYTES_BIG_ENDIAN)
    return false;

  /* Only composite types smaller than or equal to 16 bytes can
     be potentially returned in registers.  */
  if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
      || int_size_in_bytes (valtype) <= 0
      || int_size_in_bytes (valtype) > 16)
    return false;

  /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
     or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
     is always passed/returned in the least significant bits of fp/simd
     register(s).  */
  if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
                                               &dummy_mode, &dummy_int, NULL))
    return false;

  return true;
}
/* Implement TARGET_FUNCTION_VALUE.
   Define how to find the value returned by a function.  */

static rtx
aarch64_function_value (const_tree type, const_tree func,
                        bool outgoing ATTRIBUTE_UNUSED)
{
  machine_mode mode;
  int unsignedp;
  int count;
  machine_mode ag_mode;

  mode = TYPE_MODE (type);
  if (INTEGRAL_TYPE_P (type))
    mode = promote_function_mode (type, mode, &unsignedp, func, 1);

  if (aarch64_return_in_msb (type))
    {
      HOST_WIDE_INT size = int_size_in_bytes (type);

      if (size % UNITS_PER_WORD != 0)
        {
          size += UNITS_PER_WORD - size % UNITS_PER_WORD;
          mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
        }
    }

  if (aarch64_vfp_is_call_or_return_candidate (mode, type,
                                               &ag_mode, &count, NULL))
    {
      if (!aarch64_composite_type_p (type, mode))
        {
          gcc_assert (count == 1 && mode == ag_mode);
          return gen_rtx_REG (mode, V0_REGNUM);
        }
      else
        {
          int i;
          rtx par;

          par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
          for (i = 0; i < count; i++)
            {
              rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
              rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
              tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
              XVECEXP (par, 0, i) = tmp;
            }
          return par;
        }
    }
  else
    return gen_rtx_REG (mode, R0_REGNUM);
}
/* Implement TARGET_FUNCTION_VALUE_REGNO_P.
   Return true if REGNO is the number of a hard register in which the values
   of a called function may come back.  */

static bool
aarch64_function_value_regno_p (const unsigned int regno)
{
  /* Maximum of 16 bytes can be returned in the general registers.  Examples
     of 16-byte return values are: 128-bit integers and 16-byte small
     structures (excluding homogeneous floating-point aggregates).  */
  if (regno == R0_REGNUM || regno == R1_REGNUM)
    return true;

  /* Up to four fp/simd registers can return a function value, e.g. a
     homogeneous floating-point aggregate having four members.  */
  if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
    return TARGET_FLOAT;

  return false;
}
/* Implement TARGET_RETURN_IN_MEMORY.

   If the type T of the result of a function is such that
     void func (T arg)
   would require that arg be passed as a value in a register (or set of
   registers) according to the parameter passing rules, then the result
   is returned in the same registers as would be used for such an
   argument.  */

static bool
aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
{
  HOST_WIDE_INT size;
  machine_mode ag_mode;
  int count;

  if (!AGGREGATE_TYPE_P (type)
      && TREE_CODE (type) != COMPLEX_TYPE
      && TREE_CODE (type) != VECTOR_TYPE)
    /* Simple scalar types always returned in registers.  */
    return false;

  if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
                                               type,
                                               &ag_mode,
                                               &count,
                                               NULL))
    return false;

  /* Types larger than 2 registers returned in memory.  */
  size = int_size_in_bytes (type);
  return (size < 0 || size > 2 * UNITS_PER_WORD);
}
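/* For example, on LP64 a struct of three pointers (24 bytes) exceeds
   2 * UNITS_PER_WORD and is returned in memory, while a struct of two
   pointers (16 bytes) comes back in x0/x1 and an HFA of four doubles
   comes back in v0-v3.  (Illustrative only; the exact rules are the
   checks above.)  */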
static bool
aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
                               const_tree type, int *nregs)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  return aarch64_vfp_is_call_or_return_candidate (mode, type,
                                                  &pcum->aapcs_vfp_rmode,
                                                  nregs, NULL);
}
/* Given MODE and TYPE of a function argument, return the alignment in
   bits.  The idea is to suppress any stronger alignment requested by
   the user and opt for the natural alignment (specified in AAPCS64 \S
   4.1).  ABI_BREAK is set to true if the alignment was incorrectly
   calculated in versions of GCC prior to GCC-9.  This is a helper
   function for local use only.  */

static unsigned int
aarch64_function_arg_alignment (machine_mode mode, const_tree type,
                                bool *abi_break)
{
  *abi_break = false;
  if (!type)
    return GET_MODE_ALIGNMENT (mode);

  if (integer_zerop (TYPE_SIZE (type)))
    return 0;

  gcc_assert (TYPE_MODE (type) == mode);

  if (!AGGREGATE_TYPE_P (type))
    return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));

  if (TREE_CODE (type) == ARRAY_TYPE)
    return TYPE_ALIGN (TREE_TYPE (type));

  unsigned int alignment = 0;
  unsigned int bitfield_alignment = 0;
  for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
    if (TREE_CODE (field) == FIELD_DECL)
      {
        alignment = std::max (alignment, DECL_ALIGN (field));
        if (DECL_BIT_FIELD_TYPE (field))
          bitfield_alignment
            = std::max (bitfield_alignment,
                        TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
      }

  if (bitfield_alignment > alignment)
    {
      *abi_break = true;
      return bitfield_alignment;
    }

  return alignment;
}
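/* For illustration, consider a struct whose alignment is dominated by a
   bit-field with an over-aligned declared type, e.g.

     struct bf { unsigned __int128 x : 1; };

   The field itself needs little alignment, but the declared type of the
   bit-field is 16-byte aligned.  Releases before GCC 9 used the field's
   alignment here; GCC 9 uses the declared type's alignment and sets
   ABI_BREAK so callers can emit a -Wpsabi note.  (Illustrative sketch
   of the rule, not an exhaustive description.)  */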
/* Layout a function argument according to the AAPCS64 rules.  The rule
   numbers refer to the rule numbers in the AAPCS64.  */

static void
aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
                    const_tree type,
                    bool named ATTRIBUTE_UNUSED)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  int ncrn, nvrn, nregs;
  bool allocate_ncrn, allocate_nvrn;
  HOST_WIDE_INT size;
  bool abi_break;

  /* We need to do this once per argument.  */
  if (pcum->aapcs_arg_processed)
    return;

  pcum->aapcs_arg_processed = true;

  /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
  if (type)
    size = int_size_in_bytes (type);
  else
    /* No frontends can create types with variable-sized modes, so we
       shouldn't be asked to pass or return them.  */
    size = GET_MODE_SIZE (mode).to_constant ();
  size = ROUND_UP (size, UNITS_PER_WORD);

  allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
  allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v, mode, type, &nregs);

  /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
     The following code thus handles passing by SIMD/FP registers first.  */

  nvrn = pcum->aapcs_nvrn;

  /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
     and homogeneous short-vector aggregates (HVA).  */
  if (allocate_nvrn)
    {
      if (!TARGET_FLOAT)
        aarch64_err_no_fpadvsimd (mode);

      if (nvrn + nregs <= NUM_FP_ARG_REGS)
        {
          pcum->aapcs_nextnvrn = nvrn + nregs;
          if (!aarch64_composite_type_p (type, mode))
            {
              gcc_assert (nregs == 1);
              pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
            }
          else
            {
              rtx par;
              int i;
              par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
              for (i = 0; i < nregs; i++)
                {
                  rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
                                         V0_REGNUM + nvrn + i);
                  rtx offset = gen_int_mode
                    (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
                  tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
                  XVECEXP (par, 0, i) = tmp;
                }
              pcum->aapcs_reg = par;
            }
          return;
        }
      else
        {
          /* C.3 NSRN is set to 8.  */
          pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
          goto on_stack;
        }
    }

  ncrn = pcum->aapcs_ncrn;
  nregs = size / UNITS_PER_WORD;

  /* C6 - C9, though the sign and zero extension semantics are
     handled elsewhere.  This is the case where the argument fits
     entirely in general registers.  */
  if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
    {
      gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);

      /* C.8 if the argument has an alignment of 16 then the NGRN is
         rounded up to the next even number.  */
      if (nregs == 2
          && ncrn % 2
          /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
             comparison is there because for > 16 * BITS_PER_UNIT
             alignment nregs should be > 2 and therefore it should be
             passed by reference rather than value.  */
          && (aarch64_function_arg_alignment (mode, type, &abi_break)
              == 16 * BITS_PER_UNIT))
        {
          if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
            inform (input_location, "parameter passing for argument of type "
                    "%qT changed in GCC 9.1", type);
          ++ncrn;
          gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
        }

      /* NREGS can be 0 when e.g. an empty structure is to be passed.
         A reg is still generated for it, but the caller should be smart
         enough not to use it.  */
      if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
        pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
      else
        {
          rtx par;
          int i;

          par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
          for (i = 0; i < nregs; i++)
            {
              rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
              tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
                                       GEN_INT (i * UNITS_PER_WORD));
              XVECEXP (par, 0, i) = tmp;
            }
          pcum->aapcs_reg = par;
        }

      pcum->aapcs_nextncrn = ncrn + nregs;
      return;
    }

  /* C.11  */
  pcum->aapcs_nextncrn = NUM_ARG_REGS;

  /* The argument is passed on stack; record the needed number of words for
     this argument and align the total size if necessary.  */
on_stack:
  pcum->aapcs_stack_words = size / UNITS_PER_WORD;

  if (aarch64_function_arg_alignment (mode, type, &abi_break)
      == 16 * BITS_PER_UNIT)
    {
      int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
      if (pcum->aapcs_stack_size != new_size)
        {
          if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
            inform (input_location, "parameter passing for argument of type "
                    "%qT changed in GCC 9.1", type);
          pcum->aapcs_stack_size = new_size;
        }
    }
  return;
}
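/* As an illustration of rule C.8 above: an __int128 argument (16-byte
   aligned, two registers) passed after a single int has used x0 causes
   the NGRN to be rounded up from 1 to 2, so the value goes in x2/x3 and
   x1 is left unused.  (Illustrative only; the alignment used is the one
   computed by aarch64_function_arg_alignment.)  */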
/* Implement TARGET_FUNCTION_ARG.  */

static rtx
aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
                      const_tree type, bool named)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);

  if (mode == VOIDmode)
    return NULL_RTX;

  aarch64_layout_arg (pcum_v, mode, type, named);
  return pcum->aapcs_reg;
}
void
aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
                              const_tree fntype ATTRIBUTE_UNUSED,
                              rtx libname ATTRIBUTE_UNUSED,
                              const_tree fndecl ATTRIBUTE_UNUSED,
                              unsigned n_named ATTRIBUTE_UNUSED)
{
  pcum->aapcs_ncrn = 0;
  pcum->aapcs_nvrn = 0;
  pcum->aapcs_nextncrn = 0;
  pcum->aapcs_nextnvrn = 0;
  pcum->pcs_variant = ARM_PCS_AAPCS64;
  pcum->aapcs_reg = NULL_RTX;
  pcum->aapcs_arg_processed = false;
  pcum->aapcs_stack_words = 0;
  pcum->aapcs_stack_size = 0;

  if (!TARGET_FLOAT
      && fndecl && TREE_PUBLIC (fndecl)
      && fntype && fntype != error_mark_node)
    {
      const_tree type = TREE_TYPE (fntype);
      machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
      int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
      if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
                                                   &mode, &nregs, NULL))
        aarch64_err_no_fpadvsimd (TYPE_MODE (type));
    }
}
static void
aarch64_function_arg_advance (cumulative_args_t pcum_v,
                              machine_mode mode,
                              const_tree type,
                              bool named)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  if (pcum->pcs_variant == ARM_PCS_AAPCS64)
    {
      aarch64_layout_arg (pcum_v, mode, type, named);
      gcc_assert ((pcum->aapcs_reg != NULL_RTX)
                  != (pcum->aapcs_stack_words != 0));
      pcum->aapcs_arg_processed = false;
      pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
      pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
      pcum->aapcs_stack_size += pcum->aapcs_stack_words;
      pcum->aapcs_stack_words = 0;
      pcum->aapcs_reg = NULL_RTX;
    }
}
bool
aarch64_function_arg_regno_p (unsigned regno)
{
  return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
          || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
}
/* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
   PARM_BOUNDARY bits of alignment, but will be given anything up
   to STACK_BOUNDARY bits if the type requires it.  This makes sure
   that both before and after the layout of each argument, the Next
   Stacked Argument Address (NSAA) will have a minimum alignment of
   8 bytes.  */

static unsigned int
aarch64_function_arg_boundary (machine_mode mode, const_tree type)
{
  bool abi_break;
  unsigned int alignment = aarch64_function_arg_alignment (mode, type,
                                                           &abi_break);
  if (abi_break && warn_psabi)
    inform (input_location, "parameter passing for argument of type "
            "%qT changed in GCC 9.1", type);

  return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
}
/* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE.  */

static fixed_size_mode
aarch64_get_reg_raw_mode (int regno)
{
  if (TARGET_SVE && FP_REGNUM_P (regno))
    /* Don't use the SVE part of the register for __builtin_apply and
       __builtin_return.  The SVE registers aren't used by the normal PCS,
       so using them there would be a waste of time.  The PCS extensions
       for SVE types are fundamentally incompatible with the
       __builtin_return/__builtin_apply interface.  */
    return as_a <fixed_size_mode> (V16QImode);
  return default_get_reg_raw_mode (regno);
}
/* Implement TARGET_FUNCTION_ARG_PADDING.

   Small aggregate types are placed in the lowest memory address.

   The related parameter passing rules are B.4, C.3, C.5 and C.14.  */

static pad_direction
aarch64_function_arg_padding (machine_mode mode, const_tree type)
{
  /* On little-endian targets, the least significant byte of every stack
     argument is passed at the lowest byte address of the stack slot.  */
  if (!BYTES_BIG_ENDIAN)
    return PAD_UPWARD;

  /* Otherwise, integral, floating-point and pointer types are padded downward:
     the least significant byte of a stack argument is passed at the highest
     byte address of the stack slot.  */
  if (type
      ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
         || POINTER_TYPE_P (type))
      : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
    return PAD_DOWNWARD;

  /* Everything else padded upward, i.e. data in first byte of stack slot.  */
  return PAD_UPWARD;
}
/* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).

   It specifies padding for the last (may also be the only)
   element of a block move between registers and memory.  If
   assuming the block is in the memory, padding upward means that
   the last element is padded after its highest significant byte,
   while in downward padding, the last element is padded at its
   least significant byte side.

   Small aggregates and small complex types are always padded
   upwards.

   We don't need to worry about homogeneous floating-point or
   short-vector aggregates; their move is not affected by the
   padding direction determined here.  Regardless of endianness,
   each element of such an aggregate is put in the least
   significant bits of a fp/simd register.

   Return !BYTES_BIG_ENDIAN if the least significant byte of the
   register has useful data, and return the opposite if the most
   significant byte does.  */

bool
aarch64_pad_reg_upward (machine_mode mode, const_tree type,
                        bool first ATTRIBUTE_UNUSED)
{
  /* Small composite types are always padded upward.  */
  if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
    {
      HOST_WIDE_INT size;
      if (type)
        size = int_size_in_bytes (type);
      else
        /* No frontends can create types with variable-sized modes, so we
           shouldn't be asked to pass or return them.  */
        size = GET_MODE_SIZE (mode).to_constant ();
      if (size < 2 * UNITS_PER_WORD)
        return true;
    }

  /* Otherwise, use the default padding.  */
  return !BYTES_BIG_ENDIAN;
}
static scalar_int_mode
aarch64_libgcc_cmp_return_mode (void)
{
  return SImode;
}

#define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)

/* We use the 12-bit shifted immediate arithmetic instructions so values
   must be a multiple of (1 << 12), i.e. 4096.  */
#define ARITH_FACTOR 4096

#if (PROBE_INTERVAL % ARITH_FACTOR) != 0
#error Cannot use simple address calculation for stack probing
#endif

/* The pair of scratch registers used for stack probing.  */
#define PROBE_STACK_FIRST_REG  R9_REGNUM
#define PROBE_STACK_SECOND_REG R10_REGNUM
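/* For a small constant allocation the probe code below collapses to a
   single address computation and store; e.g. with FIRST == 4096 and
   SIZE == 2048 it is roughly:

        sub     x9, sp, #8192
        str     xzr, [x9, #2048]

   (Illustrative sketch only; larger sizes use the unrolled sequence or
   the probe_stack_range loop further down.)  */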
/* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
   inclusive.  These are offsets from the current stack pointer.  */

static void
aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
{
  HOST_WIDE_INT size;
  if (!poly_size.is_constant (&size))
    {
      sorry ("stack probes for SVE frames");
      return;
    }

  rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);

  /* See the same assertion on PROBE_INTERVAL above.  */
  gcc_assert ((first % ARITH_FACTOR) == 0);

  /* See if we have a constant small number of probes to generate.  If so,
     that's the easy case.  */
  if (size <= PROBE_INTERVAL)
    {
      const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);

      emit_set_insn (reg1,
                     plus_constant (Pmode,
                                    stack_pointer_rtx, -(first + base)));
      emit_stack_probe (plus_constant (Pmode, reg1, base - size));
    }

  /* The run-time loop is made up of 8 insns in the generic case while the
     compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
  else if (size <= 4 * PROBE_INTERVAL)
    {
      HOST_WIDE_INT i, rem;

      emit_set_insn (reg1,
                     plus_constant (Pmode,
                                    stack_pointer_rtx,
                                    -(first + PROBE_INTERVAL)));
      emit_stack_probe (reg1);

      /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
         it exceeds SIZE.  If only two probes are needed, this will not
         generate any code.  Then probe at FIRST + SIZE.  */
      for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
        {
          emit_set_insn (reg1,
                         plus_constant (Pmode, reg1, -PROBE_INTERVAL));
          emit_stack_probe (reg1);
        }

      rem = size - (i - PROBE_INTERVAL);
      if (rem > 256)
        {
          const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);

          emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
          emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
        }
      else
        emit_stack_probe (plus_constant (Pmode, reg1, -rem));
    }

  /* Otherwise, do the same as above, but in a loop.  Note that we must be
     extra careful with variables wrapping around because we might be at
     the very top (or the very bottom) of the address space and we have
     to be able to handle this case properly; in particular, we use an
     equality test for the loop condition.  */
  else
    {
      rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);

      /* Step 1: round SIZE to the previous multiple of the interval.  */

      HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;

      /* Step 2: compute initial and final value of the loop counter.  */

      /* TEST_ADDR = SP + FIRST.  */
      emit_set_insn (reg1,
                     plus_constant (Pmode, stack_pointer_rtx, -first));

      /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
      HOST_WIDE_INT adjustment = - (first + rounded_size);
      if (! aarch64_uimm12_shift (adjustment))
        {
          aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
                                          true, Pmode);
          emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
        }
      else
        emit_set_insn (reg2,
                       plus_constant (Pmode, stack_pointer_rtx, adjustment));

      /* Step 3: the loop

         do
           {
             TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
             probe at TEST_ADDR
           }
         while (TEST_ADDR != LAST_ADDR)

         probes at FIRST + N * PROBE_INTERVAL for values of N from 1
         until it is equal to ROUNDED_SIZE.  */

      emit_insn (gen_probe_stack_range (reg1, reg1, reg2));

      /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
         that SIZE is equal to ROUNDED_SIZE.  */

      if (size != rounded_size)
        {
          HOST_WIDE_INT rem = size - rounded_size;

          if (rem > 256)
            {
              const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);

              emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
              emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
            }
          else
            emit_stack_probe (plus_constant (Pmode, reg2, -rem));
        }
    }

  /* Make sure nothing is scheduled before we are done.  */
  emit_insn (gen_blockage ());
}
/* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
   absolute addresses.  */

const char *
aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
{
  static int labelno = 0;
  char loop_lab[32];
  rtx xops[2];

  ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);

  /* Loop.  */
  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);

  HOST_WIDE_INT stack_clash_probe_interval
    = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);

  /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
  xops[0] = reg1;
  HOST_WIDE_INT interval;
  if (flag_stack_clash_protection)
    interval = stack_clash_probe_interval;
  else
    interval = PROBE_INTERVAL;

  gcc_assert (aarch64_uimm12_shift (interval));
  xops[1] = GEN_INT (interval);

  output_asm_insn ("sub\t%0, %0, %1", xops);

  /* If doing stack clash protection then we probe up by the ABI specified
     amount.  We do this because we're dropping full pages at a time in the
     loop.  But if we're doing non-stack clash probing, probe at SP 0.  */
  if (flag_stack_clash_protection)
    xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
  else
    xops[1] = CONST0_RTX (GET_MODE (xops[1]));

  /* Probe at TEST_ADDR.  If we're inside the loop it is always safe to probe
     by this amount for each iteration.  */
  output_asm_insn ("str\txzr, [%0, %1]", xops);

  /* Test if TEST_ADDR == LAST_ADDR.  */
  xops[1] = reg2;
  output_asm_insn ("cmp\t%0, %1", xops);

  /* Branch.  */
  fputs ("\tb.ne\t", asm_out_file);
  assemble_name_raw (asm_out_file, loop_lab);
  fputc ('\n', asm_out_file);

  return "";
}
/* Emit the probe loop for doing stack clash probes and stack adjustments for
   SVE.  This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
   of GUARD_SIZE.  When a probe is emitted it is done at most
   MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
   at most MIN_PROBE_THRESHOLD.  By the end of this function
   BASE = BASE - ADJUSTMENT.  */

const char *
aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
                                      rtx min_probe_threshold, rtx guard_size)
{
  /* This function is not allowed to use any instruction generation function
     like gen_ and friends.  If you do you'll likely ICE during CFG validation,
     so instead emit the code you want using output_asm_insn.  */
  gcc_assert (flag_stack_clash_protection);
  gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
  gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));

  /* The minimum required allocation before the residual requires probing.  */
  HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);

  /* Clamp the value down to the nearest value that can be used with a cmp.  */
  residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
  rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);

  gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
  gcc_assert (aarch64_uimm12_shift (residual_probe_guard));

  static int labelno = 0;
  char loop_start_lab[32];
  char loop_end_lab[32];
  rtx xops[2];

  ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
  ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);

  /* Emit loop start label.  */
  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);

  /* ADJUSTMENT < RESIDUAL_PROBE_GUARD.  */
  xops[0] = adjustment;
  xops[1] = probe_offset_value_rtx;
  output_asm_insn ("cmp\t%0, %1", xops);

  /* Branch to end if not enough adjustment to probe.  */
  fputs ("\tb.lt\t", asm_out_file);
  assemble_name_raw (asm_out_file, loop_end_lab);
  fputc ('\n', asm_out_file);

  /* BASE = BASE - RESIDUAL_PROBE_GUARD.  */
  xops[0] = base;
  xops[1] = probe_offset_value_rtx;
  output_asm_insn ("sub\t%0, %0, %1", xops);

  /* Probe at BASE.  */
  xops[1] = const0_rtx;
  output_asm_insn ("str\txzr, [%0, %1]", xops);

  /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD.  */
  xops[0] = adjustment;
  xops[1] = probe_offset_value_rtx;
  output_asm_insn ("sub\t%0, %0, %1", xops);

  /* Branch to start if still more bytes to allocate.  */
  fputs ("\tb\t", asm_out_file);
  assemble_name_raw (asm_out_file, loop_start_lab);
  fputc ('\n', asm_out_file);

  /* No probe leave.  */
  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);

  /* BASE = BASE - ADJUSTMENT.  */
  xops[0] = base;
  xops[1] = adjustment;
  output_asm_insn ("sub\t%0, %0, %1", xops);
  return "";
}
/* Determine whether a frame chain needs to be generated.  */

static bool
aarch64_needs_frame_chain (void)
{
  /* Force a frame chain for EH returns so the return address is at FP+8.  */
  if (frame_pointer_needed || crtl->calls_eh_return)
    return true;

  /* A leaf function cannot have calls or write LR.  */
  bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);

  /* Don't use a frame chain in leaf functions if leaf frame pointers
     are disabled.  */
  if (flag_omit_leaf_frame_pointer && is_leaf)
    return false;

  return aarch64_use_frame_pointer;
}
/* Mark the registers that need to be saved by the callee and calculate
   the size of the callee-saved registers area and frame record (both FP
   and LR may be omitted).  */

static void
aarch64_layout_frame (void)
{
  HOST_WIDE_INT offset = 0;
  int regno, last_fp_reg = INVALID_REGNUM;
  bool simd_function = aarch64_simd_decl_p (cfun->decl);

  cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();

  /* Adjust the outgoing arguments size if required.  Keep it in sync with what
     the mid-end is doing.  */
  crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);

#define SLOT_NOT_REQUIRED (-2)
#define SLOT_REQUIRED     (-1)

  cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
  cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;

  /* If this is a non-leaf simd function with calls we assume that
     at least one of those calls is to a non-simd function and thus
     we must save V8 to V23 in the prologue.  */

  if (simd_function && !crtl->is_leaf)
    {
      for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
        if (FP_SIMD_SAVED_REGNUM_P (regno))
          df_set_regs_ever_live (regno, true);
    }

  /* First mark all the registers that really need to be saved...  */
  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
    cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;

  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
    cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;

  /* ... that includes the eh data registers (if needed)...  */
  if (crtl->calls_eh_return)
    for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
      cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
        = SLOT_REQUIRED;

  /* ... and any callee saved register that dataflow says is live.  */
  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
    if (df_regs_ever_live_p (regno)
        && (regno == R30_REGNUM
            || !call_used_regs[regno]))
      cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;

  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
    if (df_regs_ever_live_p (regno)
        && (!call_used_regs[regno]
            || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
      {
        cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
        last_fp_reg = regno;
      }

  if (cfun->machine->frame.emit_frame_chain)
    {
      /* FP and LR are placed in the linkage record.  */
      cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
      cfun->machine->frame.wb_candidate1 = R29_REGNUM;
      cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
      cfun->machine->frame.wb_candidate2 = R30_REGNUM;
      offset = 2 * UNITS_PER_WORD;
    }

  /* With stack-clash, LR must be saved in non-leaf functions.  */
  gcc_assert (crtl->is_leaf
              || (cfun->machine->frame.reg_offset[R30_REGNUM]
                  != SLOT_NOT_REQUIRED));

  /* Now assign stack slots for them.  */
  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
    if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
      {
        cfun->machine->frame.reg_offset[regno] = offset;
        if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
          cfun->machine->frame.wb_candidate1 = regno;
        else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
          cfun->machine->frame.wb_candidate2 = regno;
        offset += UNITS_PER_WORD;
      }

  HOST_WIDE_INT max_int_offset = offset;
  offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
  bool has_align_gap = offset != max_int_offset;

  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
    if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
      {
        /* If there is an alignment gap between integer and fp callee-saves,
           allocate the last fp register to it if possible.  */
        if (regno == last_fp_reg
            && has_align_gap
            && !simd_function
            && (offset & 8) == 0)
          {
            cfun->machine->frame.reg_offset[regno] = max_int_offset;
            break;
          }

        cfun->machine->frame.reg_offset[regno] = offset;
        if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
          cfun->machine->frame.wb_candidate1 = regno;
        else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
                 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
          cfun->machine->frame.wb_candidate2 = regno;
        offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
      }

  offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);

  cfun->machine->frame.saved_regs_size = offset;

  HOST_WIDE_INT varargs_and_saved_regs_size
    = offset + cfun->machine->frame.saved_varargs_size;

  cfun->machine->frame.hard_fp_offset
    = aligned_upper_bound (varargs_and_saved_regs_size
                           + get_frame_size (),
                           STACK_BOUNDARY / BITS_PER_UNIT);

  /* Both these values are already aligned.  */
  gcc_assert (multiple_p (crtl->outgoing_args_size,
                          STACK_BOUNDARY / BITS_PER_UNIT));
  cfun->machine->frame.frame_size
    = (cfun->machine->frame.hard_fp_offset
       + crtl->outgoing_args_size);

  cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;

  cfun->machine->frame.initial_adjust = 0;
  cfun->machine->frame.final_adjust = 0;
  cfun->machine->frame.callee_adjust = 0;
  cfun->machine->frame.callee_offset = 0;

  HOST_WIDE_INT max_push_offset = 0;
  if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
    max_push_offset = 512;
  else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
    max_push_offset = 256;

  HOST_WIDE_INT const_size, const_fp_offset;
  if (cfun->machine->frame.frame_size.is_constant (&const_size)
      && const_size < max_push_offset
      && known_eq (crtl->outgoing_args_size, 0))
    {
      /* Simple, small frame with no outgoing arguments:
         stp reg1, reg2, [sp, -frame_size]!
         stp reg3, reg4, [sp, 16]  */
      cfun->machine->frame.callee_adjust = const_size;
    }
  else if (known_lt (crtl->outgoing_args_size
                     + cfun->machine->frame.saved_regs_size, 512)
           && !(cfun->calls_alloca
                && known_lt (cfun->machine->frame.hard_fp_offset,
                             max_push_offset)))
    {
      /* Frame with small outgoing arguments:
         sub sp, sp, frame_size
         stp reg1, reg2, [sp, outgoing_args_size]
         stp reg3, reg4, [sp, outgoing_args_size + 16]  */
      cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
      cfun->machine->frame.callee_offset
        = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
    }
  else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
           && const_fp_offset < max_push_offset)
    {
      /* Frame with large outgoing arguments but a small local area:
         stp reg1, reg2, [sp, -hard_fp_offset]!
         stp reg3, reg4, [sp, 16]
         sub sp, sp, outgoing_args_size  */
      cfun->machine->frame.callee_adjust = const_fp_offset;
      cfun->machine->frame.final_adjust
        = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
    }
  else
    {
      /* Frame with large local area and outgoing arguments using frame pointer:
         sub sp, sp, hard_fp_offset
         stp x29, x30, [sp, 0]
         add x29, sp, 0
         stp reg3, reg4, [sp, 16]
         sub sp, sp, outgoing_args_size  */
      cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
      cfun->machine->frame.final_adjust
        = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
    }

  cfun->machine->frame.laid_out = true;
}
/* Return true if the register REGNO is saved on entry to
   the current function.  */

static bool
aarch64_register_saved_on_entry (int regno)
{
  return cfun->machine->frame.reg_offset[regno] >= 0;
}

/* Return the next register up from REGNO up to LIMIT for the callee
   to save.  */

static unsigned
aarch64_next_callee_save (unsigned regno, unsigned limit)
{
  while (regno <= limit && !aarch64_register_saved_on_entry (regno))
    regno++;
  return regno;
}
5227 /* Push the register number REGNO of mode MODE to the stack with write-back
5228 adjusting the stack by ADJUSTMENT. */
5231 aarch64_pushwb_single_reg (machine_mode mode
, unsigned regno
,
5232 HOST_WIDE_INT adjustment
)
5234 rtx base_rtx
= stack_pointer_rtx
;
5237 reg
= gen_rtx_REG (mode
, regno
);
5238 mem
= gen_rtx_PRE_MODIFY (Pmode
, base_rtx
,
5239 plus_constant (Pmode
, base_rtx
, -adjustment
));
5240 mem
= gen_frame_mem (mode
, mem
);
5242 insn
= emit_move_insn (mem
, reg
);
5243 RTX_FRAME_RELATED_P (insn
) = 1;
5246 /* Generate and return an instruction to store the pair of registers
5247 REG and REG2 of mode MODE to location BASE with write-back adjusting
5248 the stack location BASE by ADJUSTMENT. */
5251 aarch64_gen_storewb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
5252 HOST_WIDE_INT adjustment
)
5257 return gen_storewb_pairdi_di (base
, base
, reg
, reg2
,
5258 GEN_INT (-adjustment
),
5259 GEN_INT (UNITS_PER_WORD
- adjustment
));
5261 return gen_storewb_pairdf_di (base
, base
, reg
, reg2
,
5262 GEN_INT (-adjustment
),
5263 GEN_INT (UNITS_PER_WORD
- adjustment
));
5265 return gen_storewb_pairtf_di (base
, base
, reg
, reg2
,
5266 GEN_INT (-adjustment
),
5267 GEN_INT (UNITS_PER_VREG
- adjustment
));
5273 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
5274 stack pointer by ADJUSTMENT. */
5277 aarch64_push_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
)
5280 machine_mode mode
= aarch64_reg_save_mode (cfun
->decl
, regno1
);
5282 if (regno2
== INVALID_REGNUM
)
5283 return aarch64_pushwb_single_reg (mode
, regno1
, adjustment
);
5285 rtx reg1
= gen_rtx_REG (mode
, regno1
);
5286 rtx reg2
= gen_rtx_REG (mode
, regno2
);
5288 insn
= emit_insn (aarch64_gen_storewb_pair (mode
, stack_pointer_rtx
, reg1
,
5290 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 2)) = 1;
5291 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
5292 RTX_FRAME_RELATED_P (insn
) = 1;
5295 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
5296 adjusting it by ADJUSTMENT afterwards. */
5299 aarch64_gen_loadwb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
5300 HOST_WIDE_INT adjustment
)
5305 return gen_loadwb_pairdi_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
5306 GEN_INT (UNITS_PER_WORD
));
5308 return gen_loadwb_pairdf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
5309 GEN_INT (UNITS_PER_WORD
));
5311 return gen_loadwb_pairtf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
5312 GEN_INT (UNITS_PER_VREG
));
5318 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
5319 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
5323 aarch64_pop_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
,
5326 machine_mode mode
= aarch64_reg_save_mode (cfun
->decl
, regno1
);
5327 rtx reg1
= gen_rtx_REG (mode
, regno1
);
5329 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg1
, *cfi_ops
);
5331 if (regno2
== INVALID_REGNUM
)
5333 rtx mem
= plus_constant (Pmode
, stack_pointer_rtx
, adjustment
);
5334 mem
= gen_rtx_POST_MODIFY (Pmode
, stack_pointer_rtx
, mem
);
5335 emit_move_insn (reg1
, gen_frame_mem (mode
, mem
));
5339 rtx reg2
= gen_rtx_REG (mode
, regno2
);
5340 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
5341 emit_insn (aarch64_gen_loadwb_pair (mode
, stack_pointer_rtx
, reg1
,
5346 /* Generate and return a store pair instruction of mode MODE to store
5347 register REG1 to MEM1 and register REG2 to MEM2. */
5350 aarch64_gen_store_pair (machine_mode mode
, rtx mem1
, rtx reg1
, rtx mem2
,
5356 return gen_store_pair_dw_didi (mem1
, reg1
, mem2
, reg2
);
5359 return gen_store_pair_dw_dfdf (mem1
, reg1
, mem2
, reg2
);
5362 return gen_store_pair_dw_tftf (mem1
, reg1
, mem2
, reg2
);
/* Generate and return a load pair instruction of mode MODE to load register
   REG1 from MEM1 and register REG2 from MEM2.  */
5373 aarch64_gen_load_pair (machine_mode mode
, rtx reg1
, rtx mem1
, rtx reg2
,
5379 return gen_load_pair_dw_didi (reg1
, mem1
, reg2
, mem2
);
5382 return gen_load_pair_dw_dfdf (reg1
, mem1
, reg2
, mem2
);
5385 return gen_load_pair_dw_tftf (reg1
, mem1
, reg2
, mem2
);
/* Return TRUE if return address signing should be enabled for the current
   function, otherwise return FALSE.  */

bool
aarch64_return_address_signing_enabled (void)
{
  /* This function should only be called after the frame is laid out.  */
  gcc_assert (cfun->machine->frame.laid_out);

  /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
     if its LR is pushed onto stack.  */
  return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
          || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
              && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
}

/* Return TRUE if Branch Target Identification Mechanism is enabled.  */

bool
aarch64_bti_enabled (void)
{
  return (aarch64_enable_bti == 1);
}
5415 /* Emit code to save the callee-saved registers from register number START
5416 to LIMIT to the stack at the location starting at offset START_OFFSET,
5417 skipping any write-back candidates if SKIP_WB is true. */
5420 aarch64_save_callee_saves (machine_mode mode
, poly_int64 start_offset
,
5421 unsigned start
, unsigned limit
, bool skip_wb
)
5427 for (regno
= aarch64_next_callee_save (start
, limit
);
5429 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
5436 && (regno
== cfun
->machine
->frame
.wb_candidate1
5437 || regno
== cfun
->machine
->frame
.wb_candidate2
))
5440 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
5443 reg
= gen_rtx_REG (mode
, regno
);
5444 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
5445 mem
= gen_frame_mem (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
5448 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
5449 offset_diff
= cfun
->machine
->frame
.reg_offset
[regno2
]
5450 - cfun
->machine
->frame
.reg_offset
[regno
];
5453 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
5454 && known_eq (GET_MODE_SIZE (mode
), offset_diff
))
5456 rtx reg2
= gen_rtx_REG (mode
, regno2
);
5459 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
5460 mem2
= gen_frame_mem (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
5462 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
,
5465 /* The first part of a frame-related parallel insn is
5466 always assumed to be relevant to the frame
5467 calculations; subsequent parts, are only
5468 frame-related if explicitly marked. */
5469 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
5473 insn
= emit_move_insn (mem
, reg
);
5475 RTX_FRAME_RELATED_P (insn
) = 1;
5479 /* Emit code to restore the callee registers of mode MODE from register
5480 number START up to and including LIMIT. Restore from the stack offset
5481 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
5482 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
5485 aarch64_restore_callee_saves (machine_mode mode
,
5486 poly_int64 start_offset
, unsigned start
,
5487 unsigned limit
, bool skip_wb
, rtx
*cfi_ops
)
5489 rtx base_rtx
= stack_pointer_rtx
;
5494 for (regno
= aarch64_next_callee_save (start
, limit
);
5496 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
5498 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
5505 && (regno
== cfun
->machine
->frame
.wb_candidate1
5506 || regno
== cfun
->machine
->frame
.wb_candidate2
))
5509 reg
= gen_rtx_REG (mode
, regno
);
5510 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
5511 mem
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
5513 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
5514 offset_diff
= cfun
->machine
->frame
.reg_offset
[regno2
]
5515 - cfun
->machine
->frame
.reg_offset
[regno
];
5518 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
5519 && known_eq (GET_MODE_SIZE (mode
), offset_diff
))
5521 rtx reg2
= gen_rtx_REG (mode
, regno2
);
5524 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
5525 mem2
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
5526 emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
5528 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
5532 emit_move_insn (reg
, mem
);
5533 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg
, *cfi_ops
);
/* Return true if OFFSET is a signed 4-bit value multiplied by the size
   of MODE.  */

static bool
offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
          && IN_RANGE (multiple, -8, 7));
}

/* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
   of MODE.  */

static bool
offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
          && IN_RANGE (multiple, 0, 63));
}

/* Return true if OFFSET is a signed 7-bit value multiplied by the size
   of MODE.  */

bool
aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
          && IN_RANGE (multiple, -64, 63));
}

/* Return true if OFFSET is a signed 9-bit value.  */

static bool
aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
                                       poly_int64 offset)
{
  HOST_WIDE_INT const_offset;
  return (offset.is_constant (&const_offset)
          && IN_RANGE (const_offset, -256, 255));
}

/* Return true if OFFSET is a signed 9-bit value multiplied by the size
   of MODE.  */

static bool
offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
          && IN_RANGE (multiple, -256, 255));
}

/* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
   of MODE.  */

static bool
offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
          && IN_RANGE (multiple, 0, 4095));
}
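/* For example, with DImode (8-byte units) the predicates above accept:

     offset_4bit_signed_scaled_p:            -64 ... 56     in steps of 8
     offset_6bit_unsigned_scaled_p:             0 ... 504    in steps of 8
     aarch64_offset_7bit_signed_scaled_p:    -512 ... 504    in steps of 8
     aarch64_offset_9bit_signed_unscaled_p:  -256 ... 255    (any byte offset)
     offset_9bit_signed_scaled_p:           -2048 ... 2040   in steps of 8
     offset_12bit_unsigned_scaled_p:            0 ... 32760  in steps of 8

   matching the immediate ranges of the corresponding addressing modes.  */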
5603 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5606 aarch64_get_separate_components (void)
5608 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
5609 bitmap_clear (components
);
5611 /* The registers we need saved to the frame. */
5612 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
5613 if (aarch64_register_saved_on_entry (regno
))
5615 poly_int64 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
5616 if (!frame_pointer_needed
)
5617 offset
+= cfun
->machine
->frame
.frame_size
5618 - cfun
->machine
->frame
.hard_fp_offset
;
5619 /* Check that we can access the stack slot of the register with one
5620 direct load with no adjustments needed. */
5621 if (offset_12bit_unsigned_scaled_p (DImode
, offset
))
5622 bitmap_set_bit (components
, regno
);
5625 /* Don't mess with the hard frame pointer. */
5626 if (frame_pointer_needed
)
5627 bitmap_clear_bit (components
, HARD_FRAME_POINTER_REGNUM
);
5629 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
5630 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
5631 /* If registers have been chosen to be stored/restored with
5632 writeback don't interfere with them to avoid having to output explicit
5633 stack adjustment instructions. */
5634 if (reg2
!= INVALID_REGNUM
)
5635 bitmap_clear_bit (components
, reg2
);
5636 if (reg1
!= INVALID_REGNUM
)
5637 bitmap_clear_bit (components
, reg1
);
5639 bitmap_clear_bit (components
, LR_REGNUM
);
5640 bitmap_clear_bit (components
, SP_REGNUM
);
5645 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5648 aarch64_components_for_bb (basic_block bb
)
5650 bitmap in
= DF_LIVE_IN (bb
);
5651 bitmap gen
= &DF_LIVE_BB_INFO (bb
)->gen
;
5652 bitmap kill
= &DF_LIVE_BB_INFO (bb
)->kill
;
5653 bool simd_function
= aarch64_simd_decl_p (cfun
->decl
);
5655 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
5656 bitmap_clear (components
);
5658 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5659 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
5660 if ((!call_used_regs
[regno
]
5661 || (simd_function
&& FP_SIMD_SAVED_REGNUM_P (regno
)))
5662 && (bitmap_bit_p (in
, regno
)
5663 || bitmap_bit_p (gen
, regno
)
5664 || bitmap_bit_p (kill
, regno
)))
5666 unsigned regno2
, offset
, offset2
;
5667 bitmap_set_bit (components
, regno
);
5669 /* If there is a callee-save at an adjacent offset, add it too
5670 to increase the use of LDP/STP. */
5671 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
5672 regno2
= ((offset
& 8) == 0) ? regno
+ 1 : regno
- 1;
5674 if (regno2
<= LAST_SAVED_REGNUM
)
5676 offset2
= cfun
->machine
->frame
.reg_offset
[regno2
];
5677 if ((offset
& ~8) == (offset2
& ~8))
5678 bitmap_set_bit (components
, regno2
);
/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
   Nothing to do for aarch64.  */

static void
aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
{
}

/* Return the next set bit in BMP from START onwards.  Return the total number
   of bits in BMP if no set bit is found at or after START.  */

static unsigned int
aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
{
  unsigned int nbits = SBITMAP_SIZE (bmp);
  if (start == nbits)
    return start;

  gcc_assert (start < nbits);
  for (unsigned int i = start; i < nbits; i++)
    if (bitmap_bit_p (bmp, i))
      return i;

  return nbits;
}
5711 /* Do the work for aarch64_emit_prologue_components and
5712 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5713 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5714 for these components or the epilogue sequence. That is, it determines
5715 whether we should emit stores or loads and what kind of CFA notes to attach
5716 to the insns. Otherwise the logic for the two sequences is very
5720 aarch64_process_components (sbitmap components
, bool prologue_p
)
5722 rtx ptr_reg
= gen_rtx_REG (Pmode
, frame_pointer_needed
5723 ? HARD_FRAME_POINTER_REGNUM
5724 : STACK_POINTER_REGNUM
);
5726 unsigned last_regno
= SBITMAP_SIZE (components
);
5727 unsigned regno
= aarch64_get_next_set_bit (components
, R0_REGNUM
);
5728 rtx_insn
*insn
= NULL
;
5730 while (regno
!= last_regno
)
5732 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
5733 so DFmode for the vector registers is enough. For simd functions
5734 we want to save the low 128 bits. */
5735 machine_mode mode
= aarch64_reg_save_mode (cfun
->decl
, regno
);
5737 rtx reg
= gen_rtx_REG (mode
, regno
);
5738 poly_int64 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
5739 if (!frame_pointer_needed
)
5740 offset
+= cfun
->machine
->frame
.frame_size
5741 - cfun
->machine
->frame
.hard_fp_offset
;
5742 rtx addr
= plus_constant (Pmode
, ptr_reg
, offset
);
5743 rtx mem
= gen_frame_mem (mode
, addr
);
5745 rtx set
= prologue_p
? gen_rtx_SET (mem
, reg
) : gen_rtx_SET (reg
, mem
);
5746 unsigned regno2
= aarch64_get_next_set_bit (components
, regno
+ 1);
5747 /* No more registers to handle after REGNO.
5748 Emit a single save/restore and exit. */
5749 if (regno2
== last_regno
)
5751 insn
= emit_insn (set
);
5752 RTX_FRAME_RELATED_P (insn
) = 1;
5754 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
5756 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
5760 poly_int64 offset2
= cfun
->machine
->frame
.reg_offset
[regno2
];
5761 /* The next register is not of the same class or its offset is not
5762 mergeable with the current one into a pair. */
5763 if (!satisfies_constraint_Ump (mem
)
5764 || GP_REGNUM_P (regno
) != GP_REGNUM_P (regno2
)
5765 || (aarch64_simd_decl_p (cfun
->decl
) && FP_REGNUM_P (regno
))
5766 || maybe_ne ((offset2
- cfun
->machine
->frame
.reg_offset
[regno
]),
5767 GET_MODE_SIZE (mode
)))
5769 insn
= emit_insn (set
);
5770 RTX_FRAME_RELATED_P (insn
) = 1;
5772 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
5774 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
5780 /* REGNO2 can be saved/restored in a pair with REGNO. */
5781 rtx reg2
= gen_rtx_REG (mode
, regno2
);
5782 if (!frame_pointer_needed
)
5783 offset2
+= cfun
->machine
->frame
.frame_size
5784 - cfun
->machine
->frame
.hard_fp_offset
;
5785 rtx addr2
= plus_constant (Pmode
, ptr_reg
, offset2
);
5786 rtx mem2
= gen_frame_mem (mode
, addr2
);
5787 rtx set2
= prologue_p
? gen_rtx_SET (mem2
, reg2
)
5788 : gen_rtx_SET (reg2
, mem2
);
5791 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
, reg2
));
5793 insn
= emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
5795 RTX_FRAME_RELATED_P (insn
) = 1;
5798 add_reg_note (insn
, REG_CFA_OFFSET
, set
);
5799 add_reg_note (insn
, REG_CFA_OFFSET
, set2
);
5803 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
5804 add_reg_note (insn
, REG_CFA_RESTORE
, reg2
);
5807 regno
= aarch64_get_next_set_bit (components
, regno2
+ 1);
/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */

static void
aarch64_emit_prologue_components (sbitmap components)
{
  aarch64_process_components (components, true);
}

/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */

static void
aarch64_emit_epilogue_components (sbitmap components)
{
  aarch64_process_components (components, false);
}

/* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */

static void
aarch64_set_handled_components (sbitmap components)
{
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if (bitmap_bit_p (components, regno))
      cfun->machine->reg_is_wrapped_separately[regno] = true;
}
/* On AArch64 we have an ABI defined safe buffer.  This constant is used to
   determine the probe offset for alloca.  */

static HOST_WIDE_INT
aarch64_stack_clash_protection_alloca_probe_range (void)
{
  return STACK_CLASH_CALLER_GUARD;
}
5847 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
5848 registers. If POLY_SIZE is not large enough to require a probe this function
5849 will only adjust the stack. When allocating the stack space
5850 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
5851 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
5852 arguments. If we are then we ensure that any allocation larger than the ABI
5853 defined buffer needs a probe so that the invariant of having a 1KB buffer is
5856 We emit barriers after each stack adjustment to prevent optimizations from
5857 breaking the invariant that we never drop the stack more than a page. This
5858 invariant is needed to make it easier to correctly handle asynchronous
5859 events, e.g. if we were to allow the stack to be dropped by more than a page
5860 and then have multiple probes up and we take a signal somewhere in between
5861 then the signal handler doesn't know the state of the stack and can make no
5862 assumptions about which pages have been probed. */
5865 aarch64_allocate_and_probe_stack_space (rtx temp1
, rtx temp2
,
5866 poly_int64 poly_size
,
5867 bool frame_related_p
,
5868 bool final_adjustment_p
)
5870 HOST_WIDE_INT guard_size
5871 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
);
5872 HOST_WIDE_INT guard_used_by_caller
= STACK_CLASH_CALLER_GUARD
;
5873 /* When doing the final adjustment for the outgoing argument size we can't
     assume that LR was saved at position 0.  So subtract its offset from the
5875 ABI safe buffer so that we don't accidentally allow an adjustment that
5876 would result in an allocation larger than the ABI buffer without
5878 HOST_WIDE_INT min_probe_threshold
5879 = final_adjustment_p
5880 ? guard_used_by_caller
- cfun
->machine
->frame
.reg_offset
[LR_REGNUM
]
5881 : guard_size
- guard_used_by_caller
;
5883 poly_int64 frame_size
= cfun
->machine
->frame
.frame_size
;
5885 /* We should always have a positive probe threshold. */
5886 gcc_assert (min_probe_threshold
> 0);
5888 if (flag_stack_clash_protection
&& !final_adjustment_p
)
5890 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
5891 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
5893 if (known_eq (frame_size
, 0))
5895 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME
, false);
5897 else if (known_lt (initial_adjust
, guard_size
- guard_used_by_caller
)
5898 && known_lt (final_adjust
, guard_used_by_caller
))
5900 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME
, true);
5904 /* If SIZE is not large enough to require probing, just adjust the stack and
5906 if (known_lt (poly_size
, min_probe_threshold
)
5907 || !flag_stack_clash_protection
)
5909 aarch64_sub_sp (temp1
, temp2
, poly_size
, frame_related_p
);
5914 /* Handle the SVE non-constant case first. */
5915 if (!poly_size
.is_constant (&size
))
5919 fprintf (dump_file
, "Stack clash SVE prologue: ");
5920 print_dec (poly_size
, dump_file
);
5921 fprintf (dump_file
, " bytes, dynamic probing will be required.\n");
5924 /* First calculate the amount of bytes we're actually spilling. */
5925 aarch64_add_offset (Pmode
, temp1
, CONST0_RTX (Pmode
),
5926 poly_size
, temp1
, temp2
, false, true);
5928 rtx_insn
*insn
= get_last_insn ();
5930 if (frame_related_p
)
5932 /* This is done to provide unwinding information for the stack
5933 adjustments we're about to do, however to prevent the optimizers
5934 from removing the R11 move and leaving the CFA note (which would be
5935 very wrong) we tie the old and new stack pointer together.
5936 The tie will expand to nothing but the optimizers will not touch
5938 rtx stack_ptr_copy
= gen_rtx_REG (Pmode
, STACK_CLASH_SVE_CFA_REGNUM
);
5939 emit_move_insn (stack_ptr_copy
, stack_pointer_rtx
);
5940 emit_insn (gen_stack_tie (stack_ptr_copy
, stack_pointer_rtx
));
5942 /* We want the CFA independent of the stack pointer for the
5943 duration of the loop. */
5944 add_reg_note (insn
, REG_CFA_DEF_CFA
, stack_ptr_copy
);
5945 RTX_FRAME_RELATED_P (insn
) = 1;
5948 rtx probe_const
= gen_int_mode (min_probe_threshold
, Pmode
);
5949 rtx guard_const
= gen_int_mode (guard_size
, Pmode
);
5951 insn
= emit_insn (gen_probe_sve_stack_clash (Pmode
, stack_pointer_rtx
,
5952 stack_pointer_rtx
, temp1
,
5953 probe_const
, guard_const
));
5955 /* Now reset the CFA register if needed. */
5956 if (frame_related_p
)
5958 add_reg_note (insn
, REG_CFA_DEF_CFA
,
5959 gen_rtx_PLUS (Pmode
, stack_pointer_rtx
,
5960 gen_int_mode (poly_size
, Pmode
)));
5961 RTX_FRAME_RELATED_P (insn
) = 1;
5969 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
5970 " bytes, probing will be required.\n", size
);
5972 /* Round size to the nearest multiple of guard_size, and calculate the
5973 residual as the difference between the original size and the rounded
5975 HOST_WIDE_INT rounded_size
= ROUND_DOWN (size
, guard_size
);
5976 HOST_WIDE_INT residual
= size
- rounded_size
;
5978 /* We can handle a small number of allocations/probes inline. Otherwise
5980 if (rounded_size
<= STACK_CLASH_MAX_UNROLL_PAGES
* guard_size
)
5982 for (HOST_WIDE_INT i
= 0; i
< rounded_size
; i
+= guard_size
)
5984 aarch64_sub_sp (NULL
, temp2
, guard_size
, true);
5985 emit_stack_probe (plus_constant (Pmode
, stack_pointer_rtx
,
5986 guard_used_by_caller
));
5987 emit_insn (gen_blockage ());
5989 dump_stack_clash_frame_info (PROBE_INLINE
, size
!= rounded_size
);
5993 /* Compute the ending address. */
5994 aarch64_add_offset (Pmode
, temp1
, stack_pointer_rtx
, -rounded_size
,
5995 temp1
, NULL
, false, true);
5996 rtx_insn
*insn
= get_last_insn ();
5998 /* For the initial allocation, we don't have a frame pointer
5999 set up, so we always need CFI notes. If we're doing the
6000 final allocation, then we may have a frame pointer, in which
6001 case it is the CFA, otherwise we need CFI notes.
6003 We can determine which allocation we are doing by looking at
6004 the value of FRAME_RELATED_P since the final allocations are not
6006 if (frame_related_p
)
6008 /* We want the CFA independent of the stack pointer for the
6009 duration of the loop. */
6010 add_reg_note (insn
, REG_CFA_DEF_CFA
,
6011 plus_constant (Pmode
, temp1
, rounded_size
));
6012 RTX_FRAME_RELATED_P (insn
) = 1;
      /* This allocates and probes the stack.  Note that this re-uses some of
	 the existing Ada stack protection code.  However we are guaranteed not
	 to enter the non loop or residual branches of that code.

	 The non-loop part won't be entered because if our allocation amount
	 doesn't require a loop, the case above would handle it.

	 The residual amount won't be entered because TEMP1 is a multiple of
	 the allocation size.  The residual will always be 0.  As such, the only
	 part we are actually using from that code is the loop setup.  The
	 actual probing is done in aarch64_output_probe_stack_range.  */
6026 insn
= emit_insn (gen_probe_stack_range (stack_pointer_rtx
,
6027 stack_pointer_rtx
, temp1
));
6029 /* Now reset the CFA register if needed. */
6030 if (frame_related_p
)
6032 add_reg_note (insn
, REG_CFA_DEF_CFA
,
6033 plus_constant (Pmode
, stack_pointer_rtx
, rounded_size
));
6034 RTX_FRAME_RELATED_P (insn
) = 1;
6037 emit_insn (gen_blockage ());
6038 dump_stack_clash_frame_info (PROBE_LOOP
, size
!= rounded_size
);
  /* Handle any residuals.  Residuals of at least MIN_PROBE_THRESHOLD have to
     be probed.  This maintains the requirement that each page is probed at
     least once.  For initial probing we probe only if the allocation is
     more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
     if the amount is larger than buffer.  GUARD_SIZE - buffer + buffer ==
     GUARD_SIZE.  This ensures that for any allocation that is large enough to
     trigger a probe here, we'll have at least one, and if it's not large
     enough for this code to emit anything for it, the page would have been
     probed by the saving of FP/LR either by this function or any callees.  If
     we don't have any callees then we won't have more stack adjustments and so
     no further probing is needed.  */
  HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
6055 /* If we're doing final adjustments, and we've done any full page
6056 allocations then any residual needs to be probed. */
6057 if (final_adjustment_p
&& rounded_size
!= 0)
6058 min_probe_threshold
= 0;
6059 /* If doing a small final adjustment, we always probe at offset 0.
6060 This is done to avoid issues when LR is not at position 0 or when
6061 the final adjustment is smaller than the probing offset. */
6062 else if (final_adjustment_p
&& rounded_size
== 0)
6063 residual_probe_offset
= 0;
6065 aarch64_sub_sp (temp1
, temp2
, residual
, frame_related_p
);
6066 if (residual
>= min_probe_threshold
)
6070 "Stack clash AArch64 prologue residuals: "
6071 HOST_WIDE_INT_PRINT_DEC
" bytes, probing will be required."
6074 emit_stack_probe (plus_constant (Pmode
, stack_pointer_rtx
,
6075 residual_probe_offset
));
6076 emit_insn (gen_blockage ());
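/* Illustrative sketch (not from the original sources): the arithmetic used
   above to split a constant allocation into GUARD_SIZE-sized chunks, each
   followed by a probe, plus a residual that may need one more probe.  The
   function name and the plain integer types are assumptions made for this
   example only.  */
static unsigned long long
example_count_stack_clash_probes (unsigned long long size,
				  unsigned long long guard_size,
				  unsigned long long min_probe_threshold)
{
  /* Same shape as ROUND_DOWN (size, guard_size) and the residual above.  */
  unsigned long long rounded_size = size - (size % guard_size);
  unsigned long long residual = size - rounded_size;

  /* One probe per full guard-sized page dropped...  */
  unsigned long long probes = rounded_size / guard_size;

  /* ...plus one more if the residual is large enough to need probing.  */
  if (residual >= min_probe_threshold)
    probes += 1;
  return probes;
}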
/* Return 1 if the register is used by the epilogue.  We need to say the
   return register is used, but only after epilogue generation is complete.
   Note that in the case of sibcalls, the values "used by the epilogue" are
   considered live at the start of the called function.

   For SIMD functions we need to return 1 for FP registers that are saved and
   restored by a function but are not zero in call_used_regs.  If we do not do
   this, optimizations may remove the restore of the register.  */
6091 aarch64_epilogue_uses (int regno
)
6093 if (epilogue_completed
)
6095 if (regno
== LR_REGNUM
)
6097 if (aarch64_simd_decl_p (cfun
->decl
) && FP_SIMD_SAVED_REGNUM_P (regno
))
6103 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6104 is saved at BASE + OFFSET. */
6107 aarch64_add_cfa_expression (rtx_insn
*insn
, unsigned int reg
,
6108 rtx base
, poly_int64 offset
)
6110 rtx mem
= gen_frame_mem (DImode
, plus_constant (Pmode
, base
, offset
));
6111 add_reg_note (insn
, REG_CFA_EXPRESSION
,
6112 gen_rtx_SET (mem
, regno_reg_rtx
[reg
]));
/* AArch64 stack frames generated by this compiler look like:

	+-------------------------------+
	|  incoming stack arguments     |
	+-------------------------------+
	|                               | <-- incoming stack pointer (aligned)
	|  callee-allocated save area   |
	|  for register varargs         |
	+-------------------------------+
	|  local variables              | <-- frame_pointer_rtx
	|                               |
	+-------------------------------+ \
	|  callee-saved registers       |  | frame.saved_regs_size
	+-------------------------------+  |
	|  FP'                          | / <- hard_frame_pointer_rtx (aligned)
	+-------------------------------+
	|  dynamic allocation           |
	+-------------------------------+
	|  outgoing stack arguments     | <-- arg_pointer
	+-------------------------------+
	|                               | <-- stack_pointer_rtx (aligned)

   Dynamic stack allocations via alloca() decrease stack_pointer_rtx
   but leave frame_pointer_rtx and hard_frame_pointer_rtx
   unchanged.

   By default for stack-clash we assume the guard is at least 64KB, but this
   value is configurable to either 4KB or 64KB.  We also force the guard size
   to be the same as the probing interval and both values are kept in sync.

   With those assumptions the callee can allocate up to 63KB (or 3KB depending
   on the guard size) of stack space without probing.

   When probing is needed, we emit a probe at the start of the prologue
   and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.

   We have to track how much space has been allocated and the only stores
   to the stack we track as implicit probes are the FP/LR stores.

   For outgoing arguments we probe if the size is larger than 1KB, such that
   the ABI specified buffer is maintained for the next callee.

   The following registers are reserved during frame layout and should not be
   used for any other purpose:

   - r11: Used by stack clash protection when SVE is enabled.
   - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
   - r14 and r15: Used for speculation tracking.
   - r16(IP0), r17(IP1): Used by indirect tailcalls.
   - r30(LR), r29(FP): Used by standard frame layout.

   These registers must be avoided in frame layout related code unless the
   explicit intention is to interact with one of the features listed above.  */
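/* Illustrative sketch (not from the original sources): the probe-free
   allowance implied by the comment above.  With a 64KB guard and the 1KB
   caller-maintained buffer a callee may drop the stack by up to 63KB before
   it must probe; with a 4KB guard the allowance is 3KB.  The function name
   is an assumption of this example.  */
static unsigned long long
example_probe_free_allowance (unsigned long long guard_size,
			      unsigned long long caller_buffer)
{
  /* e.g. 65536 - 1024 == 64512 bytes (63KB), or 4096 - 1024 == 3072 (3KB).  */
  return guard_size - caller_buffer;
}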
/* Generate the prologue instructions for entry into a function.
   Establish the stack frame by decreasing the stack pointer with a
   properly calculated size and, if necessary, create a frame record
   filled with the values of LR and previous frame pointer.  The
   current FP is also set up if it is in use.  */

void
aarch64_expand_prologue (void)
{
  poly_int64 frame_size = cfun->machine->frame.frame_size;
  poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
  poly_int64 final_adjust = cfun->machine->frame.final_adjust;
  poly_int64 callee_offset = cfun->machine->frame.callee_offset;
  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
6198 /* Sign return address for functions. */
6199 if (aarch64_return_address_signing_enabled ())
6201 switch (aarch64_ra_sign_key
)
6204 insn
= emit_insn (gen_paciasp ());
6207 insn
= emit_insn (gen_pacibsp ());
6212 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
6213 RTX_FRAME_RELATED_P (insn
) = 1;
6216 if (flag_stack_usage_info
)
6217 current_function_static_stack_size
= constant_lower_bound (frame_size
);
6219 if (flag_stack_check
== STATIC_BUILTIN_STACK_CHECK
)
6221 if (crtl
->is_leaf
&& !cfun
->calls_alloca
)
6223 if (maybe_gt (frame_size
, PROBE_INTERVAL
)
6224 && maybe_gt (frame_size
, get_stack_check_protect ()))
6225 aarch64_emit_probe_stack_range (get_stack_check_protect (),
6227 - get_stack_check_protect ()));
6229 else if (maybe_gt (frame_size
, 0))
6230 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size
);
6233 rtx tmp0_rtx
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
6234 rtx tmp1_rtx
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
6236 /* In theory we should never have both an initial adjustment
6237 and a callee save adjustment. Verify that is the case since the
6238 code below does not handle it for -fstack-clash-protection. */
6239 gcc_assert (known_eq (initial_adjust
, 0) || callee_adjust
== 0);
6241 /* Will only probe if the initial adjustment is larger than the guard
6242 less the amount of the guard reserved for use by the caller's
6244 aarch64_allocate_and_probe_stack_space (tmp0_rtx
, tmp1_rtx
, initial_adjust
,
6247 if (callee_adjust
!= 0)
6248 aarch64_push_regs (reg1
, reg2
, callee_adjust
);
6250 if (emit_frame_chain
)
6252 poly_int64 reg_offset
= callee_adjust
;
6253 if (callee_adjust
== 0)
6257 reg_offset
= callee_offset
;
6258 aarch64_save_callee_saves (DImode
, reg_offset
, reg1
, reg2
, false);
6260 aarch64_add_offset (Pmode
, hard_frame_pointer_rtx
,
6261 stack_pointer_rtx
, callee_offset
,
6262 tmp1_rtx
, tmp0_rtx
, frame_pointer_needed
);
6263 if (frame_pointer_needed
&& !frame_size
.is_constant ())
6265 /* Variable-sized frames need to describe the save slot
6266 address using DW_CFA_expression rather than DW_CFA_offset.
6267 This means that, without taking further action, the
6268 locations of the registers that we've already saved would
6269 remain based on the stack pointer even after we redefine
6270 the CFA based on the frame pointer. We therefore need new
6271 DW_CFA_expressions to re-express the save slots with addresses
6272 based on the frame pointer. */
6273 rtx_insn
*insn
= get_last_insn ();
6274 gcc_assert (RTX_FRAME_RELATED_P (insn
));
6276 /* Add an explicit CFA definition if this was previously
6278 if (!find_reg_note (insn
, REG_CFA_ADJUST_CFA
, NULL_RTX
))
6280 rtx src
= plus_constant (Pmode
, stack_pointer_rtx
,
6282 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
6283 gen_rtx_SET (hard_frame_pointer_rtx
, src
));
6286 /* Change the save slot expressions for the registers that
6287 we've already saved. */
6288 reg_offset
-= callee_offset
;
6289 aarch64_add_cfa_expression (insn
, reg2
, hard_frame_pointer_rtx
,
6290 reg_offset
+ UNITS_PER_WORD
);
6291 aarch64_add_cfa_expression (insn
, reg1
, hard_frame_pointer_rtx
,
6294 emit_insn (gen_stack_tie (stack_pointer_rtx
, hard_frame_pointer_rtx
));
6297 aarch64_save_callee_saves (DImode
, callee_offset
, R0_REGNUM
, R30_REGNUM
,
6298 callee_adjust
!= 0 || emit_frame_chain
);
6299 if (aarch64_simd_decl_p (cfun
->decl
))
6300 aarch64_save_callee_saves (TFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
6301 callee_adjust
!= 0 || emit_frame_chain
);
6303 aarch64_save_callee_saves (DFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
6304 callee_adjust
!= 0 || emit_frame_chain
);
  /* We may need to probe the final adjustment if it is larger than the guard
     that is assumed by the callee.  */
6308 aarch64_allocate_and_probe_stack_space (tmp1_rtx
, tmp0_rtx
, final_adjust
,
6309 !frame_pointer_needed
, true);
6312 /* Return TRUE if we can use a simple_return insn.
6314 This function checks whether the callee saved stack is empty, which
6315 means no restore actions are need. The pro_and_epilogue will use
6316 this to check whether shrink-wrapping opt is feasible. */
6319 aarch64_use_return_insn_p (void)
6321 if (!reload_completed
)
6327 return known_eq (cfun
->machine
->frame
.frame_size
, 0);
6330 /* Return false for non-leaf SIMD functions in order to avoid
6331 shrink-wrapping them. Doing this will lose the necessary
6332 save/restore of FP registers. */
6335 aarch64_use_simple_return_insn_p (void)
6337 if (aarch64_simd_decl_p (cfun
->decl
) && !crtl
->is_leaf
)
6343 /* Generate the epilogue instructions for returning from a function.
6344 This is almost exactly the reverse of the prolog sequence, except
6345 that we need to insert barriers to avoid scheduling loads that read
6346 from a deallocated stack, and we optimize the unwind records by
6347 emitting them all together if possible. */
void
aarch64_expand_epilogue (bool for_sibcall)
{
  poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
  poly_int64 final_adjust = cfun->machine->frame.final_adjust;
  poly_int64 callee_offset = cfun->machine->frame.callee_offset;
  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
6359 /* A stack clash protection prologue may not have left EP0_REGNUM or
6360 EP1_REGNUM in a usable state. The same is true for allocations
6361 with an SVE component, since we then need both temporary registers
6362 for each allocation. For stack clash we are in a usable state if
6363 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
6364 HOST_WIDE_INT guard_size
6365 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
);
6366 HOST_WIDE_INT guard_used_by_caller
= STACK_CLASH_CALLER_GUARD
;
6368 /* We can re-use the registers when the allocation amount is smaller than
6369 guard_size - guard_used_by_caller because we won't be doing any probes
6370 then. In such situations the register should remain live with the correct
6372 bool can_inherit_p
= (initial_adjust
.is_constant ()
6373 && final_adjust
.is_constant ())
6374 && (!flag_stack_clash_protection
6375 || known_lt (initial_adjust
,
6376 guard_size
- guard_used_by_caller
));
6378 /* We need to add memory barrier to prevent read from deallocated stack. */
6380 = maybe_ne (get_frame_size ()
6381 + cfun
->machine
->frame
.saved_varargs_size
, 0);
6383 /* Emit a barrier to prevent loads from a deallocated stack. */
6384 if (maybe_gt (final_adjust
, crtl
->outgoing_args_size
)
6385 || cfun
->calls_alloca
6386 || crtl
->calls_eh_return
)
6388 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
6389 need_barrier_p
= false;
6392 /* Restore the stack pointer from the frame pointer if it may not
6393 be the same as the stack pointer. */
6394 rtx tmp0_rtx
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
6395 rtx tmp1_rtx
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
6396 if (frame_pointer_needed
6397 && (maybe_ne (final_adjust
, 0) || cfun
->calls_alloca
))
6398 /* If writeback is used when restoring callee-saves, the CFA
6399 is restored on the instruction doing the writeback. */
6400 aarch64_add_offset (Pmode
, stack_pointer_rtx
,
6401 hard_frame_pointer_rtx
, -callee_offset
,
6402 tmp1_rtx
, tmp0_rtx
, callee_adjust
== 0);
6404 /* The case where we need to re-use the register here is very rare, so
6405 avoid the complicated condition and just always emit a move if the
6406 immediate doesn't fit. */
6407 aarch64_add_sp (tmp1_rtx
, tmp0_rtx
, final_adjust
, true);
6409 aarch64_restore_callee_saves (DImode
, callee_offset
, R0_REGNUM
, R30_REGNUM
,
6410 callee_adjust
!= 0, &cfi_ops
);
6411 if (aarch64_simd_decl_p (cfun
->decl
))
6412 aarch64_restore_callee_saves (TFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
6413 callee_adjust
!= 0, &cfi_ops
);
6415 aarch64_restore_callee_saves (DFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
6416 callee_adjust
!= 0, &cfi_ops
);
6419 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
6421 if (callee_adjust
!= 0)
6422 aarch64_pop_regs (reg1
, reg2
, callee_adjust
, &cfi_ops
);
6424 if (callee_adjust
!= 0 || maybe_gt (initial_adjust
, 65536))
6426 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
6427 insn
= get_last_insn ();
6428 rtx new_cfa
= plus_constant (Pmode
, stack_pointer_rtx
, initial_adjust
);
6429 REG_NOTES (insn
) = alloc_reg_note (REG_CFA_DEF_CFA
, new_cfa
, cfi_ops
);
6430 RTX_FRAME_RELATED_P (insn
) = 1;
6434 /* Liveness of EP0_REGNUM can not be trusted across function calls either, so
6435 add restriction on emit_move optimization to leaf functions. */
6436 aarch64_add_sp (tmp0_rtx
, tmp1_rtx
, initial_adjust
,
6437 (!can_inherit_p
|| !crtl
->is_leaf
6438 || df_regs_ever_live_p (EP0_REGNUM
)));
6442 /* Emit delayed restores and reset the CFA to be SP. */
6443 insn
= get_last_insn ();
6444 cfi_ops
= alloc_reg_note (REG_CFA_DEF_CFA
, stack_pointer_rtx
, cfi_ops
);
6445 REG_NOTES (insn
) = cfi_ops
;
6446 RTX_FRAME_RELATED_P (insn
) = 1;
6449 /* We prefer to emit the combined return/authenticate instruction RETAA,
6450 however there are three cases in which we must instead emit an explicit
6451 authentication instruction.
6453 1) Sibcalls don't return in a normal way, so if we're about to call one
6454 we must authenticate.
6456 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
6457 generating code for !TARGET_ARMV8_3 we can't use it and must
6458 explicitly authenticate.
6460 3) On an eh_return path we make extra stack adjustments to update the
6461 canonical frame address to be the exception handler's CFA. We want
6462 to authenticate using the CFA of the function which calls eh_return.
6464 if (aarch64_return_address_signing_enabled ()
6465 && (for_sibcall
|| !TARGET_ARMV8_3
|| crtl
->calls_eh_return
))
6467 switch (aarch64_ra_sign_key
)
6470 insn
= emit_insn (gen_autiasp ());
6473 insn
= emit_insn (gen_autibsp ());
6478 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
6479 RTX_FRAME_RELATED_P (insn
) = 1;
6482 /* Stack adjustment for exception handler. */
6483 if (crtl
->calls_eh_return
&& !for_sibcall
)
6485 /* We need to unwind the stack by the offset computed by
6486 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
6487 to be SP; letting the CFA move during this adjustment
6488 is just as correct as retaining the CFA from the body
6489 of the function. Therefore, do nothing special. */
6490 emit_insn (gen_add2_insn (stack_pointer_rtx
, EH_RETURN_STACKADJ_RTX
));
6493 emit_use (gen_rtx_REG (DImode
, LR_REGNUM
));
6495 emit_jump_insn (ret_rtx
);
6498 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
6499 normally or return to a previous frame after unwinding.
6501 An EH return uses a single shared return sequence. The epilogue is
6502 exactly like a normal epilogue except that it has an extra input
6503 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
6504 that must be applied after the frame has been destroyed. An extra label
6505 is inserted before the epilogue which initializes this register to zero,
6506 and this is the entry point for a normal return.
6508 An actual EH return updates the return address, initializes the stack
6509 adjustment and jumps directly into the epilogue (bypassing the zeroing
6510 of the adjustment). Since the return address is typically saved on the
6511 stack when a function makes a call, the saved LR must be updated outside
6514 This poses problems as the store is generated well before the epilogue,
6515 so the offset of LR is not known yet. Also optimizations will remove the
6516 store as it appears dead, even after the epilogue is generated (as the
6517 base or offset for loading LR is different in many cases).
6519 To avoid these problems this implementation forces the frame pointer
6520 in eh_return functions so that the location of LR is fixed and known early.
6521 It also marks the store volatile, so no optimization is permitted to
6522 remove the store. */
6524 aarch64_eh_return_handler_rtx (void)
6526 rtx tmp
= gen_frame_mem (Pmode
,
6527 plus_constant (Pmode
, hard_frame_pointer_rtx
, UNITS_PER_WORD
));
6529 /* Mark the store volatile, so no optimization is permitted to remove it. */
6530 MEM_VOLATILE_P (tmp
) = true;
6534 /* Output code to add DELTA to the first argument, and then jump
6535 to FUNCTION. Used for C++ multiple inheritance. */
6537 aarch64_output_mi_thunk (FILE *file
, tree thunk ATTRIBUTE_UNUSED
,
6538 HOST_WIDE_INT delta
,
6539 HOST_WIDE_INT vcall_offset
,
  /* The this pointer is always in x0.  Note that this differs from
     Arm where the this pointer may be bumped to r1 if r0 is required
     to return a pointer to an aggregate.  On AArch64 a result value
     pointer will be in x8.  */
  int this_regno = R0_REGNUM;
6547 rtx this_rtx
, temp0
, temp1
, addr
, funexp
;
6549 const char *fnname
= IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk
));
6551 if (aarch64_bti_enabled ())
6552 emit_insn (gen_bti_c());
6554 reload_completed
= 1;
6555 emit_note (NOTE_INSN_PROLOGUE_END
);
6557 this_rtx
= gen_rtx_REG (Pmode
, this_regno
);
6558 temp0
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
6559 temp1
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
6561 if (vcall_offset
== 0)
6562 aarch64_add_offset (Pmode
, this_rtx
, this_rtx
, delta
, temp1
, temp0
, false);
6565 gcc_assert ((vcall_offset
& (POINTER_BYTES
- 1)) == 0);
6570 if (delta
>= -256 && delta
< 256)
6571 addr
= gen_rtx_PRE_MODIFY (Pmode
, this_rtx
,
6572 plus_constant (Pmode
, this_rtx
, delta
));
6574 aarch64_add_offset (Pmode
, this_rtx
, this_rtx
, delta
,
6575 temp1
, temp0
, false);
6578 if (Pmode
== ptr_mode
)
6579 aarch64_emit_move (temp0
, gen_rtx_MEM (ptr_mode
, addr
));
6581 aarch64_emit_move (temp0
,
6582 gen_rtx_ZERO_EXTEND (Pmode
,
6583 gen_rtx_MEM (ptr_mode
, addr
)));
6585 if (vcall_offset
>= -256 && vcall_offset
< 4096 * POINTER_BYTES
)
6586 addr
= plus_constant (Pmode
, temp0
, vcall_offset
);
6589 aarch64_internal_mov_immediate (temp1
, GEN_INT (vcall_offset
), true,
6591 addr
= gen_rtx_PLUS (Pmode
, temp0
, temp1
);
6594 if (Pmode
== ptr_mode
)
6595 aarch64_emit_move (temp1
, gen_rtx_MEM (ptr_mode
,addr
));
6597 aarch64_emit_move (temp1
,
6598 gen_rtx_SIGN_EXTEND (Pmode
,
6599 gen_rtx_MEM (ptr_mode
, addr
)));
6601 emit_insn (gen_add2_insn (this_rtx
, temp1
));
6604 /* Generate a tail call to the target function. */
6605 if (!TREE_USED (function
))
6607 assemble_external (function
);
6608 TREE_USED (function
) = 1;
6610 funexp
= XEXP (DECL_RTL (function
), 0);
6611 funexp
= gen_rtx_MEM (FUNCTION_MODE
, funexp
);
6612 insn
= emit_call_insn (gen_sibcall (funexp
, const0_rtx
, NULL_RTX
));
6613 SIBLING_CALL_P (insn
) = 1;
6615 insn
= get_insns ();
6616 shorten_branches (insn
);
6618 assemble_start_function (thunk
, fnname
);
6619 final_start_function (insn
, file
, 1);
6620 final (insn
, file
, 1);
6621 final_end_function ();
6622 assemble_end_function (thunk
, fnname
);
6624 /* Stop pretending to be a post-reload pass. */
6625 reload_completed
= 0;
6629 aarch64_tls_referenced_p (rtx x
)
6631 if (!TARGET_HAVE_TLS
)
6633 subrtx_iterator::array_type array
;
6634 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
6636 const_rtx x
= *iter
;
6637 if (GET_CODE (x
) == SYMBOL_REF
&& SYMBOL_REF_TLS_MODEL (x
) != 0)
6639 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6640 TLS offsets, not real symbol references. */
6641 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
6642 iter
.skip_subrtxes ();
/* Return true if val can be encoded as a 12-bit unsigned immediate with
   a left shift of 0 or 12 bits.  */
bool
aarch64_uimm12_shift (HOST_WIDE_INT val)
{
  return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
	  || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val);
}

/* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
   that can be created with a left shift of 0 or 12.  */
static HOST_WIDE_INT
aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
{
  /* Check to see if the value fits in 24 bits, as that is the maximum we can
     handle correctly.  */
  gcc_assert ((val & 0xffffff) == val);

  if (((val & 0xfff) << 0) == val)
    return val;

  return val & (0xfff << 12);
}
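/* Illustrative sketch (not from the original sources): the same
   12-bit-with-optional-LSL-#12 test as aarch64_uimm12_shift above, written
   over a plain 64-bit type so it can be tried in isolation.  */
static int
example_is_uimm12_shift (unsigned long long val)
{
  return (val & 0xfffull) == val		/* fits in bits 0-11  */
	 || (val & (0xfffull << 12)) == val;	/* fits in bits 12-23 */
}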
/* Return true if val is an immediate that can be loaded into a
   register by a MOVZ instruction.  */
static bool
aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
{
  if (GET_MODE_SIZE (mode) > 4)
    {
      if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
	  || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
	return 1;
    }
  else
    {
      /* Ignore sign extension.  */
      val &= (HOST_WIDE_INT) 0xffffffff;
    }
  return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
	  || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
}
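/* Illustrative sketch (not from the original sources): a MOVZ immediate is a
   16-bit value placed at one of the four 16-bit positions of a 64-bit
   register.  This mirrors the 64-bit case of aarch64_movw_imm above; the
   function name and plain types are assumptions of this example.  */
static int
example_is_movz_imm64 (unsigned long long val)
{
  for (int shift = 0; shift < 64; shift += 16)
    if ((val & (0xffffull << shift)) == val)
      return 1;
  return 0;
}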
6693 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6694 64-bit (DImode) integer. */
6696 static unsigned HOST_WIDE_INT
6697 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val
, machine_mode mode
)
6699 unsigned int size
= GET_MODE_UNIT_PRECISION (mode
);
6702 val
&= (HOST_WIDE_INT_1U
<< size
) - 1;
/* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */

static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
  {
    0x0000000100000001ull,
    0x0001000100010001ull,
    0x0101010101010101ull,
    0x1111111111111111ull,
    0x5555555555555555ull,
  };
6721 /* Return true if val is a valid bitmask immediate. */
6724 aarch64_bitmask_imm (HOST_WIDE_INT val_in
, machine_mode mode
)
6726 unsigned HOST_WIDE_INT val
, tmp
, mask
, first_one
, next_one
;
6729 /* Check for a single sequence of one bits and return quickly if so.
6730 The special cases of all ones and all zeroes returns false. */
6731 val
= aarch64_replicate_bitmask_imm (val_in
, mode
);
6732 tmp
= val
+ (val
& -val
);
6734 if (tmp
== (tmp
& -tmp
))
6735 return (val
+ 1) > 1;
6737 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
6739 val
= (val
<< 32) | (val
& 0xffffffff);
6741 /* Invert if the immediate doesn't start with a zero bit - this means we
6742 only need to search for sequences of one bits. */
6746 /* Find the first set bit and set tmp to val with the first sequence of one
6747 bits removed. Return success if there is a single sequence of ones. */
6748 first_one
= val
& -val
;
6749 tmp
= val
& (val
+ first_one
);
6754 /* Find the next set bit and compute the difference in bit position. */
6755 next_one
= tmp
& -tmp
;
6756 bits
= clz_hwi (first_one
) - clz_hwi (next_one
);
6759 /* Check the bit position difference is a power of 2, and that the first
6760 sequence of one bits fits within 'bits' bits. */
6761 if ((mask
>> bits
) != 0 || bits
!= (bits
& -bits
))
6764 /* Check the sequence of one bits is repeated 64/bits times. */
6765 return val
== mask
* bitmask_imm_mul
[__builtin_clz (bits
) - 26];
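/* Illustrative sketch (not from the original sources): a brute-force version
   of the property tested by aarch64_bitmask_imm above.  An AArch64 logical
   (bitmask) immediate is a contiguous run of ones, rotated within an element
   of 2, 4, 8, 16, 32 or 64 bits and replicated to fill the register.  The
   function name and plain integer types are assumptions of this example.  */
static int
example_is_logical_imm64 (unsigned long long val)
{
  if (val == 0 || ~val == 0)
    return 0;			/* All-zeros and all-ones are not encodable.  */

  for (unsigned int size = 2; size <= 64; size *= 2)
    {
      unsigned long long mask = size == 64 ? ~0ull : (1ull << size) - 1;
      unsigned long long elt = val & mask;

      /* The element must replicate exactly to fill all 64 bits.  */
      unsigned long long rep = 0;
      for (unsigned int i = 0; i < 64; i += size)
	rep |= elt << i;
      if (rep != val)
	continue;

      /* Some rotation of the element must be a contiguous run of ones
	 starting at bit 0, i.e. of the form 2^k - 1.  */
      for (unsigned int r = 0; r < size; r++)
	{
	  unsigned long long rot
	    = r == 0 ? elt : ((elt >> r) | (elt << (size - r))) & mask;
	  if (rot != 0 && ((rot + 1) & rot) == 0)
	    return 1;
	}
    }
  return 0;
}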
/* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
   Assumed precondition: VAL_IN is not zero.  */

unsigned HOST_WIDE_INT
aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
{
  int lowest_bit_set = ctz_hwi (val_in);
  int highest_bit_set = floor_log2 (val_in);
  gcc_assert (val_in != 0);

  return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
	  (HOST_WIDE_INT_1U << lowest_bit_set));
}
/* Create constant where bits outside of lowest bit set to highest bit set
   are set to 1.  */

unsigned HOST_WIDE_INT
aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
{
  return val_in | ~aarch64_and_split_imm1 (val_in);
}
6791 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
6794 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in
, machine_mode mode
)
6796 scalar_int_mode int_mode
;
6797 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
6800 if (aarch64_bitmask_imm (val_in
, int_mode
))
6803 if (aarch64_move_imm (val_in
, int_mode
))
6806 unsigned HOST_WIDE_INT imm2
= aarch64_and_split_imm2 (val_in
);
6808 return aarch64_bitmask_imm (imm2
, int_mode
);
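/* Illustrative sketch (not from the original sources): splitting an AND mask
   VAL into two masks whose conjunction equals VAL, in the same spirit as
   aarch64_and_split_imm1/aarch64_and_split_imm2 above.  IMM1 is the solid run
   of ones covering the lowest to highest set bits, IMM2 keeps VAL's bits and
   sets everything outside that run, so (x & imm1) & imm2 == x & val.  The
   function name is an assumption of this example.  */
static void
example_split_and_mask (unsigned long long val,
			unsigned long long *imm1, unsigned long long *imm2)
{
  unsigned long long lowest = val & -val;	/* Lowest set bit.  */
  unsigned long long smear = val;
  /* Smear the highest set bit downwards to cover everything below it.  */
  smear |= smear >> 1;  smear |= smear >> 2;  smear |= smear >> 4;
  smear |= smear >> 8;  smear |= smear >> 16; smear |= smear >> 32;
  *imm1 = smear & ~(lowest - 1);   /* Ones from lowest to highest set bit.  */
  *imm2 = val | ~*imm1;		   /* VAL's bits plus everything outside.  */
}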
6811 /* Return true if val is an immediate that can be loaded into a
6812 register in a single instruction. */
6814 aarch64_move_imm (HOST_WIDE_INT val
, machine_mode mode
)
6816 scalar_int_mode int_mode
;
6817 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
6820 if (aarch64_movw_imm (val
, int_mode
) || aarch64_movw_imm (~val
, int_mode
))
6822 return aarch64_bitmask_imm (val
, int_mode
);
6826 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED
, rtx x
)
6830 if (GET_CODE (x
) == HIGH
)
6833 /* There's no way to calculate VL-based values using relocations. */
6834 subrtx_iterator::array_type array
;
6835 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
6836 if (GET_CODE (*iter
) == CONST_POLY_INT
)
6839 split_const (x
, &base
, &offset
);
6840 if (GET_CODE (base
) == SYMBOL_REF
|| GET_CODE (base
) == LABEL_REF
)
6842 if (aarch64_classify_symbol (base
, INTVAL (offset
))
6843 != SYMBOL_FORCE_TO_MEM
)
6846 /* Avoid generating a 64-bit relocation in ILP32; leave
6847 to aarch64_expand_mov_immediate to handle it properly. */
6848 return mode
!= ptr_mode
;
6851 return aarch64_tls_referenced_p (x
);
6854 /* Implement TARGET_CASE_VALUES_THRESHOLD.
6855 The expansion for a table switch is quite expensive due to the number
6856 of instructions, the table lookup and hard to predict indirect jump.
6857 When optimizing for speed, and -O3 enabled, use the per-core tuning if
6858 set, otherwise use tables for > 16 cases as a tradeoff between size and
6859 performance. When optimizing for size, use the default setting. */
6862 aarch64_case_values_threshold (void)
6864 /* Use the specified limit for the number of cases before using jump
6865 tables at higher optimization levels. */
6867 && selected_cpu
->tune
->max_case_values
!= 0)
6868 return selected_cpu
->tune
->max_case_values
;
6870 return optimize_size
? default_case_values_threshold () : 17;
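/* Illustrative sketch (not from the original sources): the shape of the
   decision above.  PER_CORE_LIMIT stands in for the per-core max_case_values
   tuning field and is an assumption of this example.  */
static unsigned int
example_case_values_threshold (int optimizing_for_speed,
			       unsigned int per_core_limit,
			       unsigned int default_threshold)
{
  if (optimizing_for_speed && per_core_limit != 0)
    return per_core_limit;		/* Per-core tuning wins.  */
  /* Otherwise more than 16 cases use a jump table when optimizing for speed.  */
  return optimizing_for_speed ? 17 : default_threshold;
}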
6873 /* Return true if register REGNO is a valid index register.
6874 STRICT_P is true if REG_OK_STRICT is in effect. */
6877 aarch64_regno_ok_for_index_p (int regno
, bool strict_p
)
6879 if (!HARD_REGISTER_NUM_P (regno
))
6887 regno
= reg_renumber
[regno
];
6889 return GP_REGNUM_P (regno
);
6892 /* Return true if register REGNO is a valid base register for mode MODE.
6893 STRICT_P is true if REG_OK_STRICT is in effect. */
6896 aarch64_regno_ok_for_base_p (int regno
, bool strict_p
)
6898 if (!HARD_REGISTER_NUM_P (regno
))
6906 regno
= reg_renumber
[regno
];
6909 /* The fake registers will be eliminated to either the stack or
6910 hard frame pointer, both of which are usually valid base registers.
6911 Reload deals with the cases where the eliminated form isn't valid. */
6912 return (GP_REGNUM_P (regno
)
6913 || regno
== SP_REGNUM
6914 || regno
== FRAME_POINTER_REGNUM
6915 || regno
== ARG_POINTER_REGNUM
);
6918 /* Return true if X is a valid base register for mode MODE.
6919 STRICT_P is true if REG_OK_STRICT is in effect. */
6922 aarch64_base_register_rtx_p (rtx x
, bool strict_p
)
6925 && GET_CODE (x
) == SUBREG
6926 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (x
))])
6929 return (REG_P (x
) && aarch64_regno_ok_for_base_p (REGNO (x
), strict_p
));
6932 /* Return true if address offset is a valid index. If it is, fill in INFO
6933 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
6936 aarch64_classify_index (struct aarch64_address_info
*info
, rtx x
,
6937 machine_mode mode
, bool strict_p
)
6939 enum aarch64_address_type type
;
6944 if ((REG_P (x
) || GET_CODE (x
) == SUBREG
)
6945 && GET_MODE (x
) == Pmode
)
6947 type
= ADDRESS_REG_REG
;
6951 /* (sign_extend:DI (reg:SI)) */
6952 else if ((GET_CODE (x
) == SIGN_EXTEND
6953 || GET_CODE (x
) == ZERO_EXTEND
)
6954 && GET_MODE (x
) == DImode
6955 && GET_MODE (XEXP (x
, 0)) == SImode
)
6957 type
= (GET_CODE (x
) == SIGN_EXTEND
)
6958 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
6959 index
= XEXP (x
, 0);
6962 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
6963 else if (GET_CODE (x
) == MULT
6964 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
6965 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
6966 && GET_MODE (XEXP (x
, 0)) == DImode
6967 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
6968 && CONST_INT_P (XEXP (x
, 1)))
6970 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
6971 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
6972 index
= XEXP (XEXP (x
, 0), 0);
6973 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
6975 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
6976 else if (GET_CODE (x
) == ASHIFT
6977 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
6978 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
6979 && GET_MODE (XEXP (x
, 0)) == DImode
6980 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
6981 && CONST_INT_P (XEXP (x
, 1)))
6983 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
6984 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
6985 index
= XEXP (XEXP (x
, 0), 0);
6986 shift
= INTVAL (XEXP (x
, 1));
6988 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
6989 else if ((GET_CODE (x
) == SIGN_EXTRACT
6990 || GET_CODE (x
) == ZERO_EXTRACT
)
6991 && GET_MODE (x
) == DImode
6992 && GET_CODE (XEXP (x
, 0)) == MULT
6993 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
6994 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
6996 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
6997 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
6998 index
= XEXP (XEXP (x
, 0), 0);
6999 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
7000 if (INTVAL (XEXP (x
, 1)) != 32 + shift
7001 || INTVAL (XEXP (x
, 2)) != 0)
7004 /* (and:DI (mult:DI (reg:DI) (const_int scale))
7005 (const_int 0xffffffff<<shift)) */
7006 else if (GET_CODE (x
) == AND
7007 && GET_MODE (x
) == DImode
7008 && GET_CODE (XEXP (x
, 0)) == MULT
7009 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
7010 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
7011 && CONST_INT_P (XEXP (x
, 1)))
7013 type
= ADDRESS_REG_UXTW
;
7014 index
= XEXP (XEXP (x
, 0), 0);
7015 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
7016 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
7019 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
7020 else if ((GET_CODE (x
) == SIGN_EXTRACT
7021 || GET_CODE (x
) == ZERO_EXTRACT
)
7022 && GET_MODE (x
) == DImode
7023 && GET_CODE (XEXP (x
, 0)) == ASHIFT
7024 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
7025 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
7027 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
7028 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
7029 index
= XEXP (XEXP (x
, 0), 0);
7030 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
7031 if (INTVAL (XEXP (x
, 1)) != 32 + shift
7032 || INTVAL (XEXP (x
, 2)) != 0)
7035 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
7036 (const_int 0xffffffff<<shift)) */
7037 else if (GET_CODE (x
) == AND
7038 && GET_MODE (x
) == DImode
7039 && GET_CODE (XEXP (x
, 0)) == ASHIFT
7040 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
7041 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
7042 && CONST_INT_P (XEXP (x
, 1)))
7044 type
= ADDRESS_REG_UXTW
;
7045 index
= XEXP (XEXP (x
, 0), 0);
7046 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
7047 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
7050 /* (mult:P (reg:P) (const_int scale)) */
7051 else if (GET_CODE (x
) == MULT
7052 && GET_MODE (x
) == Pmode
7053 && GET_MODE (XEXP (x
, 0)) == Pmode
7054 && CONST_INT_P (XEXP (x
, 1)))
7056 type
= ADDRESS_REG_REG
;
7057 index
= XEXP (x
, 0);
7058 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
7060 /* (ashift:P (reg:P) (const_int shift)) */
7061 else if (GET_CODE (x
) == ASHIFT
7062 && GET_MODE (x
) == Pmode
7063 && GET_MODE (XEXP (x
, 0)) == Pmode
7064 && CONST_INT_P (XEXP (x
, 1)))
7066 type
= ADDRESS_REG_REG
;
7067 index
= XEXP (x
, 0);
7068 shift
= INTVAL (XEXP (x
, 1));
7074 && GET_CODE (index
) == SUBREG
7075 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (index
))])
7076 index
= SUBREG_REG (index
);
7078 if (aarch64_sve_data_mode_p (mode
))
7080 if (type
!= ADDRESS_REG_REG
7081 || (1 << shift
) != GET_MODE_UNIT_SIZE (mode
))
7087 && !(IN_RANGE (shift
, 1, 3)
7088 && known_eq (1 << shift
, GET_MODE_SIZE (mode
))))
7093 && aarch64_regno_ok_for_index_p (REGNO (index
), strict_p
))
7096 info
->offset
= index
;
7097 info
->shift
= shift
;
7104 /* Return true if MODE is one of the modes for which we
7105 support LDP/STP operations. */
7108 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode
)
7110 return mode
== SImode
|| mode
== DImode
7111 || mode
== SFmode
|| mode
== DFmode
7112 || (aarch64_vector_mode_supported_p (mode
)
7113 && (known_eq (GET_MODE_SIZE (mode
), 8)
7114 || (known_eq (GET_MODE_SIZE (mode
), 16)
7115 && (aarch64_tune_params
.extra_tuning_flags
7116 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
) == 0)));
7119 /* Return true if REGNO is a virtual pointer register, or an eliminable
7120 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
7121 include stack_pointer or hard_frame_pointer. */
7123 virt_or_elim_regno_p (unsigned regno
)
7125 return ((regno
>= FIRST_VIRTUAL_REGISTER
7126 && regno
<= LAST_VIRTUAL_POINTER_REGISTER
)
7127 || regno
== FRAME_POINTER_REGNUM
7128 || regno
== ARG_POINTER_REGNUM
);
7131 /* Return true if X is a valid address of type TYPE for machine mode MODE.
7132 If it is, fill in INFO appropriately. STRICT_P is true if
7133 REG_OK_STRICT is in effect. */
7136 aarch64_classify_address (struct aarch64_address_info
*info
,
7137 rtx x
, machine_mode mode
, bool strict_p
,
7138 aarch64_addr_query_type type
)
7140 enum rtx_code code
= GET_CODE (x
);
7144 HOST_WIDE_INT const_size
;
7146 /* On BE, we use load/store pair for all large int mode load/stores.
7147 TI/TFmode may also use a load/store pair. */
7148 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
7149 bool advsimd_struct_p
= (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
));
7150 bool load_store_pair_p
= (type
== ADDR_QUERY_LDP_STP
7151 || type
== ADDR_QUERY_LDP_STP_N
7154 || (BYTES_BIG_ENDIAN
&& advsimd_struct_p
));
7156 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
7157 corresponds to the actual size of the memory being loaded/stored and the
7158 mode of the corresponding addressing mode is half of that. */
7159 if (type
== ADDR_QUERY_LDP_STP_N
7160 && known_eq (GET_MODE_SIZE (mode
), 16))
7163 bool allow_reg_index_p
= (!load_store_pair_p
7164 && (known_lt (GET_MODE_SIZE (mode
), 16)
7165 || vec_flags
== VEC_ADVSIMD
7166 || vec_flags
& VEC_SVE_DATA
));
7168 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
7169 [Rn, #offset, MUL VL]. */
7170 if ((vec_flags
& (VEC_SVE_DATA
| VEC_SVE_PRED
)) != 0
7171 && (code
!= REG
&& code
!= PLUS
))
7174 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
7176 if (advsimd_struct_p
7177 && !BYTES_BIG_ENDIAN
7178 && (code
!= POST_INC
&& code
!= REG
))
7181 gcc_checking_assert (GET_MODE (x
) == VOIDmode
7182 || SCALAR_INT_MODE_P (GET_MODE (x
)));
7188 info
->type
= ADDRESS_REG_IMM
;
7190 info
->offset
= const0_rtx
;
7191 info
->const_offset
= 0;
7192 return aarch64_base_register_rtx_p (x
, strict_p
);
7200 && virt_or_elim_regno_p (REGNO (op0
))
7201 && poly_int_rtx_p (op1
, &offset
))
7203 info
->type
= ADDRESS_REG_IMM
;
7206 info
->const_offset
= offset
;
7211 if (maybe_ne (GET_MODE_SIZE (mode
), 0)
7212 && aarch64_base_register_rtx_p (op0
, strict_p
)
7213 && poly_int_rtx_p (op1
, &offset
))
7215 info
->type
= ADDRESS_REG_IMM
;
7218 info
->const_offset
= offset
;
7220 /* TImode and TFmode values are allowed in both pairs of X
7221 registers and individual Q registers. The available
7223 X,X: 7-bit signed scaled offset
7224 Q: 9-bit signed offset
7225 We conservatively require an offset representable in either mode.
7226 When performing the check for pairs of X registers i.e. LDP/STP
7227 pass down DImode since that is the natural size of the LDP/STP
7228 instruction memory accesses. */
7229 if (mode
== TImode
|| mode
== TFmode
)
7230 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
7231 && (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
7232 || offset_12bit_unsigned_scaled_p (mode
, offset
)));
7234 /* A 7bit offset check because OImode will emit a ldp/stp
7235 instruction (only big endian will get here).
7236 For ldp/stp instructions, the offset is scaled for the size of a
7237 single element of the pair. */
7239 return aarch64_offset_7bit_signed_scaled_p (TImode
, offset
);
7241 /* Three 9/12 bit offsets checks because CImode will emit three
7242 ldr/str instructions (only big endian will get here). */
7244 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
7245 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode
,
7247 || offset_12bit_unsigned_scaled_p (V16QImode
,
7250 /* Two 7bit offsets checks because XImode will emit two ldp/stp
7251 instructions (only big endian will get here). */
7253 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
7254 && aarch64_offset_7bit_signed_scaled_p (TImode
,
7257 /* Make "m" use the LD1 offset range for SVE data modes, so
7258 that pre-RTL optimizers like ivopts will work to that
7259 instead of the wider LDR/STR range. */
7260 if (vec_flags
== VEC_SVE_DATA
)
7261 return (type
== ADDR_QUERY_M
7262 ? offset_4bit_signed_scaled_p (mode
, offset
)
7263 : offset_9bit_signed_scaled_p (mode
, offset
));
7265 if (vec_flags
== (VEC_SVE_DATA
| VEC_STRUCT
))
7267 poly_int64 end_offset
= (offset
7268 + GET_MODE_SIZE (mode
)
7269 - BYTES_PER_SVE_VECTOR
);
7270 return (type
== ADDR_QUERY_M
7271 ? offset_4bit_signed_scaled_p (mode
, offset
)
7272 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE
, offset
)
7273 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE
,
7277 if (vec_flags
== VEC_SVE_PRED
)
7278 return offset_9bit_signed_scaled_p (mode
, offset
);
7280 if (load_store_pair_p
)
7281 return ((known_eq (GET_MODE_SIZE (mode
), 4)
7282 || known_eq (GET_MODE_SIZE (mode
), 8)
7283 || known_eq (GET_MODE_SIZE (mode
), 16))
7284 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
7286 return (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
7287 || offset_12bit_unsigned_scaled_p (mode
, offset
));
7290 if (allow_reg_index_p
)
7292 /* Look for base + (scaled/extended) index register. */
7293 if (aarch64_base_register_rtx_p (op0
, strict_p
)
7294 && aarch64_classify_index (info
, op1
, mode
, strict_p
))
7299 if (aarch64_base_register_rtx_p (op1
, strict_p
)
7300 && aarch64_classify_index (info
, op0
, mode
, strict_p
))
7313 info
->type
= ADDRESS_REG_WB
;
7314 info
->base
= XEXP (x
, 0);
7315 info
->offset
= NULL_RTX
;
7316 return aarch64_base_register_rtx_p (info
->base
, strict_p
);
7320 info
->type
= ADDRESS_REG_WB
;
7321 info
->base
= XEXP (x
, 0);
7322 if (GET_CODE (XEXP (x
, 1)) == PLUS
7323 && poly_int_rtx_p (XEXP (XEXP (x
, 1), 1), &offset
)
7324 && rtx_equal_p (XEXP (XEXP (x
, 1), 0), info
->base
)
7325 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
7327 info
->offset
= XEXP (XEXP (x
, 1), 1);
7328 info
->const_offset
= offset
;
7330 /* TImode and TFmode values are allowed in both pairs of X
7331 registers and individual Q registers. The available
7333 X,X: 7-bit signed scaled offset
7334 Q: 9-bit signed offset
7335 We conservatively require an offset representable in either mode.
7337 if (mode
== TImode
|| mode
== TFmode
)
7338 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
7339 && aarch64_offset_9bit_signed_unscaled_p (mode
, offset
));
7341 if (load_store_pair_p
)
7342 return ((known_eq (GET_MODE_SIZE (mode
), 4)
7343 || known_eq (GET_MODE_SIZE (mode
), 8)
7344 || known_eq (GET_MODE_SIZE (mode
), 16))
7345 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
7347 return aarch64_offset_9bit_signed_unscaled_p (mode
, offset
);
7354 /* load literal: pc-relative constant pool entry. Only supported
7355 for SI mode or larger. */
7356 info
->type
= ADDRESS_SYMBOLIC
;
7358 if (!load_store_pair_p
7359 && GET_MODE_SIZE (mode
).is_constant (&const_size
)
7364 split_const (x
, &sym
, &addend
);
7365 return ((GET_CODE (sym
) == LABEL_REF
7366 || (GET_CODE (sym
) == SYMBOL_REF
7367 && CONSTANT_POOL_ADDRESS_P (sym
)
7368 && aarch64_pcrelative_literal_loads
)));
7373 info
->type
= ADDRESS_LO_SUM
;
7374 info
->base
= XEXP (x
, 0);
7375 info
->offset
= XEXP (x
, 1);
7376 if (allow_reg_index_p
7377 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
7380 split_const (info
->offset
, &sym
, &offs
);
7381 if (GET_CODE (sym
) == SYMBOL_REF
7382 && (aarch64_classify_symbol (sym
, INTVAL (offs
))
7383 == SYMBOL_SMALL_ABSOLUTE
))
7385 /* The symbol and offset must be aligned to the access size. */
7388 if (CONSTANT_POOL_ADDRESS_P (sym
))
7389 align
= GET_MODE_ALIGNMENT (get_pool_mode (sym
));
7390 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym
))
7392 tree exp
= SYMBOL_REF_DECL (sym
);
7393 align
= TYPE_ALIGN (TREE_TYPE (exp
));
7394 align
= aarch64_constant_alignment (exp
, align
);
7396 else if (SYMBOL_REF_DECL (sym
))
7397 align
= DECL_ALIGN (SYMBOL_REF_DECL (sym
));
7398 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym
)
7399 && SYMBOL_REF_BLOCK (sym
) != NULL
)
7400 align
= SYMBOL_REF_BLOCK (sym
)->alignment
;
7402 align
= BITS_PER_UNIT
;
7404 poly_int64 ref_size
= GET_MODE_SIZE (mode
);
7405 if (known_eq (ref_size
, 0))
7406 ref_size
= GET_MODE_SIZE (DImode
);
7408 return (multiple_p (INTVAL (offs
), ref_size
)
7409 && multiple_p (align
/ BITS_PER_UNIT
, ref_size
));
7419 /* Return true if the address X is valid for a PRFM instruction.
7420 STRICT_P is true if we should do strict checking with
7421 aarch64_classify_address. */
7424 aarch64_address_valid_for_prefetch_p (rtx x
, bool strict_p
)
7426 struct aarch64_address_info addr
;
7428 /* PRFM accepts the same addresses as DImode... */
7429 bool res
= aarch64_classify_address (&addr
, x
, DImode
, strict_p
);
7433 /* ... except writeback forms. */
7434 return addr
.type
!= ADDRESS_REG_WB
;
7438 aarch64_symbolic_address_p (rtx x
)
7442 split_const (x
, &x
, &offset
);
7443 return GET_CODE (x
) == SYMBOL_REF
|| GET_CODE (x
) == LABEL_REF
;
7446 /* Classify the base of symbolic expression X. */
7448 enum aarch64_symbol_type
7449 aarch64_classify_symbolic_expression (rtx x
)
7453 split_const (x
, &x
, &offset
);
7454 return aarch64_classify_symbol (x
, INTVAL (offset
));
7458 /* Return TRUE if X is a legitimate address for accessing memory in
7461 aarch64_legitimate_address_hook_p (machine_mode mode
, rtx x
, bool strict_p
)
7463 struct aarch64_address_info addr
;
7465 return aarch64_classify_address (&addr
, x
, mode
, strict_p
);
7468 /* Return TRUE if X is a legitimate address of type TYPE for accessing
7469 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
7471 aarch64_legitimate_address_p (machine_mode mode
, rtx x
, bool strict_p
,
7472 aarch64_addr_query_type type
)
7474 struct aarch64_address_info addr
;
7476 return aarch64_classify_address (&addr
, x
, mode
, strict_p
, type
);
7479 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
7482 aarch64_legitimize_address_displacement (rtx
*offset1
, rtx
*offset2
,
7483 poly_int64 orig_offset
,
7487 if (GET_MODE_SIZE (mode
).is_constant (&size
))
7489 HOST_WIDE_INT const_offset
, second_offset
;
7491 /* A general SVE offset is A * VQ + B. Remove the A component from
7492 coefficient 0 in order to get the constant B. */
7493 const_offset
= orig_offset
.coeffs
[0] - orig_offset
.coeffs
[1];
7495 /* Split an out-of-range address displacement into a base and
7496 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
7497 range otherwise to increase opportunities for sharing the base
7498 address of different sizes. Unaligned accesses use the signed
7499 9-bit range, TImode/TFmode use the intersection of signed
7500 scaled 7-bit and signed 9-bit offset. */
7501 if (mode
== TImode
|| mode
== TFmode
)
7502 second_offset
= ((const_offset
+ 0x100) & 0x1f8) - 0x100;
7503 else if ((const_offset
& (size
- 1)) != 0)
7504 second_offset
= ((const_offset
+ 0x100) & 0x1ff) - 0x100;
7506 second_offset
= const_offset
& (size
< 4 ? 0xfff : 0x3ffc);
7508 if (second_offset
== 0 || known_eq (orig_offset
, second_offset
))
7511 /* Split the offset into second_offset and the rest. */
7512 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
7513 *offset2
= gen_int_mode (second_offset
, Pmode
);
7518 /* Get the mode we should use as the basis of the range. For structure
7519 modes this is the mode of one vector. */
7520 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
7521 machine_mode step_mode
7522 = (vec_flags
& VEC_STRUCT
) != 0 ? SVE_BYTE_MODE
: mode
;
7524 /* Get the "mul vl" multiplier we'd like to use. */
7525 HOST_WIDE_INT factor
= GET_MODE_SIZE (step_mode
).coeffs
[1];
7526 HOST_WIDE_INT vnum
= orig_offset
.coeffs
[1] / factor
;
      if (vec_flags & VEC_SVE_DATA)
	/* LDR supports a 9-bit range, but the move patterns for
	   structure modes require all vectors to be in range of the
	   same base.  The simplest way of accommodating that while still
	   promoting reuse of anchor points between different modes is
	   to use an 8-bit range unconditionally.  */
	vnum = ((vnum + 128) & 255) - 128;
7535 /* Predicates are only handled singly, so we might as well use
7537 vnum
= ((vnum
+ 256) & 511) - 256;
7541 /* Convert the "mul vl" multiplier into a byte offset. */
7542 poly_int64 second_offset
= GET_MODE_SIZE (step_mode
) * vnum
;
7543 if (known_eq (second_offset
, orig_offset
))
7546 /* Split the offset into second_offset and the rest. */
7547 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
7548 *offset2
= gen_int_mode (second_offset
, Pmode
);
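/* Illustrative sketch (not from the original sources): two of the masks used
   above to split an out-of-range constant displacement into an anchor (folded
   into the base register) plus a small in-range remainder.  The TImode/TFmode
   special case is omitted; names and types are assumptions of this example.  */
static void
example_split_displacement (long long offset, int access_size, int aligned,
			    long long *anchor, long long *remainder)
{
  long long second;
  if (!aligned)
    /* Unaligned accesses use the signed 9-bit (-256..255) range.  */
    second = ((offset + 0x100) & 0x1ff) - 0x100;
  else
    /* Aligned accesses use a 4KB range for 1- and 2-byte accesses and a
       16KB range otherwise, to encourage sharing of anchor points.  */
    second = offset & (access_size < 4 ? 0xfff : 0x3ffc);
  *anchor = offset - second;
  *remainder = second;
}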
7553 /* Return the binary representation of floating point constant VALUE in INTVAL.
7554 If the value cannot be converted, return false without setting INTVAL.
7555 The conversion is done in the given MODE. */
7557 aarch64_reinterpret_float_as_int (rtx value
, unsigned HOST_WIDE_INT
*intval
)
7560 /* We make a general exception for 0. */
7561 if (aarch64_float_const_zero_rtx_p (value
))
7567 scalar_float_mode mode
;
7568 if (GET_CODE (value
) != CONST_DOUBLE
7569 || !is_a
<scalar_float_mode
> (GET_MODE (value
), &mode
)
7570 || GET_MODE_BITSIZE (mode
) > HOST_BITS_PER_WIDE_INT
7571 /* Only support up to DF mode. */
7572 || GET_MODE_BITSIZE (mode
) > GET_MODE_BITSIZE (DFmode
))
7575 unsigned HOST_WIDE_INT ival
= 0;
7578 real_to_target (res
,
7579 CONST_DOUBLE_REAL_VALUE (value
),
7580 REAL_MODE_FORMAT (mode
));
7584 int order
= BYTES_BIG_ENDIAN
? 1 : 0;
7585 ival
= zext_hwi (res
[order
], 32);
7586 ival
|= (zext_hwi (res
[1 - order
], 32) << 32);
7589 ival
= zext_hwi (res
[0], 32);
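/* Illustrative sketch (not from the original sources): recovering the IEEE
   bit pattern of a host 'double', which is the kind of value the code above
   assembles for DFmode constants.  A byte copy avoids aliasing problems; the
   function name and the use of plain host types are assumptions of this
   example (sizeof (double) == 8 is assumed).  */
static unsigned long long
example_double_to_bits (double d)
{
  unsigned long long bits = 0;
  __builtin_memcpy (&bits, &d, sizeof (bits));
  return bits;
}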
7595 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7596 single MOV(+MOVK) followed by an FMOV. */
7598 aarch64_float_const_rtx_p (rtx x
)
7600 machine_mode mode
= GET_MODE (x
);
7601 if (mode
== VOIDmode
)
7604 /* Determine whether it's cheaper to write float constants as
7605 mov/movk pairs over ldr/adrp pairs. */
7606 unsigned HOST_WIDE_INT ival
;
7608 if (GET_CODE (x
) == CONST_DOUBLE
7609 && SCALAR_FLOAT_MODE_P (mode
)
7610 && aarch64_reinterpret_float_as_int (x
, &ival
))
7612 scalar_int_mode imode
= (mode
== HFmode
7614 : int_mode_for_mode (mode
).require ());
7615 int num_instr
= aarch64_internal_mov_immediate
7616 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
7617 return num_instr
< 3;
7623 /* Return TRUE if rtx X is immediate constant 0.0 */
7625 aarch64_float_const_zero_rtx_p (rtx x
)
7627 if (GET_MODE (x
) == VOIDmode
)
7630 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x
)))
7631 return !HONOR_SIGNED_ZEROS (GET_MODE (x
));
7632 return real_equal (CONST_DOUBLE_REAL_VALUE (x
), &dconst0
);
7635 /* Return TRUE if rtx X is immediate constant that fits in a single
7636 MOVI immediate operation. */
7638 aarch64_can_const_movi_rtx_p (rtx x
, machine_mode mode
)
7644 scalar_int_mode imode
;
7645 unsigned HOST_WIDE_INT ival
;
7647 if (GET_CODE (x
) == CONST_DOUBLE
7648 && SCALAR_FLOAT_MODE_P (mode
))
7650 if (!aarch64_reinterpret_float_as_int (x
, &ival
))
7653 /* We make a general exception for 0. */
7654 if (aarch64_float_const_zero_rtx_p (x
))
7657 imode
= int_mode_for_mode (mode
).require ();
7659 else if (GET_CODE (x
) == CONST_INT
7660 && is_a
<scalar_int_mode
> (mode
, &imode
))
7665 /* use a 64 bit mode for everything except for DI/DF mode, where we use
7666 a 128 bit vector mode. */
7667 int width
= GET_MODE_BITSIZE (imode
) == 64 ? 128 : 64;
7669 vmode
= aarch64_simd_container_mode (imode
, width
);
7670 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, ival
);
7672 return aarch64_simd_valid_immediate (v_op
, NULL
);
7676 /* Return the fixed registers used for condition codes. */
7679 aarch64_fixed_condition_code_regs (unsigned int *p1
, unsigned int *p2
)
7682 *p2
= INVALID_REGNUM
;
/* This function is used by the call expanders of the machine description.
   RESULT is the register in which the result is returned.  It's NULL for
   "call" and "sibcall".
   MEM is the location of the function call.
   SIBCALL indicates whether this function call is a normal call or a sibling
   call.  It will generate a different pattern accordingly.  */
7694 aarch64_expand_call (rtx result
, rtx mem
, bool sibcall
)
7696 rtx call
, callee
, tmp
;
7700 gcc_assert (MEM_P (mem
));
7701 callee
= XEXP (mem
, 0);
7702 mode
= GET_MODE (callee
);
7703 gcc_assert (mode
== Pmode
);
7705 /* Decide if we should generate indirect calls by loading the
7706 address of the callee into a register before performing
7707 the branch-and-link. */
7708 if (SYMBOL_REF_P (callee
)
7709 ? (aarch64_is_long_call_p (callee
)
7710 || aarch64_is_noplt_call_p (callee
))
7712 XEXP (mem
, 0) = force_reg (mode
, callee
);
7714 call
= gen_rtx_CALL (VOIDmode
, mem
, const0_rtx
);
7716 if (result
!= NULL_RTX
)
7717 call
= gen_rtx_SET (result
, call
);
7722 tmp
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (Pmode
, LR_REGNUM
));
7724 vec
= gen_rtvec (2, call
, tmp
);
7725 call
= gen_rtx_PARALLEL (VOIDmode
, vec
);
7727 aarch64_emit_call_insn (call
);
7730 /* Emit call insn with PAT and do aarch64-specific handling. */
7733 aarch64_emit_call_insn (rtx pat
)
7735 rtx insn
= emit_call_insn (pat
);
7737 rtx
*fusage
= &CALL_INSN_FUNCTION_USAGE (insn
);
7738 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP0_REGNUM
));
7739 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP1_REGNUM
));
machine_mode
aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
{
  machine_mode mode_x = GET_MODE (x);
  rtx_code code_x = GET_CODE (x);

  /* All floating point compares return CCFP if it is an equality
     comparison, and CCFPE otherwise.  */
  if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
    {
      switch (code)
	{
	case EQ:
	case NE:
	case UNORDERED:
	case ORDERED:
	case UNLT:
	case UNLE:
	case UNGT:
	case UNGE:
	case UNEQ:
	  return CCFPmode;

	case LT:
	case LE:
	case GT:
	case GE:
	case LTGT:
	  return CCFPEmode;

	default:
	  gcc_unreachable ();
	}
    }

  /* Equality comparisons of short modes against zero can be performed
     using the TST instruction with the appropriate bitmask.  */
  if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
      && (code == EQ || code == NE)
      && (mode_x == HImode || mode_x == QImode))
    return CC_NZmode;

  /* Similarly, comparisons of zero_extends from shorter modes can
     be performed using an ANDS with an immediate mask.  */
  if (y == const0_rtx && code_x == ZERO_EXTEND
      && (mode_x == SImode || mode_x == DImode)
      && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
      && (code == EQ || code == NE))
    return CC_NZmode;

  if ((mode_x == SImode || mode_x == DImode)
      && y == const0_rtx
      && (code == EQ || code == NE || code == LT || code == GE)
      && (code_x == PLUS || code_x == MINUS || code_x == AND
	  || code_x == NEG
	  || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
	      && CONST_INT_P (XEXP (x, 2)))))
    return CC_NZmode;

  /* A compare with a shifted operand.  Because of canonicalization,
     the comparison will have to be swapped when we emit the assembly
     code.  */
  if ((mode_x == SImode || mode_x == DImode)
      && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
      && (code_x == ASHIFT || code_x == ASHIFTRT
	  || code_x == LSHIFTRT
	  || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
    return CC_SWPmode;

  /* Similarly for a negated operand, but we can only do this for
     equalities.  */
  if ((mode_x == SImode || mode_x == DImode)
      && (REG_P (y) || GET_CODE (y) == SUBREG)
      && (code == EQ || code == NE)
      && code_x == NEG)
    return CC_Zmode;

  /* A test for unsigned overflow from an addition.  */
  if ((mode_x == DImode || mode_x == TImode)
      && (code == LTU || code == GEU)
      && code_x == PLUS
      && rtx_equal_p (XEXP (x, 0), y))
    return CC_Cmode;

  /* A test for unsigned overflow from an add with carry.  */
  if ((mode_x == DImode || mode_x == TImode)
      && (code == LTU || code == GEU)
      && code_x == PLUS
      && CONST_SCALAR_INT_P (y)
      && (rtx_mode_t (y, mode_x)
	  == (wi::shwi (1, mode_x)
	      << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
    return CC_ADCmode;

  /* A test for signed overflow.  */
  if ((mode_x == DImode || mode_x == TImode)
      && code == NE
      && code_x == PLUS
      && GET_CODE (y) == SIGN_EXTEND)
    return CC_Vmode;

  /* For everything else, return CCmode.  */
  return CCmode;
}
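/* Worked example (illustrative, not from the original comments): comparing
   (plus x y) against zero with EQ in DImode matches the PLUS clause above
   and selects CC_NZmode, so the comparison can be emitted as ADDS followed
   by a B.EQ on the resulting N/Z flags.  */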
static int
aarch64_get_condition_code_1 (machine_mode, enum rtx_code);

int
aarch64_get_condition_code (rtx x)
{
  machine_mode mode = GET_MODE (XEXP (x, 0));
  enum rtx_code comp_code = GET_CODE (x);

  if (GET_MODE_CLASS (mode) != MODE_CC)
    mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
  return aarch64_get_condition_code_1 (mode, comp_code);
}
static int
aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
{
  switch (mode)
    {
    case E_CCFPmode:
    case E_CCFPEmode:
      switch (comp_code)
	{
	case GE: return AARCH64_GE;
	case GT: return AARCH64_GT;
	case LE: return AARCH64_LS;
	case LT: return AARCH64_MI;
	case NE: return AARCH64_NE;
	case EQ: return AARCH64_EQ;
	case ORDERED: return AARCH64_VC;
	case UNORDERED: return AARCH64_VS;
	case UNLT: return AARCH64_LT;
	case UNLE: return AARCH64_LE;
	case UNGT: return AARCH64_HI;
	case UNGE: return AARCH64_PL;
	default: return -1;
	}
      break;

    case E_CCmode:
      switch (comp_code)
	{
	case NE: return AARCH64_NE;
	case EQ: return AARCH64_EQ;
	case GE: return AARCH64_GE;
	case GT: return AARCH64_GT;
	case LE: return AARCH64_LE;
	case LT: return AARCH64_LT;
	case GEU: return AARCH64_CS;
	case GTU: return AARCH64_HI;
	case LEU: return AARCH64_LS;
	case LTU: return AARCH64_CC;
	default: return -1;
	}
      break;

    case E_CC_SWPmode:
      switch (comp_code)
	{
	case NE: return AARCH64_NE;
	case EQ: return AARCH64_EQ;
	case GE: return AARCH64_LE;
	case GT: return AARCH64_LT;
	case LE: return AARCH64_GE;
	case LT: return AARCH64_GT;
	case GEU: return AARCH64_LS;
	case GTU: return AARCH64_CC;
	case LEU: return AARCH64_CS;
	case LTU: return AARCH64_HI;
	default: return -1;
	}
      break;

    case E_CC_NZCmode:
      switch (comp_code)
	{
	case NE: return AARCH64_NE; /* = any   */
	case EQ: return AARCH64_EQ; /* = none  */
	case GE: return AARCH64_PL; /* = nfrst */
	case LT: return AARCH64_MI; /* = first */
	case GEU: return AARCH64_CS; /* = nlast */
	case GTU: return AARCH64_HI; /* = pmore */
	case LEU: return AARCH64_LS; /* = plast */
	case LTU: return AARCH64_CC; /* = last  */
	default: return -1;
	}
      break;

    case E_CC_NZmode:
      switch (comp_code)
	{
	case NE: return AARCH64_NE;
	case EQ: return AARCH64_EQ;
	case GE: return AARCH64_PL;
	case LT: return AARCH64_MI;
	default: return -1;
	}
      break;

    case E_CC_Zmode:
      switch (comp_code)
	{
	case NE: return AARCH64_NE;
	case EQ: return AARCH64_EQ;
	default: return -1;
	}
      break;

    case E_CC_Cmode:
      switch (comp_code)
	{
	case LTU: return AARCH64_CS;
	case GEU: return AARCH64_CC;
	default: return -1;
	}
      break;

    case E_CC_ADCmode:
      switch (comp_code)
	{
	case GEU: return AARCH64_CS;
	case LTU: return AARCH64_CC;
	default: return -1;
	}
      break;

    case E_CC_Vmode:
      switch (comp_code)
	{
	case NE: return AARCH64_VS;
	case EQ: return AARCH64_VC;
	default: return -1;
	}
      break;

    default:
      return -1;
    }
}
static bool
aarch64_const_vec_all_same_in_range_p (rtx x,
				       HOST_WIDE_INT minval,
				       HOST_WIDE_INT maxval)
{
  rtx elt;
  return (const_vec_duplicate_p (x, &elt)
	  && CONST_INT_P (elt)
	  && IN_RANGE (INTVAL (elt), minval, maxval));
}

bool
aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
{
  return aarch64_const_vec_all_same_in_range_p (x, val, val);
}
/* Return true if VEC is a constant in which every element is in the range
   [MINVAL, MAXVAL].  The elements do not need to have the same value.  */

static bool
aarch64_const_vec_all_in_range_p (rtx vec,
				  HOST_WIDE_INT minval,
				  HOST_WIDE_INT maxval)
{
  if (GET_CODE (vec) != CONST_VECTOR
      || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
    return false;

  int nunits;
  if (!CONST_VECTOR_STEPPED_P (vec))
    nunits = const_vector_encoded_nelts (vec);
  else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
    return false;

  for (int i = 0; i < nunits; i++)
    {
      rtx vec_elem = CONST_VECTOR_ELT (vec, i);
      if (!CONST_INT_P (vec_elem)
	  || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
	return false;
    }
  return true;
}
#define AARCH64_CC_V 1
#define AARCH64_CC_C (1 << 1)
#define AARCH64_CC_Z (1 << 2)
#define AARCH64_CC_N (1 << 3)

/* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
static const int aarch64_nzcv_codes[] =
{
  0,		/* EQ, Z == 1.  */
  AARCH64_CC_Z,	/* NE, Z == 0.  */
  0,		/* CS, C == 1.  */
  AARCH64_CC_C,	/* CC, C == 0.  */
  0,		/* MI, N == 1.  */
  AARCH64_CC_N,	/* PL, N == 0.  */
  0,		/* VS, V == 1.  */
  AARCH64_CC_V,	/* VC, V == 0.  */
  0,		/* HI, C == 1 && Z == 0.  */
  AARCH64_CC_C,	/* LS, !(C == 1 && Z == 0).  */
  AARCH64_CC_V,	/* GE, N == V.  */
  0,		/* LT, N != V.  */
  AARCH64_CC_Z,	/* GT, Z == 0 && N == V.  */
  0,		/* LE, !(Z == 0 && N == V).  */
  0,		/* AL, Any.  */
  0		/* NV, Any.  */
};
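/* Illustrative example (not from the original comments): in a conjunction,
   aarch64_nzcv_codes[AARCH64_GE] is AARCH64_CC_V, i.e. only the V bit.  If a
   CCMP's own condition fails it loads NZCV = 0b0001, giving N != V, so a
   following GE test is guaranteed to fail as well.  */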
/* Print floating-point vector immediate operand X to F, negating it
   first if NEGATE is true.  Return true on success, false if it isn't
   a constant we can handle.  */

static bool
aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
{
  rtx elt;

  if (!const_vec_duplicate_p (x, &elt))
    return false;

  REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
  if (negate)
    r = real_value_negate (&r);

  /* We only handle the SVE single-bit immediates here.  */
  if (real_equal (&r, &dconst0))
    asm_fprintf (f, "0.0");
  else if (real_equal (&r, &dconst1))
    asm_fprintf (f, "1.0");
  else if (real_equal (&r, &dconsthalf))
    asm_fprintf (f, "0.5");
  else
    return false;

  return true;
}
/* Return the equivalent letter for size.  */
static char
sizetochar (int size)
{
  switch (size)
    {
    case 64: return 'd';
    case 32: return 's';
    case 16: return 'h';
    case 8 : return 'b';
    default: gcc_unreachable ();
    }
}
/* Print operand X to file F in a target specific manner according to CODE.
   The acceptable formatting commands given by CODE are:
     'c':	An integer or symbol address without a preceding #
		sign.
     'C':	Take the duplicated element in a vector constant
		and print it in hex.
     'D':	Take the duplicated element in a vector constant
		and print it as an unsigned integer, in decimal.
     'e':	Print the sign/zero-extend size as a character 8->b,
		16->h, 32->w.
     'p':	Prints N such that 2^N == X (X must be power of 2 and
		const int).
     'P':	Print the number of non-zero bits in X (a const_int).
     'H':	Print the higher numbered register of a pair (TImode)
		of regs.
     'm':	Print a condition (eq, ne, etc).
     'M':	Same as 'm', but invert condition.
     'N':	Take the duplicated element in a vector constant
		and print the negative of it in decimal.
     'b/h/s/d/q': Print a scalar FP/SIMD register name.
     'S/T/U/V':	Print a FP/SIMD register name for a register list.
		The register printed is the FP/SIMD register name
		of X + 0/1/2/3 for S/T/U/V.
     'R':	Print a scalar FP/SIMD register name + 1.
     'X':	Print bottom 16 bits of integer constant in hex.
     'w/x':	Print a general register name or the zero register
		(32-bit or 64-bit).
     '0':	Print a normal operand, if it's a general register,
		then we assume DImode.
     'k':	Print NZCV for conditional compare instructions.
     'A':	Output address constant representing the first
		argument of X, specifying a relocation offset
		if appropriate.
     'L':	Output constant address specified by X
		with a relocation offset if appropriate.
     'G':	Prints address of X, specifying a PC relative
		relocation mode if appropriate.
     'y':	Output address of LDP or STP - this is used for
		some LDP/STPs which don't use a PARALLEL in their
		pattern (so the mode needs to be adjusted).
     'z':	Output address of a typical LDP or STP.  */
static void
aarch64_print_operand (FILE *f, rtx x, int code)
{
  switch (code)
    {
    case 'c':
      switch (GET_CODE (x))
	{
	case CONST_INT:
	  fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
	  break;

	case SYMBOL_REF:
	  output_addr_const (f, x);
	  break;

	case CONST:
	  if (GET_CODE (XEXP (x, 0)) == PLUS
	      && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
	    {
	      output_addr_const (f, x);
	      break;
	    }
	  /* Fall through.  */

	default:
	  output_operand_lossage ("unsupported operand for code '%c'", code);
	}
      break;

    case 'e':
      {
	int n;

	if (!CONST_INT_P (x)
	    || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
	  {
	    output_operand_lossage ("invalid operand for '%%%c'", code);
	    return;
	  }

	switch (n)
	  {
	  case 3:
	    fputc ('b', f);
	    break;
	  case 4:
	    fputc ('h', f);
	    break;
	  case 5:
	    fputc ('w', f);
	    break;
	  default:
	    output_operand_lossage ("invalid operand for '%%%c'", code);
	    return;
	  }
      }
      break;

    case 'p':
      {
	int n;

	if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
	  {
	    output_operand_lossage ("invalid operand for '%%%c'", code);
	    return;
	  }

	asm_fprintf (f, "%d", n);
      }
      break;

    case 'P':
      if (!CONST_INT_P (x))
	{
	  output_operand_lossage ("invalid operand for '%%%c'", code);
	  return;
	}

      asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
      break;

    case 'H':
      if (x == const0_rtx)
	{
	  asm_fprintf (f, "xzr");
	  break;
	}

      if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
	{
	  output_operand_lossage ("invalid operand for '%%%c'", code);
	  return;
	}

      asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
      break;

    case 'M':
    case 'm':
      {
	int cond_code;

	/* CONST_TRUE_RTX means al/nv (al is the default, don't print it).  */
	if (x == const_true_rtx)
	  {
	    if (code == 'M')
	      fputs ("nv", f);
	    return;
	  }

	if (!COMPARISON_P (x))
	  {
	    output_operand_lossage ("invalid operand for '%%%c'", code);
	    return;
	  }

	cond_code = aarch64_get_condition_code (x);
	gcc_assert (cond_code >= 0);
	if (code == 'M')
	  cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
	if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
	  fputs (aarch64_sve_condition_codes[cond_code], f);
	else
	  fputs (aarch64_condition_codes[cond_code], f);
      }
      break;

    case 'N':
      {
	rtx elt;

	if (!const_vec_duplicate_p (x, &elt))
	  {
	    output_operand_lossage ("invalid vector constant");
	    return;
	  }

	if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
	  asm_fprintf (f, "%wd", -INTVAL (elt));
	else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
		 && aarch64_print_vector_float_operand (f, x, true))
	  ;
	else
	  {
	    output_operand_lossage ("invalid vector constant");
	    return;
	  }
      }
      break;

    case 'b':
    case 'h':
    case 's':
    case 'd':
    case 'q':
      if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
	{
	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
	  return;
	}
      asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
      break;

    case 'S':
    case 'T':
    case 'U':
    case 'V':
      if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
	{
	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
	  return;
	}
      asm_fprintf (f, "%c%d",
		   aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
		   REGNO (x) - V0_REGNUM + (code - 'S'));
      break;

    case 'R':
      if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
	{
	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
	  return;
	}
      asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
      break;

    case 'X':
      if (!CONST_INT_P (x))
	{
	  output_operand_lossage ("invalid operand for '%%%c'", code);
	  return;
	}
      asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
      break;

    case 'C':
      {
	/* Print a replicated constant in hex.  */
	rtx elt;
	if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
	  {
	    output_operand_lossage ("invalid operand for '%%%c'", code);
	    return;
	  }
	scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
	asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
      }
      break;

    case 'D':
      {
	/* Print a replicated constant in decimal, treating it as
	   unsigned.  */
	rtx elt;
	if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
	  {
	    output_operand_lossage ("invalid operand for '%%%c'", code);
	    return;
	  }
	scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
	asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
      }
      break;

    case 'w':
    case 'x':
      if (x == const0_rtx
	  || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
	{
	  asm_fprintf (f, "%czr", code);
	  break;
	}

      if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
	{
	  asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
	  break;
	}

      if (REG_P (x) && REGNO (x) == SP_REGNUM)
	{
	  asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
	  break;
	}

      /* Fall through */

    case 0:
      if (x == NULL)
	{
	  output_operand_lossage ("missing operand");
	  return;
	}

      switch (GET_CODE (x))
	{
	case REG:
	  if (aarch64_sve_data_mode_p (GET_MODE (x)))
	    {
	      if (REG_NREGS (x) == 1)
		asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
	      else
		{
		  char suffix
		    = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
		  asm_fprintf (f, "{z%d.%c - z%d.%c}",
			       REGNO (x) - V0_REGNUM, suffix,
			       END_REGNO (x) - V0_REGNUM - 1, suffix);
		}
	    }
	  else
	    asm_fprintf (f, "%s", reg_names [REGNO (x)]);
	  break;

	case MEM:
	  output_address (GET_MODE (x), XEXP (x, 0));
	  break;

	case LABEL_REF:
	case SYMBOL_REF:
	  output_addr_const (asm_out_file, x);
	  break;

	case CONST_INT:
	  asm_fprintf (f, "%wd", INTVAL (x));
	  break;

	case CONST:
	  if (!VECTOR_MODE_P (GET_MODE (x)))
	    {
	      output_addr_const (asm_out_file, x);
	      break;
	    }
	  /* fall through */

	case CONST_VECTOR:
	  {
	    rtx elt;

	    if (!const_vec_duplicate_p (x, &elt))
	      {
		output_operand_lossage ("invalid vector constant");
		return;
	      }

	    if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
	      asm_fprintf (f, "%wd", INTVAL (elt));
	    else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
		     && aarch64_print_vector_float_operand (f, x, false))
	      ;
	    else
	      {
		output_operand_lossage ("invalid vector constant");
		return;
	      }
	  }
	  break;

	case CONST_DOUBLE:
	  /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
	     be getting CONST_DOUBLEs holding integers.  */
	  gcc_assert (GET_MODE (x) != VOIDmode);
	  if (aarch64_float_const_zero_rtx_p (x))
	    {
	      fputc ('0', f);
	      break;
	    }
	  else if (aarch64_float_const_representable_p (x))
	    {
#define buf_size 20
	      char float_buf[buf_size] = {'\0'};
	      real_to_decimal_for_mode (float_buf,
					CONST_DOUBLE_REAL_VALUE (x),
					buf_size, buf_size,
					1, GET_MODE (x));
#undef buf_size
	      asm_fprintf (asm_out_file, "%s", float_buf);
	      break;
	    }

	  output_operand_lossage ("invalid constant");
	  return;
	default:
	  output_operand_lossage ("invalid operand");
	  return;
	}
      break;

    case 'A':
      if (GET_CODE (x) == HIGH)
	x = XEXP (x, 0);

      switch (aarch64_classify_symbolic_expression (x))
	{
	case SYMBOL_SMALL_GOT_4G:
	  asm_fprintf (asm_out_file, ":got:");
	  break;

	case SYMBOL_SMALL_TLSGD:
	  asm_fprintf (asm_out_file, ":tlsgd:");
	  break;

	case SYMBOL_SMALL_TLSDESC:
	  asm_fprintf (asm_out_file, ":tlsdesc:");
	  break;

	case SYMBOL_SMALL_TLSIE:
	  asm_fprintf (asm_out_file, ":gottprel:");
	  break;

	case SYMBOL_TLSLE24:
	  asm_fprintf (asm_out_file, ":tprel:");
	  break;

	case SYMBOL_TINY_GOT:
	  gcc_unreachable ();
	  break;

	default:
	  break;
	}
      output_addr_const (asm_out_file, x);
      break;

    case 'L':
      switch (aarch64_classify_symbolic_expression (x))
	{
	case SYMBOL_SMALL_GOT_4G:
	  asm_fprintf (asm_out_file, ":lo12:");
	  break;

	case SYMBOL_SMALL_TLSGD:
	  asm_fprintf (asm_out_file, ":tlsgd_lo12:");
	  break;

	case SYMBOL_SMALL_TLSDESC:
	  asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
	  break;

	case SYMBOL_SMALL_TLSIE:
	  asm_fprintf (asm_out_file, ":gottprel_lo12:");
	  break;

	case SYMBOL_TLSLE12:
	  asm_fprintf (asm_out_file, ":tprel_lo12:");
	  break;

	case SYMBOL_TLSLE24:
	  asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
	  break;

	case SYMBOL_TINY_GOT:
	  asm_fprintf (asm_out_file, ":got:");
	  break;

	case SYMBOL_TINY_TLSIE:
	  asm_fprintf (asm_out_file, ":gottprel:");
	  break;

	default:
	  break;
	}
      output_addr_const (asm_out_file, x);
      break;

    case 'G':
      switch (aarch64_classify_symbolic_expression (x))
	{
	case SYMBOL_TLSLE24:
	  asm_fprintf (asm_out_file, ":tprel_hi12:");
	  break;
	default:
	  break;
	}
      output_addr_const (asm_out_file, x);
      break;

    case 'k':
      {
	HOST_WIDE_INT cond_code;

	if (!CONST_INT_P (x))
	  {
	    output_operand_lossage ("invalid operand for '%%%c'", code);
	    return;
	  }

	cond_code = INTVAL (x);
	gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
	asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
      }
      break;

    case 'y':
    case 'z':
      {
	machine_mode mode = GET_MODE (x);

	if (GET_CODE (x) != MEM
	    || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
	  {
	    output_operand_lossage ("invalid operand for '%%%c'", code);
	    return;
	  }

	if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
					     code == 'y'
					     ? ADDR_QUERY_LDP_STP_N
					     : ADDR_QUERY_LDP_STP))
	  output_operand_lossage ("invalid operand prefix '%%%c'", code);
      }
      break;

    default:
      output_operand_lossage ("invalid operand prefix '%%%c'", code);
      return;
    }
}
/* Print address 'x' of a memory access with mode 'mode'.
   'op' is the context required by aarch64_classify_address.  It can either be
   MEM for a normal memory access or PARALLEL for LDP/STP.  */
static bool
aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
				aarch64_addr_query_type type)
{
  struct aarch64_address_info addr;
  unsigned int size;

  /* Check all addresses are Pmode - including ILP32.  */
  if (GET_MODE (x) != Pmode
      && (!CONST_INT_P (x)
	  || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
    {
      output_operand_lossage ("invalid address mode");
      return false;
    }

  if (aarch64_classify_address (&addr, x, mode, true, type))
    switch (addr.type)
      {
      case ADDRESS_REG_IMM:
	if (known_eq (addr.const_offset, 0))
	  asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
	else if (aarch64_sve_data_mode_p (mode))
	  {
	    HOST_WIDE_INT vnum
	      = exact_div (addr.const_offset,
			   BYTES_PER_SVE_VECTOR).to_constant ();
	    asm_fprintf (f, "[%s, #%wd, mul vl]",
			 reg_names[REGNO (addr.base)], vnum);
	  }
	else if (aarch64_sve_pred_mode_p (mode))
	  {
	    HOST_WIDE_INT vnum
	      = exact_div (addr.const_offset,
			   BYTES_PER_SVE_PRED).to_constant ();
	    asm_fprintf (f, "[%s, #%wd, mul vl]",
			 reg_names[REGNO (addr.base)], vnum);
	  }
	else
	  asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
		       INTVAL (addr.offset));
	return true;

      case ADDRESS_REG_REG:
	if (addr.shift == 0)
	  asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
		       reg_names [REGNO (addr.offset)]);
	else
	  asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
		       reg_names [REGNO (addr.offset)], addr.shift);
	return true;

      case ADDRESS_REG_UXTW:
	if (addr.shift == 0)
	  asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
		       REGNO (addr.offset) - R0_REGNUM);
	else
	  asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
		       REGNO (addr.offset) - R0_REGNUM, addr.shift);
	return true;

      case ADDRESS_REG_SXTW:
	if (addr.shift == 0)
	  asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
		       REGNO (addr.offset) - R0_REGNUM);
	else
	  asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
		       REGNO (addr.offset) - R0_REGNUM, addr.shift);
	return true;

      case ADDRESS_REG_WB:
	/* Writeback is only supported for fixed-width modes.  */
	size = GET_MODE_SIZE (mode).to_constant ();
	switch (GET_CODE (x))
	  {
	  case PRE_INC:
	    asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
	    return true;
	  case POST_INC:
	    asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
	    return true;
	  case PRE_DEC:
	    asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
	    return true;
	  case POST_DEC:
	    asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
	    return true;
	  case PRE_MODIFY:
	    asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
			 INTVAL (addr.offset));
	    return true;
	  case POST_MODIFY:
	    asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
			 INTVAL (addr.offset));
	    return true;
	  default:
	    break;
	  }
	break;

      case ADDRESS_LO_SUM:
	asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
	output_addr_const (f, addr.offset);
	asm_fprintf (f, "]");
	return true;

      case ADDRESS_SYMBOLIC:
	output_addr_const (f, x);
	return true;
      }

  return false;
}
/* Print address 'x' of a memory access with mode 'mode'.  */
static void
aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
{
  if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
    output_addr_const (f, x);
}
bool
aarch64_label_mentioned_p (rtx x)
{
  const char *fmt;
  int i;

  if (GET_CODE (x) == LABEL_REF)
    return true;

  /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
     referencing instruction, but they are constant offsets, not
     symbols.  */
  if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
    return false;

  fmt = GET_RTX_FORMAT (GET_CODE (x));
  for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
    {
      if (fmt[i] == 'E')
	{
	  int j;

	  for (j = XVECLEN (x, i) - 1; j >= 0; j--)
	    if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
	      return true;
	}
      else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
	return true;
    }

  return false;
}
/* Implement REGNO_REG_CLASS.  */

enum reg_class
aarch64_regno_regclass (unsigned regno)
{
  if (GP_REGNUM_P (regno))
    return GENERAL_REGS;

  if (regno == SP_REGNUM)
    return STACK_REG;

  if (regno == FRAME_POINTER_REGNUM
      || regno == ARG_POINTER_REGNUM)
    return POINTER_REGS;

  if (FP_REGNUM_P (regno))
    return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
	    : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);

  if (PR_REGNUM_P (regno))
    return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;

  return NO_REGS;
}
/* OFFSET is an address offset for mode MODE, which has SIZE bytes.
   If OFFSET is out of range, return an offset of an anchor point
   that is in range.  Return 0 otherwise.  */

static HOST_WIDE_INT
aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
		       machine_mode mode)
{
  /* Does it look like we'll need a 16-byte load/store-pair operation?  */
  if (size > 16)
    return (offset + 0x400) & ~0x7f0;

  /* For offsets that aren't a multiple of the access size, the limit is
     -256...255.  */
  if (offset & (size - 1))
    {
      /* BLKmode typically uses LDP of X-registers.  */
      if (mode == BLKmode)
	return (offset + 512) & ~0x3ff;
      return (offset + 0x100) & ~0x1ff;
    }

  /* Small negative offsets are supported.  */
  if (IN_RANGE (offset, -256, 0))
    return 0;

  if (mode == TImode || mode == TFmode)
    return (offset + 0x100) & ~0x1ff;

  /* Use 12-bit offset by access size.  */
  return offset & (~0xfff * size);
}
static rtx
aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
{
  /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
     where mask is selected by alignment and size of the offset.
     We try to pick as large a range for the offset as possible to
     maximize the chance of a CSE.  However, for aligned addresses
     we limit the range to 4k so that structures with different sized
     elements are likely to use the same base.  We need to be careful
     not to split a CONST for some forms of address expression, otherwise
     it will generate sub-optimal code.  */

  if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
    {
      rtx base = XEXP (x, 0);
      rtx offset_rtx = XEXP (x, 1);
      HOST_WIDE_INT offset = INTVAL (offset_rtx);

      if (GET_CODE (base) == PLUS)
	{
	  rtx op0 = XEXP (base, 0);
	  rtx op1 = XEXP (base, 1);

	  /* Force any scaling into a temp for CSE.  */
	  op0 = force_reg (Pmode, op0);
	  op1 = force_reg (Pmode, op1);

	  /* Let the pointer register be in op0.  */
	  if (REG_POINTER (op1))
	    std::swap (op0, op1);

	  /* If the pointer is virtual or frame related, then we know that
	     virtual register instantiation or register elimination is going
	     to apply a second constant.  We want the two constants folded
	     together easily.  Therefore, emit as (OP0 + CONST) + OP1.  */
	  if (virt_or_elim_regno_p (REGNO (op0)))
	    {
	      base = expand_binop (Pmode, add_optab, op0, offset_rtx,
				   NULL_RTX, true, OPTAB_DIRECT);
	      return gen_rtx_PLUS (Pmode, base, op1);
	    }

	  /* Otherwise, in order to encourage CSE (and thence loop strength
	     reduce) scaled addresses, emit as (OP0 + OP1) + CONST.  */
	  base = expand_binop (Pmode, add_optab, op0, op1,
			       NULL_RTX, true, OPTAB_DIRECT);
	  x = gen_rtx_PLUS (Pmode, base, offset_rtx);
	}

      HOST_WIDE_INT size;
      if (GET_MODE_SIZE (mode).is_constant (&size))
	{
	  HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
							     mode);
	  if (base_offset != 0)
	    {
	      base = plus_constant (Pmode, base, base_offset);
	      base = force_operand (base, NULL_RTX);
	      return plus_constant (Pmode, base, offset - base_offset);
	    }
	}
    }

  return x;
}
static reg_class_t
aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
			  reg_class_t rclass,
			  machine_mode mode,
			  secondary_reload_info *sri)
{
  /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
     directly by the *aarch64_sve_mov<mode>_be move pattern.  See the
     comment at the head of aarch64-sve.md for more details about the
     big-endian handling.  */
  if (BYTES_BIG_ENDIAN
      && reg_class_subset_p (rclass, FP_REGS)
      && !((REG_P (x) && HARD_REGISTER_P (x))
	   || aarch64_simd_valid_immediate (x, NULL))
      && aarch64_sve_data_mode_p (mode))
    {
      sri->icode = CODE_FOR_aarch64_sve_reload_be;
      return NO_REGS;
    }

  /* If we have to disable direct literal pool loads and stores because the
     function is too big, then we need a scratch register.  */
  if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
      && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
	  || targetm.vector_mode_supported_p (GET_MODE (x)))
      && !aarch64_pcrelative_literal_loads)
    {
      sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
      return NO_REGS;
    }

  /* Without the TARGET_SIMD instructions we cannot move a Q register
     to a Q register directly.  We need a scratch.  */
  if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
      && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
      && reg_class_subset_p (rclass, FP_REGS))
    {
      sri->icode = code_for_aarch64_reload_mov (mode);
      return NO_REGS;
    }

  /* A TFmode or TImode memory access should be handled via an FP_REGS
     because AArch64 has richer addressing modes for LDR/STR instructions
     than LDP/STP instructions.  */
  if (TARGET_FLOAT && rclass == GENERAL_REGS
      && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
    return FP_REGS;

  if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P (x))
    return GENERAL_REGS;

  return NO_REGS;
}
static bool
aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
{
  gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);

  /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
     can only eliminate to HARD_FRAME_POINTER_REGNUM.  */
  if (frame_pointer_needed)
    return to == HARD_FRAME_POINTER_REGNUM;
  return true;
}
poly_int64
aarch64_initial_elimination_offset (unsigned from, unsigned to)
{
  if (to == HARD_FRAME_POINTER_REGNUM)
    {
      if (from == ARG_POINTER_REGNUM)
	return cfun->machine->frame.hard_fp_offset;

      if (from == FRAME_POINTER_REGNUM)
	return cfun->machine->frame.hard_fp_offset
	       - cfun->machine->frame.locals_offset;
    }

  if (to == STACK_POINTER_REGNUM)
    {
      if (from == FRAME_POINTER_REGNUM)
	return cfun->machine->frame.frame_size
	       - cfun->machine->frame.locals_offset;
    }

  return cfun->machine->frame.frame_size;
}
/* Implement RETURN_ADDR_RTX.  We do not support moving back to a
   previous frame.  */

rtx
aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
{
  if (count != 0)
    return const0_rtx;
  return get_hard_reg_initial_val (Pmode, LR_REGNUM);
}
static void
aarch64_asm_trampoline_template (FILE *f)
{
  int offset1 = 16;
  int offset2 = 20;

  if (aarch64_bti_enabled ())
    {
      asm_fprintf (f, "\thint\t34 // bti c\n");
      offset1 -= 4;
      offset2 -= 4;
    }

  if (TARGET_ILP32)
    {
      asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
      asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
		   offset2 - 4);
    }
  else
    {
      asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
      asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
		   offset2);
    }
  asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);

  /* The trampoline needs an extra padding instruction.  If BTI is
     enabled the padding instruction is replaced by the BTI instruction at
     the beginning.  */
  if (!aarch64_bti_enabled ())
    assemble_aligned_integer (4, const0_rtx);

  assemble_aligned_integer (POINTER_BYTES, const0_rtx);
  assemble_aligned_integer (POINTER_BYTES, const0_rtx);
}
static void
aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
{
  rtx fnaddr, mem, a_tramp;
  const int tramp_code_sz = 16;

  /* Don't need to copy the trailing D-words, we fill those in below.  */
  emit_block_move (m_tramp, assemble_trampoline_template (),
		   GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
  mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
  fnaddr = XEXP (DECL_RTL (fndecl), 0);
  if (GET_MODE (fnaddr) != ptr_mode)
    fnaddr = convert_memory_address (ptr_mode, fnaddr);
  emit_move_insn (mem, fnaddr);

  mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
  emit_move_insn (mem, chain_value);

  /* XXX We should really define a "clear_cache" pattern and use
     gen_clear_cache().  */
  a_tramp = XEXP (m_tramp, 0);
  emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
		     LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
		     plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
		     ptr_mode);
}
static unsigned char
aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
{
  /* ??? Logically we should only need to provide a value when
     HARD_REGNO_MODE_OK says that at least one register in REGCLASS
     can hold MODE, but at the moment we need to handle all modes.
     Just ignore any runtime parts for registers that can't store them.  */
  HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
  unsigned int nregs;
  switch (regclass)
    {
    case TAILCALL_ADDR_REGS:
    case POINTER_REGS:
    case GENERAL_REGS:
    case ALL_REGS:
    case POINTER_AND_FP_REGS:
    case FP_REGS:
    case FP_LO_REGS:
    case FP_LO8_REGS:
      if (aarch64_sve_data_mode_p (mode)
	  && constant_multiple_p (GET_MODE_SIZE (mode),
				  BYTES_PER_SVE_VECTOR, &nregs))
	return nregs;
      return (aarch64_vector_data_mode_p (mode)
	      ? CEIL (lowest_size, UNITS_PER_VREG)
	      : CEIL (lowest_size, UNITS_PER_WORD));
    case STACK_REG:
    case PR_REGS:
    case PR_LO_REGS:
    case PR_HI_REGS:
      return 1;

    case NO_REGS:
      return 0;

    default:
      break;
    }
  gcc_unreachable ();
}
static reg_class_t
aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
{
  if (regclass == POINTER_REGS)
    return GENERAL_REGS;

  if (regclass == STACK_REG)
    {
      if (REG_P (x)
	  && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
	return regclass;

      return NO_REGS;
    }

  /* Register elimination can result in a request for
     SP+constant->FP_REGS.  We cannot support such operations which
     use SP as source and an FP_REG as destination, so reject out
     right now.  */
  if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
    {
      rtx lhs = XEXP (x, 0);

      /* Look through a possible SUBREG introduced by ILP32.  */
      if (GET_CODE (lhs) == SUBREG)
	lhs = SUBREG_REG (lhs);

      gcc_assert (REG_P (lhs));
      gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
				      POINTER_REGS));
      return NO_REGS;
    }

  return regclass;
}
void
aarch64_asm_output_labelref (FILE* f, const char *name)
{
  asm_fprintf (f, "%U%s", name);
}
static void
aarch64_elf_asm_constructor (rtx symbol, int priority)
{
  if (priority == DEFAULT_INIT_PRIORITY)
    default_ctor_section_asm_out_constructor (symbol, priority);
  else
    {
      section *s;
      /* While priority is known to be in range [0, 65535], so 18 bytes
	 would be enough, the compiler might not know that.  To avoid
	 -Wformat-truncation false positive, use a larger size.  */
      char buf[23];
      snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
      s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
      switch_to_section (s);
      assemble_align (POINTER_SIZE);
      assemble_aligned_integer (POINTER_BYTES, symbol);
    }
}
static void
aarch64_elf_asm_destructor (rtx symbol, int priority)
{
  if (priority == DEFAULT_INIT_PRIORITY)
    default_dtor_section_asm_out_destructor (symbol, priority);
  else
    {
      section *s;
      /* While priority is known to be in range [0, 65535], so 18 bytes
	 would be enough, the compiler might not know that.  To avoid
	 -Wformat-truncation false positive, use a larger size.  */
      char buf[23];
      snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
      s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
      switch_to_section (s);
      assemble_align (POINTER_SIZE);
      assemble_aligned_integer (POINTER_BYTES, symbol);
    }
}
const char *
aarch64_output_casesi (rtx *operands)
{
  char buf[100];
  char label[100];
  rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
  int index;
  static const char *const patterns[4][2] =
  {
    {
      "ldrb\t%w3, [%0,%w1,uxtw]",
      "add\t%3, %4, %w3, sxtb #2"
    },
    {
      "ldrh\t%w3, [%0,%w1,uxtw #1]",
      "add\t%3, %4, %w3, sxth #2"
    },
    {
      "ldr\t%w3, [%0,%w1,uxtw #2]",
      "add\t%3, %4, %w3, sxtw #2"
    },
    /* We assume that DImode is only generated when not optimizing and
       that we don't really need 64-bit address offsets.  That would
       imply an object file with 8GB of code in a single function!  */
    {
      "ldr\t%w3, [%0,%w1,uxtw #2]",
      "add\t%3, %4, %w3, sxtw #2"
    }
  };

  gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);

  scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
  index = exact_log2 (GET_MODE_SIZE (mode));

  gcc_assert (index >= 0 && index <= 3);

  /* Need to implement table size reduction, by changing the code below.  */
  output_asm_insn (patterns[index][0], operands);
  ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
  snprintf (buf, sizeof (buf),
	    "adr\t%%4, %s", targetm.strip_name_encoding (label));
  output_asm_insn (buf, operands);
  output_asm_insn (patterns[index][1], operands);
  output_asm_insn ("br\t%3", operands);
  assemble_label (asm_out_file, label);
  return "";
}
/* Return size in bits of an arithmetic operand which is shifted/scaled and
   masked such that it is suitable for a UXTB, UXTH, or UXTW extend
   operator.  */
int
aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
{
  if (shift >= 0 && shift <= 3)
    {
      int size;
      for (size = 8; size <= 32; size *= 2)
	{
	  HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
	  if (mask == bits << shift)
	    return size;
	}
    }
  return 0;
}
/* Constant pools are per function only when PC relative
   literal loads are true or we are in the large memory
   model.  */

static inline bool
aarch64_can_use_per_function_literal_pools_p (void)
{
  return (aarch64_pcrelative_literal_loads
	  || aarch64_cmodel == AARCH64_CMODEL_LARGE);
}

static bool
aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
{
  /* We can't use blocks for constants when we're using a per-function
     literal pool.  */
  return !aarch64_can_use_per_function_literal_pools_p ();
}
/* Select appropriate section for constants depending
   on where we place literal pools.  */

static section *
aarch64_select_rtx_section (machine_mode mode,
			    rtx x,
			    unsigned HOST_WIDE_INT align)
{
  if (aarch64_can_use_per_function_literal_pools_p ())
    return function_section (current_function_decl);

  return default_elf_select_rtx_section (mode, x, align);
}
/* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
void
aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
				  HOST_WIDE_INT offset)
{
  /* When using per-function literal pools, we must ensure that any code
     section is aligned to the minimal instruction length, lest we get
     errors from the assembler re "unaligned instructions".  */
  if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
    ASM_OUTPUT_ALIGN (f, 2);
}
/* Helper function for rtx cost calculation.  Strip a shift expression
   from X.  Returns the inner operand if successful, or the original
   expression on failure.  */
static rtx
aarch64_strip_shift (rtx x)
{
  rtx op = x;

  /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
     we can convert both to ROR during final output.  */
  if ((GET_CODE (op) == ASHIFT
       || GET_CODE (op) == ASHIFTRT
       || GET_CODE (op) == LSHIFTRT
       || GET_CODE (op) == ROTATERT
       || GET_CODE (op) == ROTATE)
      && CONST_INT_P (XEXP (op, 1)))
    return XEXP (op, 0);

  if (GET_CODE (op) == MULT
      && CONST_INT_P (XEXP (op, 1))
      && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
    return XEXP (op, 0);

  return x;
}
/* Helper function for rtx cost calculation.  Strip an extend
   expression from X.  Returns the inner operand if successful, or the
   original expression on failure.  We deal with a number of possible
   canonicalization variations here.  If STRIP_SHIFT is true, then
   we can strip off a shift also.  */
static rtx
aarch64_strip_extend (rtx x, bool strip_shift)
{
  scalar_int_mode mode;
  rtx op = x;

  if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
    return op;

  /* Zero and sign extraction of a widened value.  */
  if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
      && XEXP (op, 2) == const0_rtx
      && GET_CODE (XEXP (op, 0)) == MULT
      && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
					 XEXP (op, 1)))
    return XEXP (XEXP (op, 0), 0);

  /* It can also be represented (for zero-extend) as an AND with an
     immediate.  */
  if (GET_CODE (op) == AND
      && GET_CODE (XEXP (op, 0)) == MULT
      && CONST_INT_P (XEXP (XEXP (op, 0), 1))
      && CONST_INT_P (XEXP (op, 1))
      && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
			   INTVAL (XEXP (op, 1))) != 0)
    return XEXP (XEXP (op, 0), 0);

  /* Now handle extended register, as this may also have an optional
     left shift by 1..4.  */
  if (strip_shift
      && GET_CODE (op) == ASHIFT
      && CONST_INT_P (XEXP (op, 1))
      && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
    op = XEXP (op, 0);

  if (GET_CODE (op) == ZERO_EXTEND
      || GET_CODE (op) == SIGN_EXTEND)
    op = XEXP (op, 0);

  if (op != x)
    return op;

  return x;
}
/* Return true iff CODE is a shift supported in combination
   with arithmetic instructions.  */
static bool
aarch64_shift_p (enum rtx_code code)
{
  return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
}
/* Return true iff X is a cheap shift without a sign extend.  */
static bool
aarch64_cheap_mult_shift_p (rtx x)
{
  rtx op0, op1;

  op0 = XEXP (x, 0);
  op1 = XEXP (x, 1);

  if (!(aarch64_tune_params.extra_tuning_flags
	& AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
    return false;

  if (GET_CODE (op0) == SIGN_EXTEND)
    return false;

  if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
      && UINTVAL (op1) <= 4)
    return true;

  if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
    return false;

  HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));

  if (l2 > 0 && l2 <= 4)
    return true;

  return false;
}
/* Helper function for rtx cost calculation.  Calculate the cost of
   a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
   Return the calculated cost of the expression, recursing manually in to
   operands where needed.  */

static int
aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
{
  rtx op0, op1;
  const struct cpu_cost_table *extra_cost
    = aarch64_tune_params.insn_extra_cost;
  int cost = 0;
  bool compound_p = (outer == PLUS || outer == MINUS);
  machine_mode mode = GET_MODE (x);

  gcc_checking_assert (code == MULT);

  op0 = XEXP (x, 0);
  op1 = XEXP (x, 1);

  if (VECTOR_MODE_P (mode))
    mode = GET_MODE_INNER (mode);

  /* Integer multiply/fma.  */
  if (GET_MODE_CLASS (mode) == MODE_INT)
    {
      /* The multiply will be canonicalized as a shift, cost it as such.  */
      if (aarch64_shift_p (GET_CODE (x))
	  || (CONST_INT_P (op1)
	      && exact_log2 (INTVAL (op1)) > 0))
	{
	  bool is_extend = GET_CODE (op0) == ZERO_EXTEND
			   || GET_CODE (op0) == SIGN_EXTEND;
	  if (speed)
	    {
	      if (compound_p)
		{
		  /* If the shift is considered cheap,
		     then don't add any cost.  */
		  if (aarch64_cheap_mult_shift_p (x))
		    ;
		  else if (REG_P (op1))
		    /* ARITH + shift-by-register.  */
		    cost += extra_cost->alu.arith_shift_reg;
		  else if (is_extend)
		    /* ARITH + extended register.  We don't have a cost field
		       for ARITH+EXTEND+SHIFT, so use extend_arith here.  */
		    cost += extra_cost->alu.extend_arith;
		  else
		    /* ARITH + shift-by-immediate.  */
		    cost += extra_cost->alu.arith_shift;
		}
	      else
		/* LSL (immediate).  */
		cost += extra_cost->alu.shift;
	    }

	  /* Strip extends as we will have costed them in the case above.  */
	  if (is_extend)
	    op0 = aarch64_strip_extend (op0, true);

	  cost += rtx_cost (op0, VOIDmode, code, 0, speed);

	  return cost;
	}

      /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's a
	 compound and let the below cases handle it.  After all, MNEG is a
	 special-case alias of MSUB.  */
      if (GET_CODE (op0) == NEG)
	{
	  op0 = XEXP (op0, 0);
	  compound_p = true;
	}

      /* Integer multiplies or FMAs have zero/sign extending variants.  */
      if ((GET_CODE (op0) == ZERO_EXTEND
	   && GET_CODE (op1) == ZERO_EXTEND)
	  || (GET_CODE (op0) == SIGN_EXTEND
	      && GET_CODE (op1) == SIGN_EXTEND))
	{
	  cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
	  cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);

	  if (speed)
	    {
	      if (compound_p)
		/* SMADDL/UMADDL/UMSUBL/SMSUBL.  */
		cost += extra_cost->mult[0].extend_add;
	      else
		/* MUL/SMULL/UMULL.  */
		cost += extra_cost->mult[0].extend;
	    }

	  return cost;
	}

      /* This is either an integer multiply or a MADD.  In both cases
	 we want to recurse and cost the operands.  */
      cost += rtx_cost (op0, mode, MULT, 0, speed);
      cost += rtx_cost (op1, mode, MULT, 1, speed);

      if (speed)
	{
	  if (compound_p)
	    /* MADD/MSUB.  */
	    cost += extra_cost->mult[mode == DImode].add;
	  else
	    /* MUL.  */
	    cost += extra_cost->mult[mode == DImode].simple;
	}

      return cost;
    }
  else
    {
      if (speed)
	{
	  /* Floating-point FMA/FMUL can also support negations of the
	     operands, unless the rounding mode is upward or downward in
	     which case FNMUL is different than FMUL with operand negation.  */
	  bool neg0 = GET_CODE (op0) == NEG;
	  bool neg1 = GET_CODE (op1) == NEG;
	  if (compound_p || !flag_rounding_math || (neg0 && neg1))
	    {
	      if (neg0)
		op0 = XEXP (op0, 0);
	      if (neg1)
		op1 = XEXP (op1, 0);
	    }

	  if (compound_p)
	    /* FMADD/FNMADD/FNMSUB/FMSUB.  */
	    cost += extra_cost->fp[mode == DFmode].fma;
	  else
	    /* FMUL/FNMUL.  */
	    cost += extra_cost->fp[mode == DFmode].mult;
	}

      cost += rtx_cost (op0, mode, MULT, 0, speed);
      cost += rtx_cost (op1, mode, MULT, 1, speed);
      return cost;
    }
}
static int
aarch64_address_cost (rtx x,
		      machine_mode mode,
		      addr_space_t as ATTRIBUTE_UNUSED,
		      bool speed)
{
  enum rtx_code c = GET_CODE (x);
  const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
  struct aarch64_address_info info;
  int cost = 0;
  info.shift = 0;

  if (!aarch64_classify_address (&info, x, mode, false))
    {
      if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
	{
	  /* This is a CONST or SYMBOL ref which will be split
	     in a different way depending on the code model in use.
	     Cost it through the generic infrastructure.  */
	  int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
	  /* Divide through by the cost of one instruction to
	     bring it to the same units as the address costs.  */
	  cost_symbol_ref /= COSTS_N_INSNS (1);
	  /* The cost is then the cost of preparing the address,
	     followed by an immediate (possibly 0) offset.  */
	  return cost_symbol_ref + addr_cost->imm_offset;
	}
      else
	{
	  /* This is most likely a jump table from a case
	     statement.  */
	  return addr_cost->register_offset;
	}
    }

  switch (info.type)
    {
    case ADDRESS_LO_SUM:
    case ADDRESS_SYMBOLIC:
    case ADDRESS_REG_IMM:
      cost += addr_cost->imm_offset;
      break;

    case ADDRESS_REG_WB:
      if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
	cost += addr_cost->pre_modify;
      else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
	cost += addr_cost->post_modify;
      else
	gcc_unreachable ();

      break;

    case ADDRESS_REG_REG:
      cost += addr_cost->register_offset;
      break;

    case ADDRESS_REG_SXTW:
      cost += addr_cost->register_sextend;
      break;

    case ADDRESS_REG_UXTW:
      cost += addr_cost->register_zextend;
      break;

    default:
      gcc_unreachable ();
    }

  if (info.shift > 0)
    {
      /* For the sake of calculating the cost of the shifted register
	 component, we can treat same sized modes in the same way.  */
      if (known_eq (GET_MODE_BITSIZE (mode), 16))
	cost += addr_cost->addr_scale_costs.hi;
      else if (known_eq (GET_MODE_BITSIZE (mode), 32))
	cost += addr_cost->addr_scale_costs.si;
      else if (known_eq (GET_MODE_BITSIZE (mode), 64))
	cost += addr_cost->addr_scale_costs.di;
      else
	/* We can't tell, or this is a 128-bit vector.  */
	cost += addr_cost->addr_scale_costs.ti;
    }

  return cost;
}
/* Return the cost of a branch.  If SPEED_P is true then the compiler is
   optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
   to be taken.  */

int
aarch64_branch_cost (bool speed_p, bool predictable_p)
{
  /* When optimizing for speed, use the cost of unpredictable branches.  */
  const struct cpu_branch_cost *branch_costs =
    aarch64_tune_params.branch_costs;

  if (!speed_p || predictable_p)
    return branch_costs->predictable;
  else
    return branch_costs->unpredictable;
}
/* Return true if the RTX X in mode MODE is a zero or sign extract
   usable in an ADD or SUB (extended register) instruction.  */
static bool
aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
{
  /* Catch add with a sign extract.
     This is add_<optab><mode>_multp2.  */
  if (GET_CODE (x) == SIGN_EXTRACT
      || GET_CODE (x) == ZERO_EXTRACT)
    {
      rtx op0 = XEXP (x, 0);
      rtx op1 = XEXP (x, 1);
      rtx op2 = XEXP (x, 2);

      if (GET_CODE (op0) == MULT
	  && CONST_INT_P (op1)
	  && op2 == const0_rtx
	  && CONST_INT_P (XEXP (op0, 1))
	  && aarch64_is_extend_from_extract (mode,
					     XEXP (op0, 1),
					     op1))
	return true;
    }
  /* The simple case <ARITH>, XD, XN, XM, [us]xt.
     No shift.  */
  else if (GET_CODE (x) == SIGN_EXTEND
	   || GET_CODE (x) == ZERO_EXTEND)
    return REG_P (XEXP (x, 0));

  return false;
}
static bool
aarch64_frint_unspec_p (unsigned int u)
{
  switch (u)
    {
      case UNSPEC_FRINTZ:
      case UNSPEC_FRINTX:
      case UNSPEC_FRINTM:
      case UNSPEC_FRINTA:
      case UNSPEC_FRINTN:
      case UNSPEC_FRINTP:
	return true;
      default:
	return false;
    }
}
/* Return true iff X is an rtx that will match an extr instruction
   i.e. as described in the *extr<mode>5_insn family of patterns.
   OP0 and OP1 will be set to the operands of the shifts involved
   on success and will be NULL_RTX otherwise.  */

static bool
aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
{
  rtx op0, op1;
  scalar_int_mode mode;
  if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
    return false;

  *res_op0 = NULL_RTX;
  *res_op1 = NULL_RTX;

  if (GET_CODE (x) != IOR)
    return false;

  op0 = XEXP (x, 0);
  op1 = XEXP (x, 1);

  if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
      || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
    {
      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
      if (GET_CODE (op1) == ASHIFT)
	std::swap (op0, op1);

      if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
	return false;

      unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
      unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));

      if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
	  && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
	{
	  *res_op0 = XEXP (op0, 0);
	  *res_op1 = XEXP (op1, 0);
	  return true;
	}
    }

  return false;
}
/* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
   storing it in *COST.  Result is true if the total cost of the operation
   has now been calculated.  */
static bool
aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
{
  rtx inner;
  rtx comparator;
  enum rtx_code cmpcode;

  if (COMPARISON_P (op0))
    {
      inner = XEXP (op0, 0);
      comparator = XEXP (op0, 1);
      cmpcode = GET_CODE (op0);
    }
  else
    {
      inner = op0;
      comparator = const0_rtx;
      cmpcode = NE;
    }

  if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
    {
      /* Conditional branch.  */
      if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
	return true;
      else
	{
	  if (cmpcode == NE || cmpcode == EQ)
	    {
	      if (comparator == const0_rtx)
		{
		  /* TBZ/TBNZ/CBZ/CBNZ.  */
		  if (GET_CODE (inner) == ZERO_EXTRACT)
		    /* TBZ/TBNZ.  */
		    *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
				       ZERO_EXTRACT, 0, speed);
		  else
		    /* CBZ/CBNZ.  */
		    *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);

		  return true;
		}
	    }
	  else if (cmpcode == LT || cmpcode == GE)
	    {
	      /* TBZ/TBNZ.  */
	      if (comparator == const0_rtx)
		return true;
	    }
	}
    }
  else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
    {
      /* CCMP.  */
      if (GET_CODE (op1) == COMPARE)
	{
	  /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0.  */
	  if (XEXP (op1, 1) == const0_rtx)
	    *cost += 1;
	  if (speed)
	    {
	      machine_mode mode = GET_MODE (XEXP (op1, 0));
	      const struct cpu_cost_table *extra_cost
		= aarch64_tune_params.insn_extra_cost;

	      if (GET_MODE_CLASS (mode) == MODE_INT)
		*cost += extra_cost->alu.arith;
	      else
		*cost += extra_cost->fp[mode == DFmode].compare;
	    }
	  return true;
	}

      /* It's a conditional operation based on the status flags,
	 so it must be some flavor of CSEL.  */

      /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
      if (GET_CODE (op1) == NEG
	  || GET_CODE (op1) == NOT
	  || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
	op1 = XEXP (op1, 0);
      else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
	{
	  /* CSEL with zero-extension (*cmovdi_insn_uxtw).  */
	  op1 = XEXP (op1, 0);
	  op2 = XEXP (op2, 0);
	}

      *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
      *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
      return true;
    }

  /* We don't know what this is, cost all operands.  */
  return false;
}
/* Check whether X is a bitfield operation of the form shift + extend that
   maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
   operand to which the bitfield operation is applied.  Otherwise return
   NULL_RTX.  */

static rtx
aarch64_extend_bitfield_pattern_p (rtx x)
{
  rtx_code outer_code = GET_CODE (x);
  machine_mode outer_mode = GET_MODE (x);

  if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
      && outer_mode != SImode && outer_mode != DImode)
    return NULL_RTX;

  rtx inner = XEXP (x, 0);
  rtx_code inner_code = GET_CODE (inner);
  machine_mode inner_mode = GET_MODE (inner);
  rtx op = NULL_RTX;

  switch (inner_code)
    {
      case ASHIFT:
	if (CONST_INT_P (XEXP (inner, 1))
	    && (inner_mode == QImode || inner_mode == HImode))
	  op = XEXP (inner, 0);
	break;
      case LSHIFTRT:
	if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
	    && (inner_mode == QImode || inner_mode == HImode))
	  op = XEXP (inner, 0);
	break;
      case ASHIFTRT:
	if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
	    && (inner_mode == QImode || inner_mode == HImode))
	  op = XEXP (inner, 0);
	break;
      default:
	break;
    }

  return op;
}
/* Return true if the mask and a shift amount from an RTX of the form
   (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
   mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */

static bool
aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
				    rtx shft_amnt)
{
  return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
	 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
	 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
	 && (INTVAL (mask)
	     & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
}
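/* Worked example (illustrative): in SImode, mask 0x1fe0 with shift 5 is
   accepted - 0x1fe0 >> 5 == 0xff, which is 2^8 - 1, and no mask bits fall
   below the shift point - so (x << 5) & 0x1fe0 can become a single UBFIZ.  */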
/* Return true if the masks and a shift amount from an RTX of the form
   ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
   a BFI instruction of mode MODE.  See *arch64_bfi patterns.  */

bool
aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
				   unsigned HOST_WIDE_INT mask1,
				   unsigned HOST_WIDE_INT shft_amnt,
				   unsigned HOST_WIDE_INT mask2)
{
  unsigned HOST_WIDE_INT t;

  /* Verify that there is no overlap in what bits are set in the two masks.  */
  if (mask1 != ~mask2)
    return false;

  /* Verify that mask2 is not all zeros or ones.  */
  if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
    return false;

  /* The shift amount should always be less than the mode size.  */
  gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));

  /* Verify that the mask being shifted is contiguous and would be in the
     least significant bits after shifting by shft_amnt.  */
  t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
  return (t == (t & -t));
}
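/* Worked example (illustrative): for DImode with shft_amnt == 8 and
   mask2 == 0xff00 (so mask1 == ~0xff00), t == 0xff00 + 0x100 == 0x10000,
   a power of two, so the test passes and the insert can use a single BFI.  */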
9959 /* Calculate the cost of calculating X, storing it in *COST. Result
9960 is true if the total cost of the operation has now been calculated. */
9962 aarch64_rtx_costs (rtx x
, machine_mode mode
, int outer ATTRIBUTE_UNUSED
,
9963 int param ATTRIBUTE_UNUSED
, int *cost
, bool speed
)
9966 const struct cpu_cost_table
*extra_cost
9967 = aarch64_tune_params
.insn_extra_cost
;
9968 int code
= GET_CODE (x
);
9969 scalar_int_mode int_mode
;
9971 /* By default, assume that everything has equivalent cost to the
9972 cheapest instruction. Any additional costs are applied as a delta
9973 above this default. */
9974 *cost
= COSTS_N_INSNS (1);
9979 /* The cost depends entirely on the operands to SET. */
9984 switch (GET_CODE (op0
))
9989 rtx address
= XEXP (op0
, 0);
9990 if (VECTOR_MODE_P (mode
))
9991 *cost
+= extra_cost
->ldst
.storev
;
9992 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
9993 *cost
+= extra_cost
->ldst
.store
;
9994 else if (mode
== SFmode
)
9995 *cost
+= extra_cost
->ldst
.storef
;
9996 else if (mode
== DFmode
)
9997 *cost
+= extra_cost
->ldst
.stored
;
10000 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
10004 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
10008 if (! REG_P (SUBREG_REG (op0
)))
10009 *cost
+= rtx_cost (SUBREG_REG (op0
), VOIDmode
, SET
, 0, speed
);
10011 /* Fall through. */
10013 /* The cost is one per vector-register copied. */
10014 if (VECTOR_MODE_P (GET_MODE (op0
)) && REG_P (op1
))
10016 int nregs
= aarch64_hard_regno_nregs (V0_REGNUM
, GET_MODE (op0
));
10017 *cost
= COSTS_N_INSNS (nregs
);
10019 /* const0_rtx is in general free, but we will use an
10020 instruction to set a register to 0. */
10021 else if (REG_P (op1
) || op1
== const0_rtx
)
10023 /* The cost is 1 per register copied. */
10024 int nregs
= aarch64_hard_regno_nregs (R0_REGNUM
, GET_MODE (op0
));
10025 *cost
= COSTS_N_INSNS (nregs
);
10028 /* Cost is just the cost of the RHS of the set. */
10029 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
10034 /* Bit-field insertion. Strip any redundant widening of
10035 the RHS to meet the width of the target. */
10036 if (GET_CODE (op1
) == SUBREG
)
10037 op1
= SUBREG_REG (op1
);
10038 if ((GET_CODE (op1
) == ZERO_EXTEND
10039 || GET_CODE (op1
) == SIGN_EXTEND
)
10040 && CONST_INT_P (XEXP (op0
, 1))
10041 && is_a
<scalar_int_mode
> (GET_MODE (XEXP (op1
, 0)), &int_mode
)
10042 && GET_MODE_BITSIZE (int_mode
) >= INTVAL (XEXP (op0
, 1)))
10043 op1
= XEXP (op1
, 0);
10045 if (CONST_INT_P (op1
))
10047 /* MOV immediate is assumed to always be cheap. */
10048 *cost
= COSTS_N_INSNS (1);
10054 *cost
+= extra_cost
->alu
.bfi
;
10055 *cost
+= rtx_cost (op1
, VOIDmode
, (enum rtx_code
) code
, 1, speed
);
10061 /* We can't make sense of this, assume default cost. */
10062 *cost
= COSTS_N_INSNS (1);
10068 /* If an instruction can incorporate a constant within the
10069 instruction, the instruction's expression avoids calling
10070 rtx_cost() on the constant. If rtx_cost() is called on a
10071 constant, then it is usually because the constant must be
10072 moved into a register by one or more instructions.
10074 The exception is constant 0, which can be expressed
10075 as XZR/WZR and is therefore free. The exception to this is
10076 if we have (set (reg) (const0_rtx)) in which case we must cost
10077 the move. However, we can catch that when we cost the SET, so
10078 we don't need to consider that here. */
10079 if (x
== const0_rtx
)
10083 /* To an approximation, building any other constant is
10084 proportionally expensive to the number of instructions
10085 required to build that constant. This is true whether we
10086 are compiling for SPEED or otherwise. */
10087 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
10088 int_mode
= word_mode
;
10089 *cost
= COSTS_N_INSNS (aarch64_internal_mov_immediate
10090 (NULL_RTX
, x
, false, int_mode
));
10096 /* First determine number of instructions to do the move
10097 as an integer constant. */
10098 if (!aarch64_float_const_representable_p (x
)
10099 && !aarch64_can_const_movi_rtx_p (x
, mode
)
10100 && aarch64_float_const_rtx_p (x
))
10102 unsigned HOST_WIDE_INT ival
;
10103 bool succeed
= aarch64_reinterpret_float_as_int (x
, &ival
);
10104 gcc_assert (succeed
);
10106 scalar_int_mode imode
= (mode
== HFmode
10108 : int_mode_for_mode (mode
).require ());
10109 int ncost
= aarch64_internal_mov_immediate
10110 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
10111 *cost
+= COSTS_N_INSNS (ncost
);
10117 /* mov[df,sf]_aarch64. */
10118 if (aarch64_float_const_representable_p (x
))
10119 /* FMOV (scalar immediate). */
10120 *cost
+= extra_cost
->fp
[mode
== DFmode
].fpconst
;
10121 else if (!aarch64_float_const_zero_rtx_p (x
))
10123 /* This will be a load from memory. */
10124 if (mode
== DFmode
)
10125 *cost
+= extra_cost
->ldst
.loadd
;
10127 *cost
+= extra_cost
->ldst
.loadf
;
10130 /* Otherwise this is +0.0. We get this using MOVI d0, #0
10131 or MOV v0.s[0], wzr - neither of which are modeled by the
10132 cost tables. Just use the default cost. */
10142 /* For loads we want the base cost of a load, plus an
10143 approximation for the additional cost of the addressing
10145 rtx address
= XEXP (x
, 0);
10146 if (VECTOR_MODE_P (mode
))
10147 *cost
+= extra_cost
->ldst
.loadv
;
10148 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10149 *cost
+= extra_cost
->ldst
.load
;
10150 else if (mode
== SFmode
)
10151 *cost
+= extra_cost
->ldst
.loadf
;
10152 else if (mode
== DFmode
)
10153 *cost
+= extra_cost
->ldst
.loadd
;
10156 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
10165 if (VECTOR_MODE_P (mode
))
10170 *cost
+= extra_cost
->vect
.alu
;
10175 if (GET_MODE_CLASS (mode
) == MODE_INT
)
10177 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
10178 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
10181 *cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, NEG
, 0, speed
);
10185 /* Cost this as SUB wzr, X. */
10186 op0
= CONST0_RTX (mode
);
10191 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
10193 /* Support (neg(fma...)) as a single instruction only if
10194 sign of zeros is unimportant. This matches the decision
10195 making in aarch64.md. */
10196 if (GET_CODE (op0
) == FMA
&& !HONOR_SIGNED_ZEROS (GET_MODE (op0
)))
10199 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
10202 if (GET_CODE (op0
) == MULT
)
10205 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
10210 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
10220 if (VECTOR_MODE_P (mode
))
10221 *cost
+= extra_cost
->vect
.alu
;
10223 *cost
+= extra_cost
->alu
.clz
;
10232 if (op1
== const0_rtx
10233 && GET_CODE (op0
) == AND
)
10236 mode
= GET_MODE (op0
);
10240 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
)
10242 /* TODO: A write to the CC flags possibly costs extra, this
10243 needs encoding in the cost tables. */
10245 mode
= GET_MODE (op0
);
10247 if (GET_CODE (op0
) == AND
)
10253 if (GET_CODE (op0
) == PLUS
)
10255 /* ADDS (and CMN alias). */
10260 if (GET_CODE (op0
) == MINUS
)
10267 if (GET_CODE (op0
) == ZERO_EXTRACT
&& op1
== const0_rtx
10268 && GET_MODE (x
) == CC_NZmode
&& CONST_INT_P (XEXP (op0
, 1))
10269 && CONST_INT_P (XEXP (op0
, 2)))
10271 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
10272 Handle it here directly rather than going to cost_logic
10273 since we know the immediate generated for the TST is valid
10274 so we can avoid creating an intermediate rtx for it only
10275 for costing purposes. */
10277 *cost
+= extra_cost
->alu
.logical
;
10279 *cost
+= rtx_cost (XEXP (op0
, 0), GET_MODE (op0
),
10280 ZERO_EXTRACT
, 0, speed
);
10284 if (GET_CODE (op1
) == NEG
)
10288 *cost
+= extra_cost
->alu
.arith
;
10290 *cost
+= rtx_cost (op0
, mode
, COMPARE
, 0, speed
);
10291 *cost
+= rtx_cost (XEXP (op1
, 0), mode
, NEG
, 1, speed
);
10297 Compare can freely swap the order of operands, and
10298 canonicalization puts the more complex operation first.
10299 But the integer MINUS logic expects the shift/extend
10300 operation in op1. */
10302 || (GET_CODE (op0
) == SUBREG
&& REG_P (SUBREG_REG (op0
)))))
10310 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_FLOAT
)
10314 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
10316 if (CONST_DOUBLE_P (op1
) && aarch64_float_const_zero_rtx_p (op1
))
10318 *cost
+= rtx_cost (op0
, VOIDmode
, COMPARE
, 0, speed
);
10319 /* FCMP supports constant 0.0 for no extra cost. */
10325 if (VECTOR_MODE_P (mode
))
10327 /* Vector compare. */
10329 *cost
+= extra_cost
->vect
.alu
;
10331 if (aarch64_float_const_zero_rtx_p (op1
))
      /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
         cost.  */
10347 *cost
+= rtx_cost (op0
, mode
, MINUS
, 0, speed
);
10349 /* Detect valid immediates. */
10350 if ((GET_MODE_CLASS (mode
) == MODE_INT
10351 || (GET_MODE_CLASS (mode
) == MODE_CC
10352 && GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
))
10353 && CONST_INT_P (op1
)
10354 && aarch64_uimm12_shift (INTVAL (op1
)))
10357 /* SUB(S) (immediate). */
10358 *cost
+= extra_cost
->alu
.arith
;
10362 /* Look for SUB (extended register). */
10363 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
10364 && aarch64_rtx_arith_op_extract_p (op1
, int_mode
))
10367 *cost
+= extra_cost
->alu
.extend_arith
;
10369 op1
= aarch64_strip_extend (op1
, true);
10370 *cost
+= rtx_cost (op1
, VOIDmode
,
10371 (enum rtx_code
) GET_CODE (op1
), 0, speed
);
10375 rtx new_op1
= aarch64_strip_extend (op1
, false);
10377 /* Cost this as an FMA-alike operation. */
10378 if ((GET_CODE (new_op1
) == MULT
10379 || aarch64_shift_p (GET_CODE (new_op1
)))
10380 && code
!= COMPARE
)
10382 *cost
+= aarch64_rtx_mult_cost (new_op1
, MULT
,
10383 (enum rtx_code
) code
,
10388 *cost
+= rtx_cost (new_op1
, VOIDmode
, MINUS
, 1, speed
);
10392 if (VECTOR_MODE_P (mode
))
10395 *cost
+= extra_cost
->vect
.alu
;
10397 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10400 *cost
+= extra_cost
->alu
.arith
;
10402 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
10405 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
10419 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
10420 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
10423 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, PLUS
, 0, speed
);
10424 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
10428 if (GET_MODE_CLASS (mode
) == MODE_INT
10429 && ((CONST_INT_P (op1
) && aarch64_uimm12_shift (INTVAL (op1
)))
10430 || aarch64_sve_addvl_addpl_immediate (op1
, mode
)))
10432 *cost
+= rtx_cost (op0
, mode
, PLUS
, 0, speed
);
10435 /* ADD (immediate). */
10436 *cost
+= extra_cost
->alu
.arith
;
10440 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
10442 /* Look for ADD (extended register). */
10443 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
10444 && aarch64_rtx_arith_op_extract_p (op0
, int_mode
))
10447 *cost
+= extra_cost
->alu
.extend_arith
;
10449 op0
= aarch64_strip_extend (op0
, true);
10450 *cost
+= rtx_cost (op0
, VOIDmode
,
10451 (enum rtx_code
) GET_CODE (op0
), 0, speed
);
10455 /* Strip any extend, leave shifts behind as we will
10456 cost them through mult_cost. */
10457 new_op0
= aarch64_strip_extend (op0
, false);
10459 if (GET_CODE (new_op0
) == MULT
10460 || aarch64_shift_p (GET_CODE (new_op0
)))
10462 *cost
+= aarch64_rtx_mult_cost (new_op0
, MULT
, PLUS
,
10467 *cost
+= rtx_cost (new_op0
, VOIDmode
, PLUS
, 0, speed
);
10471 if (VECTOR_MODE_P (mode
))
10474 *cost
+= extra_cost
->vect
.alu
;
10476 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10479 *cost
+= extra_cost
->alu
.arith
;
10481 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
10484 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
10491 *cost
= COSTS_N_INSNS (1);
10495 if (VECTOR_MODE_P (mode
))
10496 *cost
+= extra_cost
->vect
.alu
;
10498 *cost
+= extra_cost
->alu
.rev
;
10503 if (aarch_rev16_p (x
))
10505 *cost
= COSTS_N_INSNS (1);
10509 if (VECTOR_MODE_P (mode
))
10510 *cost
+= extra_cost
->vect
.alu
;
10512 *cost
+= extra_cost
->alu
.rev
;
10517 if (aarch64_extr_rtx_p (x
, &op0
, &op1
))
10519 *cost
+= rtx_cost (op0
, mode
, IOR
, 0, speed
);
10520 *cost
+= rtx_cost (op1
, mode
, IOR
, 1, speed
);
10522 *cost
+= extra_cost
->alu
.shift
;
10526 /* Fall through. */
10533 if (VECTOR_MODE_P (mode
))
10536 *cost
+= extra_cost
->vect
.alu
;
10541 && GET_CODE (op0
) == MULT
10542 && CONST_INT_P (XEXP (op0
, 1))
10543 && CONST_INT_P (op1
)
10544 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0
, 1))),
10545 INTVAL (op1
)) != 0)
10547 /* This is a UBFM/SBFM. */
10548 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, ZERO_EXTRACT
, 0, speed
);
10550 *cost
+= extra_cost
->alu
.bfx
;
10554 if (is_int_mode (mode
, &int_mode
))
10556 if (CONST_INT_P (op1
))
10558 /* We have a mask + shift version of a UBFIZ
10559 i.e. the *andim_ashift<mode>_bfiz pattern. */
10560 if (GET_CODE (op0
) == ASHIFT
10561 && aarch64_mask_and_shift_for_ubfiz_p (int_mode
, op1
,
10564 *cost
+= rtx_cost (XEXP (op0
, 0), int_mode
,
10565 (enum rtx_code
) code
, 0, speed
);
10567 *cost
+= extra_cost
->alu
.bfx
;
10571 else if (aarch64_bitmask_imm (INTVAL (op1
), int_mode
))
      /* We possibly get the immediate for free, this is not
         modelled.  */
10575 *cost
+= rtx_cost (op0
, int_mode
,
10576 (enum rtx_code
) code
, 0, speed
);
10578 *cost
+= extra_cost
->alu
.logical
;
10587 /* Handle ORN, EON, or BIC. */
10588 if (GET_CODE (op0
) == NOT
)
10589 op0
= XEXP (op0
, 0);
10591 new_op0
= aarch64_strip_shift (op0
);
10593 /* If we had a shift on op0 then this is a logical-shift-
10594 by-register/immediate operation. Otherwise, this is just
10595 a logical operation. */
10598 if (new_op0
!= op0
)
10600 /* Shift by immediate. */
10601 if (CONST_INT_P (XEXP (op0
, 1)))
10602 *cost
+= extra_cost
->alu
.log_shift
;
10604 *cost
+= extra_cost
->alu
.log_shift_reg
;
10607 *cost
+= extra_cost
->alu
.logical
;
10610 /* In both cases we want to cost both operands. */
10611 *cost
+= rtx_cost (new_op0
, int_mode
, (enum rtx_code
) code
,
10613 *cost
+= rtx_cost (op1
, int_mode
, (enum rtx_code
) code
,
10623 op0
= aarch64_strip_shift (x
);
10625 if (VECTOR_MODE_P (mode
))
10628 *cost
+= extra_cost
->vect
.alu
;
10632 /* MVN-shifted-reg. */
10635 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
10638 *cost
+= extra_cost
->alu
.log_shift
;
10642 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
         Handle the second form here taking care that 'a' in the above can
         be a shift.  */
10645 else if (GET_CODE (op0
) == XOR
)
10647 rtx newop0
= XEXP (op0
, 0);
10648 rtx newop1
= XEXP (op0
, 1);
10649 rtx op0_stripped
= aarch64_strip_shift (newop0
);
10651 *cost
+= rtx_cost (newop1
, mode
, (enum rtx_code
) code
, 1, speed
);
10652 *cost
+= rtx_cost (op0_stripped
, mode
, XOR
, 0, speed
);
10656 if (op0_stripped
!= newop0
)
10657 *cost
+= extra_cost
->alu
.log_shift
;
10659 *cost
+= extra_cost
->alu
.logical
;
10666 *cost
+= extra_cost
->alu
.logical
;
10673 /* If a value is written in SI mode, then zero extended to DI
10674 mode, the operation will in general be free as a write to
10675 a 'w' register implicitly zeroes the upper bits of an 'x'
10676 register. However, if this is
10678 (set (reg) (zero_extend (reg)))
10680 we must cost the explicit register move. */
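      /* For example, "add w0, w1, w2" already clears bits 63:32 of x0, so
         zero-extending that result from SImode to DImode needs no extra
         instruction.  */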
10682 && GET_MODE (op0
) == SImode
10685 int op_cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, 0, speed
);
10687 /* If OP_COST is non-zero, then the cost of the zero extend
10688 is effectively the cost of the inner operation. Otherwise
10689 we have a MOV instruction and we take the cost from the MOV
10690 itself. This is true independently of whether we are
10691 optimizing for space or time. */
10697 else if (MEM_P (op0
))
10699 /* All loads can zero extend to any size for free. */
10700 *cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, param
, speed
);
10704 op0
= aarch64_extend_bitfield_pattern_p (x
);
10707 *cost
+= rtx_cost (op0
, mode
, ZERO_EXTEND
, 0, speed
);
10709 *cost
+= extra_cost
->alu
.bfx
;
10715 if (VECTOR_MODE_P (mode
))
10718 *cost
+= extra_cost
->vect
.alu
;
10722 /* We generate an AND instead of UXTB/UXTH. */
10723 *cost
+= extra_cost
->alu
.logical
;
10729 if (MEM_P (XEXP (x
, 0)))
10734 rtx address
= XEXP (XEXP (x
, 0), 0);
10735 *cost
+= extra_cost
->ldst
.load_sign_extend
;
10738 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
10744 op0
= aarch64_extend_bitfield_pattern_p (x
);
10747 *cost
+= rtx_cost (op0
, mode
, SIGN_EXTEND
, 0, speed
);
10749 *cost
+= extra_cost
->alu
.bfx
;
10755 if (VECTOR_MODE_P (mode
))
10756 *cost
+= extra_cost
->vect
.alu
;
10758 *cost
+= extra_cost
->alu
.extend
;
10766 if (CONST_INT_P (op1
))
10770 if (VECTOR_MODE_P (mode
))
10772 /* Vector shift (immediate). */
10773 *cost
+= extra_cost
->vect
.alu
;
      /* LSL (immediate), UBFM, UBFIZ and friends.  These are all
10779 *cost
+= extra_cost
->alu
.shift
;
10783 /* We can incorporate zero/sign extend for free. */
10784 if (GET_CODE (op0
) == ZERO_EXTEND
10785 || GET_CODE (op0
) == SIGN_EXTEND
)
10786 op0
= XEXP (op0
, 0);
10788 *cost
+= rtx_cost (op0
, VOIDmode
, ASHIFT
, 0, speed
);
10793 if (VECTOR_MODE_P (mode
))
10796 /* Vector shift (register). */
10797 *cost
+= extra_cost
->vect
.alu
;
10803 *cost
+= extra_cost
->alu
.shift_reg
;
10805 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
10806 && CONST_INT_P (XEXP (op1
, 1))
10807 && known_eq (INTVAL (XEXP (op1
, 1)),
10808 GET_MODE_BITSIZE (mode
) - 1))
10810 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
10811 /* We already demanded XEXP (op1, 0) to be REG_P, so
10812 don't recurse into it. */
10816 return false; /* All arguments need to be in registers. */
10826 if (CONST_INT_P (op1
))
10828 /* ASR (immediate) and friends. */
10831 if (VECTOR_MODE_P (mode
))
10832 *cost
+= extra_cost
->vect
.alu
;
10834 *cost
+= extra_cost
->alu
.shift
;
10837 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
10842 if (VECTOR_MODE_P (mode
))
10845 /* Vector shift (register). */
10846 *cost
+= extra_cost
->vect
.alu
;
10851 /* ASR (register) and friends. */
10852 *cost
+= extra_cost
->alu
.shift_reg
;
10854 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
10855 && CONST_INT_P (XEXP (op1
, 1))
10856 && known_eq (INTVAL (XEXP (op1
, 1)),
10857 GET_MODE_BITSIZE (mode
) - 1))
10859 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
10860 /* We already demanded XEXP (op1, 0) to be REG_P, so
10861 don't recurse into it. */
10865 return false; /* All arguments need to be in registers. */
10870 if (aarch64_cmodel
== AARCH64_CMODEL_LARGE
10871 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
)
10875 *cost
+= extra_cost
->ldst
.load
;
10877 else if (aarch64_cmodel
== AARCH64_CMODEL_SMALL
10878 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_PIC
)
10880 /* ADRP, followed by ADD. */
10881 *cost
+= COSTS_N_INSNS (1);
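          /* For example: adrp x0, sym; add x0, x0, :lo12:sym.  */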
10883 *cost
+= 2 * extra_cost
->alu
.arith
;
10885 else if (aarch64_cmodel
== AARCH64_CMODEL_TINY
10886 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
10890 *cost
+= extra_cost
->alu
.arith
;
10895 /* One extra load instruction, after accessing the GOT. */
10896 *cost
+= COSTS_N_INSNS (1);
10898 *cost
+= extra_cost
->ldst
.load
;
10904 /* ADRP/ADD (immediate). */
10906 *cost
+= extra_cost
->alu
.arith
;
10914 if (VECTOR_MODE_P (mode
))
10915 *cost
+= extra_cost
->vect
.alu
;
10917 *cost
+= extra_cost
->alu
.bfx
;
10920 /* We can trust that the immediates used will be correct (there
10921 are no by-register forms), so we need only cost op0. */
10922 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (enum rtx_code
) code
, 0, speed
);
10926 *cost
+= aarch64_rtx_mult_cost (x
, MULT
, 0, speed
);
10927 /* aarch64_rtx_mult_cost always handles recursion to its
10932 /* We can expand signed mod by power of 2 using a NEGS, two parallel
10933 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
10934 an unconditional negate. This case should only ever be reached through
10935 the set_smod_pow2_cheap check in expmed.c. */
10936 if (CONST_INT_P (XEXP (x
, 1))
10937 && exact_log2 (INTVAL (XEXP (x
, 1))) > 0
10938 && (mode
== SImode
|| mode
== DImode
))
10940 /* We expand to 4 instructions. Reset the baseline. */
10941 *cost
= COSTS_N_INSNS (4);
10944 *cost
+= 2 * extra_cost
->alu
.logical
10945 + 2 * extra_cost
->alu
.arith
;
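          /* For example, a signed "x % 8" becomes roughly:
             NEGS, AND, AND, CSNEG.  */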
10950 /* Fall-through. */
      /* Slightly prefer UMOD over SMOD.  */
10955 if (VECTOR_MODE_P (mode
))
10956 *cost
+= extra_cost
->vect
.alu
;
10957 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10958 *cost
+= (extra_cost
->mult
[mode
== DImode
].add
10959 + extra_cost
->mult
[mode
== DImode
].idiv
10960 + (code
== MOD
? 1 : 0));
10962 return false; /* All arguments need to be in registers. */
10969 if (VECTOR_MODE_P (mode
))
10970 *cost
+= extra_cost
->vect
.alu
;
10971 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
        /* There is no integer SQRT, so only DIV and UDIV can get
           here.  */
10974 *cost
+= (extra_cost
->mult
[mode
== DImode
].idiv
           /* Slightly prefer UDIV over SDIV.  */
10976 + (code
== DIV
? 1 : 0));
10978 *cost
+= extra_cost
->fp
[mode
== DFmode
].div
;
10980 return false; /* All arguments need to be in registers. */
10983 return aarch64_if_then_else_costs (XEXP (x
, 0), XEXP (x
, 1),
10984 XEXP (x
, 2), cost
, speed
);
10997 return false; /* All arguments must be in registers. */
11006 if (VECTOR_MODE_P (mode
))
11007 *cost
+= extra_cost
->vect
.alu
;
11009 *cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
11012 /* FMSUB, FNMADD, and FNMSUB are free. */
11013 if (GET_CODE (op0
) == NEG
)
11014 op0
= XEXP (op0
, 0);
11016 if (GET_CODE (op2
) == NEG
)
11017 op2
= XEXP (op2
, 0);
11019 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
11020 and the by-element operand as operand 0. */
11021 if (GET_CODE (op1
) == NEG
)
11022 op1
= XEXP (op1
, 0);
11024 /* Catch vector-by-element operations. The by-element operand can
11025 either be (vec_duplicate (vec_select (x))) or just
11026 (vec_select (x)), depending on whether we are multiplying by
11027 a vector or a scalar.
11029 Canonicalization is not very good in these cases, FMA4 will put the
11030 by-element operand as operand 0, FNMA4 will have it as operand 1. */
11031 if (GET_CODE (op0
) == VEC_DUPLICATE
)
11032 op0
= XEXP (op0
, 0);
11033 else if (GET_CODE (op1
) == VEC_DUPLICATE
)
11034 op1
= XEXP (op1
, 0);
11036 if (GET_CODE (op0
) == VEC_SELECT
)
11037 op0
= XEXP (op0
, 0);
11038 else if (GET_CODE (op1
) == VEC_SELECT
)
11039 op1
= XEXP (op1
, 0);
11041 /* If the remaining parameters are not registers,
11042 get the cost to put them into registers. */
11043 *cost
+= rtx_cost (op0
, mode
, FMA
, 0, speed
);
11044 *cost
+= rtx_cost (op1
, mode
, FMA
, 1, speed
);
11045 *cost
+= rtx_cost (op2
, mode
, FMA
, 2, speed
);
11049 case UNSIGNED_FLOAT
:
11051 *cost
+= extra_cost
->fp
[mode
== DFmode
].fromint
;
11057 if (VECTOR_MODE_P (mode
))
        /* Vector truncate.  */
11060 *cost
+= extra_cost
->vect
.alu
;
11063 *cost
+= extra_cost
->fp
[mode
== DFmode
].widen
;
11067 case FLOAT_TRUNCATE
:
11070 if (VECTOR_MODE_P (mode
))
        /* Vector conversion.  */
11073 *cost
+= extra_cost
->vect
.alu
;
11076 *cost
+= extra_cost
->fp
[mode
== DFmode
].narrow
;
11083 /* Strip the rounding part. They will all be implemented
11084 by the fcvt* family of instructions anyway. */
11085 if (GET_CODE (x
) == UNSPEC
)
11087 unsigned int uns_code
= XINT (x
, 1);
11089 if (uns_code
== UNSPEC_FRINTA
11090 || uns_code
== UNSPEC_FRINTM
11091 || uns_code
== UNSPEC_FRINTN
11092 || uns_code
== UNSPEC_FRINTP
11093 || uns_code
== UNSPEC_FRINTZ
)
11094 x
= XVECEXP (x
, 0, 0);
11099 if (VECTOR_MODE_P (mode
))
11100 *cost
+= extra_cost
->vect
.alu
;
11102 *cost
+= extra_cost
->fp
[GET_MODE (x
) == DFmode
].toint
;
11105 /* We can combine fmul by a power of 2 followed by a fcvt into a single
11106 fixed-point fcvt. */
11107 if (GET_CODE (x
) == MULT
11108 && ((VECTOR_MODE_P (mode
)
11109 && aarch64_vec_fpconst_pow_of_2 (XEXP (x
, 1)) > 0)
11110 || aarch64_fpconst_pow_of_2 (XEXP (x
, 1)) > 0))
11112 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (rtx_code
) code
,
11117 *cost
+= rtx_cost (x
, VOIDmode
, (enum rtx_code
) code
, 0, speed
);
11121 if (VECTOR_MODE_P (mode
))
11123 /* ABS (vector). */
11125 *cost
+= extra_cost
->vect
.alu
;
11127 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
11131 /* FABD, which is analogous to FADD. */
11132 if (GET_CODE (op0
) == MINUS
)
11134 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, MINUS
, 0, speed
);
11135 *cost
+= rtx_cost (XEXP (op0
, 1), mode
, MINUS
, 1, speed
);
11137 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
11141 /* Simple FABS is analogous to FNEG. */
11143 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
11147 /* Integer ABS will either be split to
11148 two arithmetic instructions, or will be an ABS
11149 (scalar), which we don't model. */
11150 *cost
= COSTS_N_INSNS (2);
11152 *cost
+= 2 * extra_cost
->alu
.arith
;
11160 if (VECTOR_MODE_P (mode
))
11161 *cost
+= extra_cost
->vect
.alu
;
11164 /* FMAXNM/FMINNM/FMAX/FMIN.
11165 TODO: This may not be accurate for all implementations, but
11166 we do not model this in the cost tables. */
11167 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
11173 /* The floating point round to integer frint* instructions. */
11174 if (aarch64_frint_unspec_p (XINT (x
, 1)))
11177 *cost
+= extra_cost
->fp
[mode
== DFmode
].roundint
;
11182 if (XINT (x
, 1) == UNSPEC_RBIT
)
11185 *cost
+= extra_cost
->alu
.rev
;
11193 /* Decompose <su>muldi3_highpart. */
11194 if (/* (truncate:DI */
11197 && GET_MODE (XEXP (x
, 0)) == TImode
11198 && GET_CODE (XEXP (x
, 0)) == LSHIFTRT
11200 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == MULT
11201 /* (ANY_EXTEND:TI (reg:DI))
11202 (ANY_EXTEND:TI (reg:DI))) */
11203 && ((GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == ZERO_EXTEND
11204 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == ZERO_EXTEND
)
11205 || (GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == SIGN_EXTEND
11206 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == SIGN_EXTEND
))
11207 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0)) == DImode
11208 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0)) == DImode
11209 /* (const_int 64) */
11210 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
11211 && UINTVAL (XEXP (XEXP (x
, 0), 1)) == 64)
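      /* This matches the [su]muldi3_highpart pattern, which maps to a single
         SMULH/UMULH instruction, so cost it as a widening multiply plus the
         cost of getting the two DImode operands into registers.  */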
11215 *cost
+= extra_cost
->mult
[mode
== DImode
].extend
;
11216 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0),
11217 mode
, MULT
, 0, speed
);
11218 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0),
11219 mode
, MULT
, 1, speed
);
11223 /* Fall through. */
11229 && flag_aarch64_verbose_cost
)
11230 fprintf (dump_file
,
11231 "\nFailed to cost RTX. Assuming default cost.\n");
11236 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
11237 calculated for X. This cost is stored in *COST. Returns true
11238 if the total cost of X was calculated. */
aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
                           int param, int *cost, bool speed)
{
  bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);

  if (dump_file
      && flag_aarch64_verbose_cost)
    {
      print_rtl_single (dump_file, x);
      fprintf (dump_file, "\n%s cost: %d (%s)\n",
               speed ? "Hot" : "Cold",
               *cost, result ? "final" : "partial");
    }
aarch64_register_move_cost (machine_mode mode,
                            reg_class_t from_i, reg_class_t to_i)
{
  enum reg_class from = (enum reg_class) from_i;
  enum reg_class to = (enum reg_class) to_i;
  const struct cpu_regmove_cost *regmove_cost
    = aarch64_tune_params.regmove_cost;
11266 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
11267 if (to
== TAILCALL_ADDR_REGS
|| to
== POINTER_REGS
)
11270 if (from
== TAILCALL_ADDR_REGS
|| from
== POINTER_REGS
)
11271 from
= GENERAL_REGS
;
11273 /* Moving between GPR and stack cost is the same as GP2GP. */
11274 if ((from
== GENERAL_REGS
&& to
== STACK_REG
)
11275 || (to
== GENERAL_REGS
&& from
== STACK_REG
))
11276 return regmove_cost
->GP2GP
;
11278 /* To/From the stack register, we move via the gprs. */
11279 if (to
== STACK_REG
|| from
== STACK_REG
)
11280 return aarch64_register_move_cost (mode
, from
, GENERAL_REGS
)
11281 + aarch64_register_move_cost (mode
, GENERAL_REGS
, to
);
11283 if (known_eq (GET_MODE_SIZE (mode
), 16))
11285 /* 128-bit operations on general registers require 2 instructions. */
11286 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
11287 return regmove_cost
->GP2GP
* 2;
11288 else if (from
== GENERAL_REGS
)
11289 return regmove_cost
->GP2FP
* 2;
11290 else if (to
== GENERAL_REGS
)
11291 return regmove_cost
->FP2GP
* 2;
11293 /* When AdvSIMD instructions are disabled it is not possible to move
11294 a 128-bit value directly between Q registers. This is handled in
11295 secondary reload. A general register is used as a scratch to move
11296 the upper DI value and the lower DI value is moved directly,
11297 hence the cost is the sum of three moves. */
11299 return regmove_cost
->GP2FP
+ regmove_cost
->FP2GP
+ regmove_cost
->FP2FP
;
11301 return regmove_cost
->FP2FP
;
11304 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
11305 return regmove_cost
->GP2GP
;
11306 else if (from
== GENERAL_REGS
)
11307 return regmove_cost
->GP2FP
;
11308 else if (to
== GENERAL_REGS
)
11309 return regmove_cost
->FP2GP
;
11311 return regmove_cost
->FP2FP
;
aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
                          reg_class_t rclass ATTRIBUTE_UNUSED,
                          bool in ATTRIBUTE_UNUSED)
{
  return aarch64_tune_params.memmov_cost;
}
11322 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
11323 to optimize 1.0/sqrt. */
use_rsqrt_p (machine_mode mode)
{
  return (!flag_trapping_math
          && flag_unsafe_math_optimizations
          && ((aarch64_tune_params.approx_modes->recip_sqrt
               & AARCH64_APPROX_MODE (mode))
              || flag_mrecip_low_precision_sqrt));
}
11335 /* Function to decide when to use the approximate reciprocal square root
11339 aarch64_builtin_reciprocal (tree fndecl
)
11341 machine_mode mode
= TYPE_MODE (TREE_TYPE (fndecl
));
11343 if (!use_rsqrt_p (mode
))
11345 return aarch64_builtin_rsqrt (DECL_MD_FUNCTION_CODE (fndecl
));
11348 /* Emit instruction sequence to compute either the approximate square root
11349 or its approximate reciprocal, depending on the flag RECP, and return
11350 whether the sequence was emitted or not. */
11353 aarch64_emit_approx_sqrt (rtx dst
, rtx src
, bool recp
)
11355 machine_mode mode
= GET_MODE (dst
);
11357 if (GET_MODE_INNER (mode
) == HFmode
)
11359 gcc_assert (!recp
);
11365 if (!(flag_mlow_precision_sqrt
11366 || (aarch64_tune_params
.approx_modes
->sqrt
11367 & AARCH64_APPROX_MODE (mode
))))
11370 if (flag_finite_math_only
11371 || flag_trapping_math
11372 || !flag_unsafe_math_optimizations
11373 || optimize_function_for_size_p (cfun
))
11377 /* Caller assumes we cannot fail. */
11378 gcc_assert (use_rsqrt_p (mode
));
11380 machine_mode mmsk
= mode_for_int_vector (mode
).require ();
11381 rtx xmsk
= gen_reg_rtx (mmsk
);
11383 /* When calculating the approximate square root, compare the
11384 argument with 0.0 and create a mask. */
11385 emit_insn (gen_rtx_SET (xmsk
,
11387 gen_rtx_EQ (mmsk
, src
,
11388 CONST0_RTX (mode
)))));
11390 /* Estimate the approximate reciprocal square root. */
11391 rtx xdst
= gen_reg_rtx (mode
);
11392 emit_insn (gen_aarch64_rsqrte (mode
, xdst
, src
));
11394 /* Iterate over the series twice for SF and thrice for DF. */
11395 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
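  /* Each iteration is a Newton-Raphson step:
     xdst = xdst * (3 - src * xdst^2) / 2, where the FRSQRTS instruction
     supplies the (3 - a * b) / 2 term.  */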
11397 /* Optionally iterate over the series once less for faster performance
11398 while sacrificing the accuracy. */
11399 if ((recp
&& flag_mrecip_low_precision_sqrt
)
11400 || (!recp
&& flag_mlow_precision_sqrt
))
11403 /* Iterate over the series to calculate the approximate reciprocal square
11405 rtx x1
= gen_reg_rtx (mode
);
11406 while (iterations
--)
11408 rtx x2
= gen_reg_rtx (mode
);
11409 emit_set_insn (x2
, gen_rtx_MULT (mode
, xdst
, xdst
));
11411 emit_insn (gen_aarch64_rsqrts (mode
, x1
, src
, x2
));
11413 if (iterations
> 0)
11414 emit_set_insn (xdst
, gen_rtx_MULT (mode
, xdst
, x1
));
11419 /* Qualify the approximate reciprocal square root when the argument is
11420 0.0 by squashing the intermediary result to 0.0. */
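  /* The reciprocal-sqrt estimate of 0.0 is infinity, and multiplying that
     by the 0.0 input would produce a NaN, so force those lanes to 0.0 so
     that the final sqrt (0.0) result is 0.0.  */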
11421 rtx xtmp
= gen_reg_rtx (mmsk
);
11422 emit_set_insn (xtmp
, gen_rtx_AND (mmsk
, gen_rtx_NOT (mmsk
, xmsk
),
11423 gen_rtx_SUBREG (mmsk
, xdst
, 0)));
11424 emit_move_insn (xdst
, gen_rtx_SUBREG (mode
, xtmp
, 0));
11426 /* Calculate the approximate square root. */
11427 emit_set_insn (xdst
, gen_rtx_MULT (mode
, xdst
, src
));
11430 /* Finalize the approximation. */
11431 emit_set_insn (dst
, gen_rtx_MULT (mode
, xdst
, x1
));
11436 /* Emit the instruction sequence to compute the approximation for the division
11437 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
11440 aarch64_emit_approx_div (rtx quo
, rtx num
, rtx den
)
11442 machine_mode mode
= GET_MODE (quo
);
11444 if (GET_MODE_INNER (mode
) == HFmode
)
11447 bool use_approx_division_p
= (flag_mlow_precision_div
11448 || (aarch64_tune_params
.approx_modes
->division
11449 & AARCH64_APPROX_MODE (mode
)));
11451 if (!flag_finite_math_only
11452 || flag_trapping_math
11453 || !flag_unsafe_math_optimizations
11454 || optimize_function_for_size_p (cfun
)
11455 || !use_approx_division_p
)
11458 if (!TARGET_SIMD
&& VECTOR_MODE_P (mode
))
11461 /* Estimate the approximate reciprocal. */
11462 rtx xrcp
= gen_reg_rtx (mode
);
11463 emit_insn (gen_aarch64_frecpe (mode
, xrcp
, den
));
11465 /* Iterate over the series twice for SF and thrice for DF. */
11466 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
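  /* Each iteration is a Newton-Raphson step:
     xrcp = xrcp * (2 - den * xrcp), where the FRECPS instruction supplies
     the (2 - a * b) term.  */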
11468 /* Optionally iterate over the series once less for faster performance,
11469 while sacrificing the accuracy. */
11470 if (flag_mlow_precision_div
)
11473 /* Iterate over the series to calculate the approximate reciprocal. */
11474 rtx xtmp
= gen_reg_rtx (mode
);
11475 while (iterations
--)
11477 emit_insn (gen_aarch64_frecps (mode
, xtmp
, xrcp
, den
));
11479 if (iterations
> 0)
11480 emit_set_insn (xrcp
, gen_rtx_MULT (mode
, xrcp
, xtmp
));
11483 if (num
!= CONST1_RTX (mode
))
11485 /* As the approximate reciprocal of DEN is already calculated, only
11486 calculate the approximate division when NUM is not 1.0. */
11487 rtx xnum
= force_reg (mode
, num
);
11488 emit_set_insn (xrcp
, gen_rtx_MULT (mode
, xrcp
, xnum
));
11491 /* Finalize the approximation. */
11492 emit_set_insn (quo
, gen_rtx_MULT (mode
, xrcp
, xtmp
));
11496 /* Return the number of instructions that can be issued per cycle. */
aarch64_sched_issue_rate (void)
{
  return aarch64_tune_params.issue_rate;
}

static int
aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
{
  int issue_rate = aarch64_sched_issue_rate ();

  return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
}
11512 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
11513 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
11514 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
11517 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn
*insn
,
11520 return autopref_multipass_dfa_lookahead_guard (insn
, ready_index
);
11524 /* Vectorizer cost model target hooks. */
11526 /* Implement targetm.vectorize.builtin_vectorization_cost. */
11528 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost
,
11530 int misalign ATTRIBUTE_UNUSED
)
11533 const cpu_vector_cost
*costs
= aarch64_tune_params
.vec_costs
;
11536 if (vectype
!= NULL
)
11537 fp
= FLOAT_TYPE_P (vectype
);
11539 switch (type_of_cost
)
11542 return fp
? costs
->scalar_fp_stmt_cost
: costs
->scalar_int_stmt_cost
;
11545 return costs
->scalar_load_cost
;
11548 return costs
->scalar_store_cost
;
11551 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
11554 return costs
->vec_align_load_cost
;
11557 return costs
->vec_store_cost
;
11559 case vec_to_scalar
:
11560 return costs
->vec_to_scalar_cost
;
11562 case scalar_to_vec
:
11563 return costs
->scalar_to_vec_cost
;
11565 case unaligned_load
:
11566 case vector_gather_load
:
11567 return costs
->vec_unalign_load_cost
;
11569 case unaligned_store
:
11570 case vector_scatter_store
:
11571 return costs
->vec_unalign_store_cost
;
11573 case cond_branch_taken
:
11574 return costs
->cond_taken_branch_cost
;
11576 case cond_branch_not_taken
:
11577 return costs
->cond_not_taken_branch_cost
;
11580 return costs
->vec_permute_cost
;
11582 case vec_promote_demote
:
11583 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
11585 case vec_construct
:
11586 elements
= estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype
));
11587 return elements
/ 2 + 1;
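      /* A rough estimate: building a vector from scalars is costed at about
         one statement for every two elements, plus one.  */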
11590 gcc_unreachable ();
11594 /* Implement targetm.vectorize.add_stmt_cost. */
11596 aarch64_add_stmt_cost (void *data
, int count
, enum vect_cost_for_stmt kind
,
11597 struct _stmt_vec_info
*stmt_info
, int misalign
,
11598 enum vect_cost_model_location where
)
11600 unsigned *cost
= (unsigned *) data
;
11601 unsigned retval
= 0;
11603 if (flag_vect_cost_model
)
11605 tree vectype
= stmt_info
? stmt_vectype (stmt_info
) : NULL_TREE
;
11607 aarch64_builtin_vectorization_cost (kind
, vectype
, misalign
);
11609 /* Statements in an inner loop relative to the loop being
11610 vectorized are weighted more heavily. The value here is
11611 arbitrary and could potentially be improved with analysis. */
11612 if (where
== vect_body
&& stmt_info
&& stmt_in_inner_loop_p (stmt_info
))
11613 count
*= 50; /* FIXME */
11615 retval
= (unsigned) (count
* stmt_cost
);
11616 cost
[where
] += retval
;
11622 static void initialize_aarch64_code_model (struct gcc_options
*);
11624 /* Parse the TO_PARSE string and put the architecture struct that it
11625 selects into RES and the architectural features into ISA_FLAGS.
11626 Return an aarch64_parse_opt_result describing the parse result.
11627 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11628 When the TO_PARSE string contains an invalid extension,
11629 a copy of the string is created and stored to INVALID_EXTENSION. */
11631 static enum aarch64_parse_opt_result
11632 aarch64_parse_arch (const char *to_parse
, const struct processor
**res
,
11633 uint64_t *isa_flags
, std::string
*invalid_extension
)
11636 const struct processor
*arch
;
11639 ext
= strchr (to_parse
, '+');
11642 len
= ext
- to_parse
;
11644 len
= strlen (to_parse
);
11647 return AARCH64_PARSE_MISSING_ARG
;
11650 /* Loop through the list of supported ARCHes to find a match. */
11651 for (arch
= all_architectures
; arch
->name
!= NULL
; arch
++)
11653 if (strlen (arch
->name
) == len
11654 && strncmp (arch
->name
, to_parse
, len
) == 0)
11656 uint64_t isa_temp
= arch
->flags
;
11660 /* TO_PARSE string contains at least one extension. */
11661 enum aarch64_parse_opt_result ext_res
11662 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
11664 if (ext_res
!= AARCH64_PARSE_OK
)
11667 /* Extension parsing was successful. Confirm the result
11668 arch and ISA flags. */
11670 *isa_flags
= isa_temp
;
11671 return AARCH64_PARSE_OK
;
11675 /* ARCH name not found in list. */
11676 return AARCH64_PARSE_INVALID_ARG
;
11679 /* Parse the TO_PARSE string and put the result tuning in RES and the
11680 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
11681 describing the parse result. If there is an error parsing, RES and
11682 ISA_FLAGS are left unchanged.
11683 When the TO_PARSE string contains an invalid extension,
11684 a copy of the string is created and stored to INVALID_EXTENSION. */
11686 static enum aarch64_parse_opt_result
11687 aarch64_parse_cpu (const char *to_parse
, const struct processor
**res
,
11688 uint64_t *isa_flags
, std::string
*invalid_extension
)
11691 const struct processor
*cpu
;
11694 ext
= strchr (to_parse
, '+');
11697 len
= ext
- to_parse
;
11699 len
= strlen (to_parse
);
11702 return AARCH64_PARSE_MISSING_ARG
;
11705 /* Loop through the list of supported CPUs to find a match. */
11706 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
11708 if (strlen (cpu
->name
) == len
&& strncmp (cpu
->name
, to_parse
, len
) == 0)
11710 uint64_t isa_temp
= cpu
->flags
;
11715 /* TO_PARSE string contains at least one extension. */
11716 enum aarch64_parse_opt_result ext_res
11717 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
11719 if (ext_res
!= AARCH64_PARSE_OK
)
          /* Extension parsing was successful.  Confirm the result
             cpu and ISA flags.  */
11725 *isa_flags
= isa_temp
;
11726 return AARCH64_PARSE_OK
;
11730 /* CPU name not found in list. */
11731 return AARCH64_PARSE_INVALID_ARG
;
11734 /* Parse the TO_PARSE string and put the cpu it selects into RES.
11735 Return an aarch64_parse_opt_result describing the parse result.
11736 If the parsing fails the RES does not change. */
11738 static enum aarch64_parse_opt_result
11739 aarch64_parse_tune (const char *to_parse
, const struct processor
**res
)
11741 const struct processor
*cpu
;
11743 /* Loop through the list of supported CPUs to find a match. */
11744 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
11746 if (strcmp (cpu
->name
, to_parse
) == 0)
11749 return AARCH64_PARSE_OK
;
11753 /* CPU name not found in list. */
11754 return AARCH64_PARSE_INVALID_ARG
;
11757 /* Parse TOKEN, which has length LENGTH to see if it is an option
11758 described in FLAG. If it is, return the index bit for that fusion type.
11759 If not, error (printing OPTION_NAME) and return zero. */
11761 static unsigned int
11762 aarch64_parse_one_option_token (const char *token
,
11764 const struct aarch64_flag_desc
*flag
,
11765 const char *option_name
)
11767 for (; flag
->name
!= NULL
; flag
++)
11769 if (length
== strlen (flag
->name
)
11770 && !strncmp (flag
->name
, token
, length
))
11774 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name
, token
);
11778 /* Parse OPTION which is a comma-separated list of flags to enable.
11779 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
11780 default state we inherit from the CPU tuning structures. OPTION_NAME
11781 gives the top-level option we are parsing in the -moverride string,
11782 for use in error messages. */
11784 static unsigned int
11785 aarch64_parse_boolean_options (const char *option
,
11786 const struct aarch64_flag_desc
*flags
,
11787 unsigned int initial_state
,
11788 const char *option_name
)
11790 const char separator
= '.';
11791 const char* specs
= option
;
11792 const char* ntoken
= option
;
11793 unsigned int found_flags
= initial_state
;
11795 while ((ntoken
= strchr (specs
, separator
)))
11797 size_t token_length
= ntoken
- specs
;
11798 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
11802 /* If we find "none" (or, for simplicity's sake, an error) anywhere
11803 in the token stream, reset the supported operations. So:
11805 adrp+add.cmp+branch.none.adrp+add
11807 would have the result of turning on only adrp+add fusion. */
11811 found_flags
|= token_ops
;
11815 /* We ended with a comma, print something. */
11818 error ("%s string ill-formed\n", option_name
);
11822 /* We still have one more token to parse. */
11823 size_t token_length
= strlen (specs
);
11824 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
11831 found_flags
|= token_ops
;
11832 return found_flags
;
11835 /* Support for overriding instruction fusion. */
11838 aarch64_parse_fuse_string (const char *fuse_string
,
11839 struct tune_params
*tune
)
11841 tune
->fusible_ops
= aarch64_parse_boolean_options (fuse_string
,
11842 aarch64_fusible_pairs
,
11847 /* Support for overriding other tuning flags. */
11850 aarch64_parse_tune_string (const char *tune_string
,
11851 struct tune_params
*tune
)
11853 tune
->extra_tuning_flags
11854 = aarch64_parse_boolean_options (tune_string
,
11855 aarch64_tuning_flags
,
11856 tune
->extra_tuning_flags
,
11860 /* Parse the sve_width tuning moverride string in TUNE_STRING.
11861 Accept the valid SVE vector widths allowed by
11862 aarch64_sve_vector_bits_enum and use it to override sve_width
11866 aarch64_parse_sve_width_string (const char *tune_string
,
11867 struct tune_params
*tune
)
11871 int n
= sscanf (tune_string
, "%d", &width
);
11874 error ("invalid format for sve_width");
11886 error ("invalid sve_width value: %d", width
);
11888 tune
->sve_width
= (enum aarch64_sve_vector_bits_enum
) width
;
11891 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
11892 we understand. If it is, extract the option string and handoff to
11893 the appropriate function. */
11896 aarch64_parse_one_override_token (const char* token
,
11898 struct tune_params
*tune
)
11900 const struct aarch64_tuning_override_function
*fn
11901 = aarch64_tuning_override_functions
;
11903 const char *option_part
= strchr (token
, '=');
11906 error ("tuning string missing in option (%s)", token
);
11910 /* Get the length of the option name. */
11911 length
= option_part
- token
;
11912 /* Skip the '=' to get to the option string. */
11915 for (; fn
->name
!= NULL
; fn
++)
11917 if (!strncmp (fn
->name
, token
, length
))
11919 fn
->parse_override (option_part
, tune
);
11924 error ("unknown tuning option (%s)",token
);
11928 /* A checking mechanism for the implementation of the tls size. */
11931 initialize_aarch64_tls_size (struct gcc_options
*opts
)
11933 if (aarch64_tls_size
== 0)
11934 aarch64_tls_size
= 24;
11936 switch (opts
->x_aarch64_cmodel_var
)
11938 case AARCH64_CMODEL_TINY
:
11939 /* Both the default and maximum TLS size allowed under tiny is 1M which
11940 needs two instructions to address, so we clamp the size to 24. */
11941 if (aarch64_tls_size
> 24)
11942 aarch64_tls_size
= 24;
11944 case AARCH64_CMODEL_SMALL
:
11945 /* The maximum TLS size allowed under small is 4G. */
11946 if (aarch64_tls_size
> 32)
11947 aarch64_tls_size
= 32;
11949 case AARCH64_CMODEL_LARGE
:
11950 /* The maximum TLS size allowed under large is 16E.
11951 FIXME: 16E should be 64bit, we only support 48bit offset now. */
11952 if (aarch64_tls_size
> 48)
11953 aarch64_tls_size
= 48;
11956 gcc_unreachable ();
11962 /* Parse STRING looking for options in the format:
11963 string :: option:string
11964 option :: name=substring
11966 substring :: defined by option. */
11969 aarch64_parse_override_string (const char* input_string
,
11970 struct tune_params
* tune
)
11972 const char separator
= ':';
11973 size_t string_length
= strlen (input_string
) + 1;
11974 char *string_root
= (char *) xmalloc (sizeof (*string_root
) * string_length
);
11975 char *string
= string_root
;
11976 strncpy (string
, input_string
, string_length
);
11977 string
[string_length
- 1] = '\0';
11979 char* ntoken
= string
;
11981 while ((ntoken
= strchr (string
, separator
)))
11983 size_t token_length
= ntoken
- string
;
11984 /* Make this substring look like a string. */
11986 aarch64_parse_one_override_token (string
, token_length
, tune
);
11990 /* One last option to parse. */
11991 aarch64_parse_one_override_token (string
, strlen (string
), tune
);
11992 free (string_root
);
11997 aarch64_override_options_after_change_1 (struct gcc_options
*opts
)
11999 if (accepted_branch_protection_string
)
12001 opts
->x_aarch64_branch_protection_string
12002 = xstrdup (accepted_branch_protection_string
);
12005 /* PR 70044: We have to be careful about being called multiple times for the
12006 same function. This means all changes should be repeatable. */
12008 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
12009 Disable the frame pointer flag so the mid-end will not use a frame
12010 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
12011 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
12012 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
12013 aarch64_use_frame_pointer
= opts
->x_flag_omit_frame_pointer
!= 1;
12014 if (opts
->x_flag_omit_frame_pointer
== 0)
12015 opts
->x_flag_omit_frame_pointer
= 2;
12017 /* If not optimizing for size, set the default
12018 alignment to what the target wants. */
12019 if (!opts
->x_optimize_size
)
12021 if (opts
->x_flag_align_loops
&& !opts
->x_str_align_loops
)
12022 opts
->x_str_align_loops
= aarch64_tune_params
.loop_align
;
12023 if (opts
->x_flag_align_jumps
&& !opts
->x_str_align_jumps
)
12024 opts
->x_str_align_jumps
= aarch64_tune_params
.jump_align
;
12025 if (opts
->x_flag_align_functions
&& !opts
->x_str_align_functions
)
12026 opts
->x_str_align_functions
= aarch64_tune_params
.function_align
;
12029 /* We default to no pc-relative literal loads. */
12031 aarch64_pcrelative_literal_loads
= false;
12033 /* If -mpc-relative-literal-loads is set on the command line, this
12034 implies that the user asked for PC relative literal loads. */
12035 if (opts
->x_pcrelative_literal_loads
== 1)
12036 aarch64_pcrelative_literal_loads
= true;
12038 /* In the tiny memory model it makes no sense to disallow PC relative
12039 literal pool loads. */
12040 if (aarch64_cmodel
== AARCH64_CMODEL_TINY
12041 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
12042 aarch64_pcrelative_literal_loads
= true;
12044 /* When enabling the lower precision Newton series for the square root, also
12045 enable it for the reciprocal square root, since the latter is an
12046 intermediary step for the former. */
12047 if (flag_mlow_precision_sqrt
)
12048 flag_mrecip_low_precision_sqrt
= true;
12051 /* 'Unpack' up the internal tuning structs and update the options
12052 in OPTS. The caller must have set up selected_tune and selected_arch
12053 as all the other target-specific codegen decisions are
12054 derived from them. */
12057 aarch64_override_options_internal (struct gcc_options
*opts
)
12059 aarch64_tune_flags
= selected_tune
->flags
;
12060 aarch64_tune
= selected_tune
->sched_core
;
12061 /* Make a copy of the tuning parameters attached to the core, which
12062 we may later overwrite. */
12063 aarch64_tune_params
= *(selected_tune
->tune
);
12064 aarch64_architecture_version
= selected_arch
->architecture_version
;
12066 if (opts
->x_aarch64_override_tune_string
)
12067 aarch64_parse_override_string (opts
->x_aarch64_override_tune_string
,
12068 &aarch64_tune_params
);
12070 /* This target defaults to strict volatile bitfields. */
12071 if (opts
->x_flag_strict_volatile_bitfields
< 0 && abi_version_at_least (2))
12072 opts
->x_flag_strict_volatile_bitfields
= 1;
12074 if (aarch64_stack_protector_guard
== SSP_GLOBAL
12075 && opts
->x_aarch64_stack_protector_guard_offset_str
)
12077 error ("incompatible options %<-mstack-protector-guard=global%> and "
12078 "%<-mstack-protector-guard-offset=%s%>",
12079 aarch64_stack_protector_guard_offset_str
);
12082 if (aarch64_stack_protector_guard
== SSP_SYSREG
12083 && !(opts
->x_aarch64_stack_protector_guard_offset_str
12084 && opts
->x_aarch64_stack_protector_guard_reg_str
))
12086 error ("both %<-mstack-protector-guard-offset%> and "
12087 "%<-mstack-protector-guard-reg%> must be used "
12088 "with %<-mstack-protector-guard=sysreg%>");
12091 if (opts
->x_aarch64_stack_protector_guard_reg_str
)
12093 if (strlen (opts
->x_aarch64_stack_protector_guard_reg_str
) > 100)
12094 error ("specify a system register with a small string length.");
12097 if (opts
->x_aarch64_stack_protector_guard_offset_str
)
12100 const char *str
= aarch64_stack_protector_guard_offset_str
;
12102 long offs
= strtol (aarch64_stack_protector_guard_offset_str
, &end
, 0);
12103 if (!*str
|| *end
|| errno
)
12104 error ("%qs is not a valid offset in %qs", str
,
12105 "-mstack-protector-guard-offset=");
12106 aarch64_stack_protector_guard_offset
= offs
;
12109 initialize_aarch64_code_model (opts
);
12110 initialize_aarch64_tls_size (opts
);
12112 int queue_depth
= 0;
12113 switch (aarch64_tune_params
.autoprefetcher_model
)
12115 case tune_params::AUTOPREFETCHER_OFF
:
12118 case tune_params::AUTOPREFETCHER_WEAK
:
12121 case tune_params::AUTOPREFETCHER_STRONG
:
12122 queue_depth
= max_insn_queue_index
+ 1;
12125 gcc_unreachable ();
12128 /* We don't mind passing in global_options_set here as we don't use
12129 the *options_set structs anyway. */
12130 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH
,
12132 opts
->x_param_values
,
12133 global_options_set
.x_param_values
);
12135 /* Set up parameters to be used in prefetching algorithm. Do not
12136 override the defaults unless we are tuning for a core we have
12137 researched values for. */
12138 if (aarch64_tune_params
.prefetch
->num_slots
> 0)
12139 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES
,
12140 aarch64_tune_params
.prefetch
->num_slots
,
12141 opts
->x_param_values
,
12142 global_options_set
.x_param_values
);
12143 if (aarch64_tune_params
.prefetch
->l1_cache_size
>= 0)
12144 maybe_set_param_value (PARAM_L1_CACHE_SIZE
,
12145 aarch64_tune_params
.prefetch
->l1_cache_size
,
12146 opts
->x_param_values
,
12147 global_options_set
.x_param_values
);
12148 if (aarch64_tune_params
.prefetch
->l1_cache_line_size
>= 0)
12149 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE
,
12150 aarch64_tune_params
.prefetch
->l1_cache_line_size
,
12151 opts
->x_param_values
,
12152 global_options_set
.x_param_values
);
12153 if (aarch64_tune_params
.prefetch
->l2_cache_size
>= 0)
12154 maybe_set_param_value (PARAM_L2_CACHE_SIZE
,
12155 aarch64_tune_params
.prefetch
->l2_cache_size
,
12156 opts
->x_param_values
,
12157 global_options_set
.x_param_values
);
12158 if (!aarch64_tune_params
.prefetch
->prefetch_dynamic_strides
)
12159 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES
,
12161 opts
->x_param_values
,
12162 global_options_set
.x_param_values
);
12163 if (aarch64_tune_params
.prefetch
->minimum_stride
>= 0)
12164 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE
,
12165 aarch64_tune_params
.prefetch
->minimum_stride
,
12166 opts
->x_param_values
,
12167 global_options_set
.x_param_values
);
12169 /* Use the alternative scheduling-pressure algorithm by default. */
12170 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM
, SCHED_PRESSURE_MODEL
,
12171 opts
->x_param_values
,
12172 global_options_set
.x_param_values
);
12174 /* If the user hasn't changed it via configure then set the default to 64 KB
12175 for the backend. */
12176 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
,
12177 DEFAULT_STK_CLASH_GUARD_SIZE
== 0
12178 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE
,
12179 opts
->x_param_values
,
12180 global_options_set
.x_param_values
);
12182 /* Validate the guard size. */
12183 int guard_size
= PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
);
12185 /* Enforce that interval is the same size as size so the mid-end does the
12187 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL
,
12189 opts
->x_param_values
,
12190 global_options_set
.x_param_values
);
12192 /* The maybe_set calls won't update the value if the user has explicitly set
12193 one. Which means we need to validate that probing interval and guard size
12196 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL
);
12197 if (guard_size
!= probe_interval
)
12198 error ("stack clash guard size %<%d%> must be equal to probing interval "
12199 "%<%d%>", guard_size
, probe_interval
);
12201 /* Enable sw prefetching at specified optimization level for
12202 CPUS that have prefetch. Lower optimization level threshold by 1
12203 when profiling is enabled. */
12204 if (opts
->x_flag_prefetch_loop_arrays
< 0
12205 && !opts
->x_optimize_size
12206 && aarch64_tune_params
.prefetch
->default_opt_level
>= 0
12207 && opts
->x_optimize
>= aarch64_tune_params
.prefetch
->default_opt_level
)
12208 opts
->x_flag_prefetch_loop_arrays
= 1;
12210 if (opts
->x_aarch64_arch_string
== NULL
)
12211 opts
->x_aarch64_arch_string
= selected_arch
->name
;
12212 if (opts
->x_aarch64_cpu_string
== NULL
)
12213 opts
->x_aarch64_cpu_string
= selected_cpu
->name
;
12214 if (opts
->x_aarch64_tune_string
== NULL
)
12215 opts
->x_aarch64_tune_string
= selected_tune
->name
;
12217 aarch64_override_options_after_change_1 (opts
);
12220 /* Print a hint with a suggestion for a core or architecture name that
12221 most closely resembles what the user passed in STR. ARCH is true if
12222 the user is asking for an architecture name. ARCH is false if the user
12223 is asking for a core name. */
12226 aarch64_print_hint_for_core_or_arch (const char *str
, bool arch
)
12228 auto_vec
<const char *> candidates
;
12229 const struct processor
*entry
= arch
? all_architectures
: all_cores
;
12230 for (; entry
->name
!= NULL
; entry
++)
12231 candidates
.safe_push (entry
->name
);
12233 #ifdef HAVE_LOCAL_CPU_DETECT
12234 /* Add also "native" as possible value. */
12236 candidates
.safe_push ("native");
12240 const char *hint
= candidates_list_and_hint (str
, s
, candidates
);
12242 inform (input_location
, "valid arguments are: %s;"
12243 " did you mean %qs?", s
, hint
);
12245 inform (input_location
, "valid arguments are: %s", s
);
12250 /* Print a hint with a suggestion for a core name that most closely resembles
12251 what the user passed in STR. */
12254 aarch64_print_hint_for_core (const char *str
)
12256 aarch64_print_hint_for_core_or_arch (str
, false);
12259 /* Print a hint with a suggestion for an architecture name that most closely
12260 resembles what the user passed in STR. */
12263 aarch64_print_hint_for_arch (const char *str
)
12265 aarch64_print_hint_for_core_or_arch (str
, true);
12269 /* Print a hint with a suggestion for an extension name
12270 that most closely resembles what the user passed in STR. */
12273 aarch64_print_hint_for_extensions (const std::string
&str
)
12275 auto_vec
<const char *> candidates
;
12276 aarch64_get_all_extension_candidates (&candidates
);
12278 const char *hint
= candidates_list_and_hint (str
.c_str (), s
, candidates
);
12280 inform (input_location
, "valid arguments are: %s;"
12281 " did you mean %qs?", s
, hint
);
12283 inform (input_location
, "valid arguments are: %s;", s
);
12288 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
12289 specified in STR and throw errors if appropriate. Put the results if
12290 they are valid in RES and ISA_FLAGS. Return whether the option is
12294 aarch64_validate_mcpu (const char *str
, const struct processor
**res
,
12295 uint64_t *isa_flags
)
12297 std::string invalid_extension
;
12298 enum aarch64_parse_opt_result parse_res
12299 = aarch64_parse_cpu (str
, res
, isa_flags
, &invalid_extension
);
12301 if (parse_res
== AARCH64_PARSE_OK
)
12306 case AARCH64_PARSE_MISSING_ARG
:
12307 error ("missing cpu name in %<-mcpu=%s%>", str
);
12309 case AARCH64_PARSE_INVALID_ARG
:
12310 error ("unknown value %qs for %<-mcpu%>", str
);
12311 aarch64_print_hint_for_core (str
);
12313 case AARCH64_PARSE_INVALID_FEATURE
:
12314 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
12315 invalid_extension
.c_str (), str
);
12316 aarch64_print_hint_for_extensions (invalid_extension
);
12319 gcc_unreachable ();
12325 /* Parses CONST_STR for branch protection features specified in
12326 aarch64_branch_protect_types, and set any global variables required. Returns
12327 the parsing result and assigns LAST_STR to the last processed token from
12328 CONST_STR so that it can be used for error reporting. */
12331 aarch64_parse_opt_result
aarch64_parse_branch_protection (const char *const_str
,
12334 char *str_root
= xstrdup (const_str
);
12335 char* token_save
= NULL
;
12336 char *str
= strtok_r (str_root
, "+", &token_save
);
12337 enum aarch64_parse_opt_result res
= AARCH64_PARSE_OK
;
12339 res
= AARCH64_PARSE_MISSING_ARG
;
12342 char *next_str
= strtok_r (NULL
, "+", &token_save
);
12343 /* Reset the branch protection features to their defaults. */
12344 aarch64_handle_no_branch_protection (NULL
, NULL
);
12346 while (str
&& res
== AARCH64_PARSE_OK
)
12348 const aarch64_branch_protect_type
* type
= aarch64_branch_protect_types
;
12349 bool found
= false;
12350 /* Search for this type. */
12351 while (type
&& type
->name
&& !found
&& res
== AARCH64_PARSE_OK
)
12353 if (strcmp (str
, type
->name
) == 0)
12356 res
= type
->handler (str
, next_str
);
12358 next_str
= strtok_r (NULL
, "+", &token_save
);
12363 if (found
&& res
== AARCH64_PARSE_OK
)
12365 bool found_subtype
= true;
12366 /* Loop through each token until we find one that isn't a
12368 while (found_subtype
)
12370 found_subtype
= false;
12371 const aarch64_branch_protect_type
*subtype
= type
->subtypes
;
12372 /* Search for the subtype. */
12373 while (str
&& subtype
&& subtype
->name
&& !found_subtype
12374 && res
== AARCH64_PARSE_OK
)
12376 if (strcmp (str
, subtype
->name
) == 0)
12378 found_subtype
= true;
12379 res
= subtype
->handler (str
, next_str
);
12381 next_str
= strtok_r (NULL
, "+", &token_save
);
12389 res
= AARCH64_PARSE_INVALID_ARG
;
12392 /* Copy the last processed token into the argument to pass it back.
12393 Used by option and attribute validation to print the offending token. */
12396 if (str
) strcpy (*last_str
, str
);
12397 else *last_str
= NULL
;
12399 if (res
== AARCH64_PARSE_OK
)
12401 /* If needed, alloc the accepted string then copy in const_str.
12402 Used by override_option_after_change_1. */
12403 if (!accepted_branch_protection_string
)
12404 accepted_branch_protection_string
= (char *) xmalloc (
12405 BRANCH_PROTECT_STR_MAX
12407 strncpy (accepted_branch_protection_string
, const_str
,
12408 BRANCH_PROTECT_STR_MAX
+ 1);
12409 /* Forcibly null-terminate. */
12410 accepted_branch_protection_string
[BRANCH_PROTECT_STR_MAX
] = '\0';
12416 aarch64_validate_mbranch_protection (const char *const_str
)
12418 char *str
= (char *) xmalloc (strlen (const_str
));
12419 enum aarch64_parse_opt_result res
=
12420 aarch64_parse_branch_protection (const_str
, &str
);
12421 if (res
== AARCH64_PARSE_INVALID_ARG
)
12422 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str
);
12423 else if (res
== AARCH64_PARSE_MISSING_ARG
)
12424 error ("missing argument for %<-mbranch-protection=%>");
12426 return res
== AARCH64_PARSE_OK
;
/* Validate a command-line -march option.  Parse the arch and extensions
   (if any) specified in STR and throw errors if appropriate.  Put the
   results, if they are valid, in RES and ISA_FLAGS.  Return whether the
   option is valid.  */

static bool
aarch64_validate_march (const char *str, const struct processor **res,
			uint64_t *isa_flags)
{
  std::string invalid_extension;
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);

  if (parse_res == AARCH64_PARSE_OK)
    return true;

  switch (parse_res)
    {
    case AARCH64_PARSE_MISSING_ARG:
      error ("missing arch name in %<-march=%s%>", str);
      break;
    case AARCH64_PARSE_INVALID_ARG:
      error ("unknown value %qs for %<-march%>", str);
      aarch64_print_hint_for_arch (str);
      break;
    case AARCH64_PARSE_INVALID_FEATURE:
      error ("invalid feature modifier %qs in %<-march=%s%>",
	     invalid_extension.c_str (), str);
      aarch64_print_hint_for_extensions (invalid_extension);
      break;
    default:
      gcc_unreachable ();
    }

  return false;
}

/* Validate a command-line -mtune option.  Parse the cpu
   specified in STR and throw errors if appropriate.  Put the
   result, if it is valid, in RES.  Return whether the option is
   valid.  */

static bool
aarch64_validate_mtune (const char *str, const struct processor **res)
{
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_tune (str, res);

  if (parse_res == AARCH64_PARSE_OK)
    return true;

  switch (parse_res)
    {
    case AARCH64_PARSE_MISSING_ARG:
      error ("missing cpu name in %<-mtune=%s%>", str);
      break;
    case AARCH64_PARSE_INVALID_ARG:
      error ("unknown value %qs for %<-mtune%>", str);
      aarch64_print_hint_for_core (str);
      break;
    default:
      gcc_unreachable ();
    }

  return false;
}
/* Return the CPU corresponding to the enum CPU.
   If it doesn't specify a cpu, return the default.  */

static const struct processor *
aarch64_get_tune_cpu (enum aarch64_processor cpu)
{
  if (cpu != aarch64_none)
    return &all_cores[cpu];

  /* The & 0x3f is to extract the bottom 6 bits that encode the
     default cpu as selected by the --with-cpu GCC configure option.
     ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
     flags mechanism should be reworked to make it more sane.  */
  return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
}

/* Return the architecture corresponding to the enum ARCH.
   If it doesn't specify a valid architecture, return the default.  */

static const struct processor *
aarch64_get_arch (enum aarch64_arch arch)
{
  if (arch != aarch64_no_arch)
    return &all_architectures[arch];

  const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];

  return &all_architectures[cpu->arch];
}

/* Return the VG value associated with -msve-vector-bits= value VALUE.  */

static poly_uint16
aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
{
  /* For now generate vector-length agnostic code for -msve-vector-bits=128.
     This ensures we can clearly distinguish SVE and Advanced SIMD modes when
     deciding which .md file patterns to use and when deciding whether
     something is a legitimate address or constant.  */
  if (value == SVE_SCALABLE || value == SVE_128)
    return poly_uint16 (2, 2);

  return (int) value / 64;
}
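/* Illustration only (not part of GCC itself): for fixed SVE vector lengths
   the VG count is the number of 64-bit granules in a vector, so
   -msve-vector-bits=256 gives (int) 256 / 64 == 4 and -msve-vector-bits=2048
   gives 32.  -msve-vector-bits=scalable and (for now) -msve-vector-bits=128
   both yield the length-agnostic poly_uint16 (2, 2).  */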
/* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
   and is used to parse the -m{cpu,tune,arch} strings and setup the initial
   tuning structs.  In particular it must set selected_tune and
   aarch64_isa_flags that define the available ISA features and tuning
   decisions.  It must also set selected_arch as this will be used to
   output the .arch asm tags for each function.  */

static void
aarch64_override_options (void)
{
  uint64_t cpu_isa = 0;
  uint64_t arch_isa = 0;
  aarch64_isa_flags = 0;

  bool valid_cpu = true;
  bool valid_tune = true;
  bool valid_arch = true;

  selected_cpu = NULL;
  selected_arch = NULL;
  selected_tune = NULL;

  if (aarch64_branch_protection_string)
    aarch64_validate_mbranch_protection (aarch64_branch_protection_string);

  /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
     If either of -march or -mtune is given, they override their
     respective component of -mcpu.  */
  if (aarch64_cpu_string)
    valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
				       &cpu_isa);

  if (aarch64_arch_string)
    valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
					 &arch_isa);

  if (aarch64_tune_string)
    valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);

#ifdef SUBTARGET_OVERRIDE_OPTIONS
  SUBTARGET_OVERRIDE_OPTIONS;
#endif

  /* If the user did not specify a processor, choose the default
     one for them.  This will be the CPU set during configuration using
     --with-cpu, otherwise it is "generic".  */
  if (!selected_cpu)
    {
      if (selected_arch)
	{
	  selected_cpu = &all_cores[selected_arch->ident];
	  aarch64_isa_flags = arch_isa;
	  explicit_arch = selected_arch->arch;
	}
      else
	{
	  /* Get default configure-time CPU.  */
	  selected_cpu = aarch64_get_tune_cpu (aarch64_none);
	  aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
	}

      if (selected_tune)
	explicit_tune_core = selected_tune->ident;
    }
  /* If both -mcpu and -march are specified check that they are architecturally
     compatible, warn if they're not and prefer the -march ISA flags.  */
  else if (selected_arch)
    {
      if (selected_arch->arch != selected_cpu->arch)
	{
	  warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
		   all_architectures[selected_cpu->arch].name,
		   selected_arch->name);
	}
      aarch64_isa_flags = arch_isa;
      explicit_arch = selected_arch->arch;
      explicit_tune_core = selected_tune ? selected_tune->ident
					 : selected_cpu->ident;
    }
  else
    {
      /* -mcpu but no -march.  */
      aarch64_isa_flags = cpu_isa;
      explicit_tune_core = selected_tune ? selected_tune->ident
					 : selected_cpu->ident;
      gcc_assert (selected_cpu);
      selected_arch = &all_architectures[selected_cpu->arch];
      explicit_arch = selected_arch->arch;
    }

  /* Set the arch as well, as we will need it when outputting
     the .arch directive in assembly.  */
  if (!selected_arch)
    {
      gcc_assert (selected_cpu);
      selected_arch = &all_architectures[selected_cpu->arch];
    }

  if (!selected_tune)
    selected_tune = selected_cpu;

  if (aarch64_enable_bti == 2)
    {
#ifdef TARGET_ENABLE_BTI
      aarch64_enable_bti = 1;
#else
      aarch64_enable_bti = 0;
#endif
    }

  /* Return address signing is currently not supported for ILP32 targets.  For
     LP64 targets use the configured option in the absence of a command-line
     option for -mbranch-protection.  */
  if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
    {
#ifdef TARGET_ENABLE_PAC_RET
      aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
#else
      aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
#endif
    }

#ifndef HAVE_AS_MABI_OPTION
  /* The compiler may have been configured with 2.23.* binutils, which does
     not have support for ILP32.  */
  if (TARGET_ILP32)
    error ("assembler does not support %<-mabi=ilp32%>");
#endif

  /* Convert -msve-vector-bits to a VG count.  */
  aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);

  if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
    sorry ("return address signing is only supported for %<-mabi=lp64%>");

  /* Make sure we properly set up the explicit options.  */
  if ((aarch64_cpu_string && valid_cpu)
      || (aarch64_tune_string && valid_tune))
    gcc_assert (explicit_tune_core != aarch64_none);

  if ((aarch64_cpu_string && valid_cpu)
      || (aarch64_arch_string && valid_arch))
    gcc_assert (explicit_arch != aarch64_no_arch);

  /* The pass to insert speculation tracking runs before
     shrink-wrapping and the latter does not know how to update the
     tracking status.  So disable it in this case.  */
  if (aarch64_track_speculation)
    flag_shrink_wrap = 0;

  aarch64_override_options_internal (&global_options);

  /* Save these options as the default ones in case we push and pop them later
     while processing functions with potential target attributes.  */
  target_option_default_node = target_option_current_node
    = build_target_option_node (&global_options);
}
/* Implement targetm.override_options_after_change.  */

static void
aarch64_override_options_after_change (void)
{
  aarch64_override_options_after_change_1 (&global_options);
}

static struct machine_function *
aarch64_init_machine_status (void)
{
  struct machine_function *machine;
  machine = ggc_cleared_alloc<machine_function> ();
  return machine;
}

void
aarch64_init_expanders (void)
{
  init_machine_status = aarch64_init_machine_status;
}
/* A checking mechanism for the implementation of the various code models.  */

static void
initialize_aarch64_code_model (struct gcc_options *opts)
{
  if (opts->x_flag_pic)
    {
      switch (opts->x_aarch64_cmodel_var)
	{
	case AARCH64_CMODEL_TINY:
	  aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
	  break;
	case AARCH64_CMODEL_SMALL:
#ifdef HAVE_AS_SMALL_PIC_RELOCS
	  aarch64_cmodel = (flag_pic == 2
			    ? AARCH64_CMODEL_SMALL_PIC
			    : AARCH64_CMODEL_SMALL_SPIC);
#else
	  aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
#endif
	  break;
	case AARCH64_CMODEL_LARGE:
	  sorry ("code model %qs with %<-f%s%>", "large",
		 opts->x_flag_pic > 1 ? "PIC" : "pic");
	  break;
	default:
	  gcc_unreachable ();
	}
    }
  else
    aarch64_cmodel = opts->x_aarch64_cmodel_var;
}
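/* Illustration only (not part of GCC itself): with the default small code
   model, compiling with -fPIC (flag_pic == 2) selects
   AARCH64_CMODEL_SMALL_PIC and compiling with -fpic selects
   AARCH64_CMODEL_SMALL_SPIC, assuming the assembler supports the small PIC
   relocations.  Combining -mcmodel=large with -fpic/-fPIC is rejected with
   the "sorry" above.  */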
/* Implement TARGET_OPTION_SAVE.  */

static void
aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
{
  ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
  ptr->x_aarch64_branch_protection_string
    = opts->x_aarch64_branch_protection_string;
}
/* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
   using the information saved in PTR.  */

static void
aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
{
  opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
  selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
  opts->x_explicit_arch = ptr->x_explicit_arch;
  selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
  opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
  opts->x_aarch64_branch_protection_string
    = ptr->x_aarch64_branch_protection_string;
  if (opts->x_aarch64_branch_protection_string)
    {
      aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
				       NULL);
    }

  aarch64_override_options_internal (opts);
}
/* Implement TARGET_OPTION_PRINT.  */

static void
aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
{
  const struct processor *cpu
    = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
  uint64_t isa_flags = ptr->x_aarch64_isa_flags;
  const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
  std::string extension
    = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);

  fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
  fprintf (file, "%*sselected arch = %s%s\n", indent, "",
	   arch->name, extension.c_str ());
}
static GTY(()) tree aarch64_previous_fndecl;

void
aarch64_reset_previous_fndecl (void)
{
  aarch64_previous_fndecl = NULL;
}
/* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
   Used by aarch64_set_current_function and aarch64_pragma_target_parse to
   make sure optab availability predicates are recomputed when necessary.  */

void
aarch64_save_restore_target_globals (tree new_tree)
{
  if (TREE_TARGET_GLOBALS (new_tree))
    restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
  else if (new_tree == target_option_default_node)
    restore_target_globals (&default_target_globals);
  else
    TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
}
/* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
   like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
   of the function, if such exists.  This function may be called multiple
   times on a single function so use aarch64_previous_fndecl to avoid
   setting up identical state.  */

static void
aarch64_set_current_function (tree fndecl)
{
  if (!fndecl || fndecl == aarch64_previous_fndecl)
    return;

  tree old_tree = (aarch64_previous_fndecl
		   ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
		   : NULL_TREE);

  tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);

  /* If current function has no attributes but the previous one did,
     use the default node.  */
  if (!new_tree && old_tree)
    new_tree = target_option_default_node;

  /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
     the default have been handled by aarch64_save_restore_target_globals from
     aarch64_pragma_target_parse.  */
  if (old_tree == new_tree)
    return;

  aarch64_previous_fndecl = fndecl;

  /* First set the target options.  */
  cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));

  aarch64_save_restore_target_globals (new_tree);
}
/* Enum describing the various ways we can handle attributes.
   In many cases we can reuse the generic option handling machinery.  */

enum aarch64_attr_opt_type
{
  aarch64_attr_mask,	/* Attribute should set a bit in target_flags.  */
  aarch64_attr_bool,	/* Attribute sets or unsets a boolean variable.  */
  aarch64_attr_enum,	/* Attribute sets an enum variable.  */
  aarch64_attr_custom	/* Attribute requires a custom handling function.  */
};

/* All the information needed to handle a target attribute.
   NAME is the name of the attribute.
   ATTR_TYPE specifies the type of behavior of the attribute as described
   in the definition of enum aarch64_attr_opt_type.
   ALLOW_NEG is true if the attribute supports a "no-" form.
   HANDLER is the function that takes the attribute string as an argument.
   It is needed only when the ATTR_TYPE is aarch64_attr_custom.
   OPT_NUM is the enum specifying the option that the attribute modifies.
   This is needed for attributes that mirror the behavior of a command-line
   option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
   aarch64_attr_enum.  */

struct aarch64_attribute_info
{
  const char *name;
  enum aarch64_attr_opt_type attr_type;
  bool allow_neg;
  bool (*handler) (const char *);
  enum opt_code opt_num;
};
/* Handle the ARCH_STR argument to the arch= target attribute.  */

static bool
aarch64_handle_attr_arch (const char *str)
{
  const struct processor *tmp_arch = NULL;
  std::string invalid_extension;
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);

  if (parse_res == AARCH64_PARSE_OK)
    {
      gcc_assert (tmp_arch);
      selected_arch = tmp_arch;
      explicit_arch = selected_arch->arch;
      return true;
    }

  switch (parse_res)
    {
    case AARCH64_PARSE_MISSING_ARG:
      error ("missing name in %<target(\"arch=\")%> pragma or attribute");
      break;
    case AARCH64_PARSE_INVALID_ARG:
      error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
      aarch64_print_hint_for_arch (str);
      break;
    case AARCH64_PARSE_INVALID_FEATURE:
      error ("invalid feature modifier %s of value (\"%s\") in "
	     "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
      aarch64_print_hint_for_extensions (invalid_extension);
      break;
    default:
      gcc_unreachable ();
    }

  return false;
}
/* Handle the argument CPU_STR to the cpu= target attribute.  */

static bool
aarch64_handle_attr_cpu (const char *str)
{
  const struct processor *tmp_cpu = NULL;
  std::string invalid_extension;
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);

  if (parse_res == AARCH64_PARSE_OK)
    {
      gcc_assert (tmp_cpu);
      selected_tune = tmp_cpu;
      explicit_tune_core = selected_tune->ident;

      selected_arch = &all_architectures[tmp_cpu->arch];
      explicit_arch = selected_arch->arch;
      return true;
    }

  switch (parse_res)
    {
    case AARCH64_PARSE_MISSING_ARG:
      error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
      break;
    case AARCH64_PARSE_INVALID_ARG:
      error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
      aarch64_print_hint_for_core (str);
      break;
    case AARCH64_PARSE_INVALID_FEATURE:
      error ("invalid feature modifier %s of value (\"%s\") in "
	     "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
      aarch64_print_hint_for_extensions (invalid_extension);
      break;
    default:
      gcc_unreachable ();
    }

  return false;
}
/* Handle the argument STR to the branch-protection= attribute.  */

static bool
aarch64_handle_attr_branch_protection (const char* str)
{
  char *err_str = (char *) xmalloc (strlen (str));
  enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
								       &err_str);
  bool success = false;
  switch (res)
    {
    case AARCH64_PARSE_MISSING_ARG:
      error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
	     " attribute");
      break;
    case AARCH64_PARSE_INVALID_ARG:
      error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
	     "=\")%> pragma or attribute", err_str);
      break;
    case AARCH64_PARSE_OK:
      success = true;
      /* Fall through.  */
    case AARCH64_PARSE_INVALID_FEATURE:
      break;
    default:
      gcc_unreachable ();
    }
  free (err_str);
  return success;
}
/* Handle the argument STR to the tune= target attribute.  */

static bool
aarch64_handle_attr_tune (const char *str)
{
  const struct processor *tmp_tune = NULL;
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_tune (str, &tmp_tune);

  if (parse_res == AARCH64_PARSE_OK)
    {
      gcc_assert (tmp_tune);
      selected_tune = tmp_tune;
      explicit_tune_core = selected_tune->ident;
      return true;
    }

  switch (parse_res)
    {
    case AARCH64_PARSE_INVALID_ARG:
      error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
      aarch64_print_hint_for_core (str);
      break;
    default:
      gcc_unreachable ();
    }

  return false;
}
/* Parse an architecture extensions target attribute string specified in STR.
   For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
   if successful.  Update aarch64_isa_flags to reflect the ISA features
   modified.  */

static bool
aarch64_handle_attr_isa_flags (char *str)
{
  enum aarch64_parse_opt_result parse_res;
  uint64_t isa_flags = aarch64_isa_flags;

  /* We allow "+nothing" in the beginning to clear out all architectural
     features if the user wants to handpick specific features.  */
  if (strncmp ("+nothing", str, 8) == 0)
    {
      isa_flags = 0;
      str += 8;
    }

  std::string invalid_extension;
  parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);

  if (parse_res == AARCH64_PARSE_OK)
    {
      aarch64_isa_flags = isa_flags;
      return true;
    }

  switch (parse_res)
    {
    case AARCH64_PARSE_MISSING_ARG:
      error ("missing value in %<target()%> pragma or attribute");
      break;

    case AARCH64_PARSE_INVALID_FEATURE:
      error ("invalid feature modifier %s of value (\"%s\") in "
	     "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
      break;

    default:
      gcc_unreachable ();
    }

  return false;
}
/* The target attributes that we support.  On top of these we also support just
   ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
   handled explicitly in aarch64_process_one_target_attr.  */

static const struct aarch64_attribute_info aarch64_attributes[] =
{
  { "general-regs-only", aarch64_attr_mask, false, NULL,
     OPT_mgeneral_regs_only },
  { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
     OPT_mfix_cortex_a53_835769 },
  { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
     OPT_mfix_cortex_a53_843419 },
  { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
  { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
  { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
     OPT_momit_leaf_frame_pointer },
  { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
  { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
     OPT_march_ },
  { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
  { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
     OPT_mtune_ },
  { "branch-protection", aarch64_attr_custom, false,
     aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
  { "sign-return-address", aarch64_attr_enum, false, NULL,
     OPT_msign_return_address_ },
  { NULL, aarch64_attr_custom, false, NULL, OPT____ }
};
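/* Illustration only (not part of GCC itself): user code can drive the table
   above with, for example:

     __attribute__ ((target ("arch=armv8.2-a+fp16")))
     void fast_fp16 (void);

     __attribute__ ((target ("no-omit-leaf-frame-pointer")))
     void keep_fp (void);

     __attribute__ ((target ("+crc")))
     unsigned add_crc (unsigned c, unsigned v);

   The first two go through the "arch" and "omit-leaf-frame-pointer" entries;
   the last one starts with '+' and is handled directly by
   aarch64_handle_attr_isa_flags.  */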
/* Parse ARG_STR which contains the definition of one target attribute.
   Show appropriate errors if any or return true if the attribute is valid.  */

static bool
aarch64_process_one_target_attr (char *arg_str)
{
  bool invert = false;

  size_t len = strlen (arg_str);

  if (len == 0)
    {
      error ("malformed %<target()%> pragma or attribute");
      return false;
    }

  char *str_to_check = (char *) alloca (len + 1);
  strcpy (str_to_check, arg_str);

  /* We have something like __attribute__ ((target ("+fp+nosimd"))).
     It is easier to detect and handle it explicitly here rather than going
     through the machinery for the rest of the target attributes in this
     function.  */
  if (*str_to_check == '+')
    return aarch64_handle_attr_isa_flags (str_to_check);

  if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
    {
      invert = true;
      str_to_check += 3;
    }
  char *arg = strchr (str_to_check, '=');

  /* If we found opt=foo then terminate STR_TO_CHECK at the '='
     and point ARG to "foo".  */
  if (arg)
    {
      *arg = '\0';
      arg++;
    }
  const struct aarch64_attribute_info *p_attr;
  bool found = false;
  for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
    {
      /* If the names don't match up, or the user has given an argument
	 to an attribute that doesn't accept one, or didn't give an argument
	 to an attribute that expects one, fail to match.  */
      if (strcmp (str_to_check, p_attr->name) != 0)
	continue;

      found = true;
      bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
			     || p_attr->attr_type == aarch64_attr_enum;

      if (attr_need_arg_p ^ (arg != NULL))
	{
	  error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
	  return false;
	}

      /* If the name matches but the attribute does not allow "no-" versions
	 then we can't match.  */
      if (invert && !p_attr->allow_neg)
	{
	  error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
	  return false;
	}

      switch (p_attr->attr_type)
	{
	/* Has a custom handler registered.
	   For example, cpu=, arch=, tune=.  */
	case aarch64_attr_custom:
	  gcc_assert (p_attr->handler);
	  if (!p_attr->handler (arg))
	    return false;
	  break;

	/* Either set or unset a boolean option.  */
	case aarch64_attr_bool:
	  {
	    struct cl_decoded_option decoded;

	    generate_option (p_attr->opt_num, NULL, !invert,
			     CL_TARGET, &decoded);
	    aarch64_handle_option (&global_options, &global_options_set,
				   &decoded, input_location);
	    break;
	  }
	/* Set or unset a bit in the target_flags.  aarch64_handle_option
	   should know what mask to apply given the option number.  */
	case aarch64_attr_mask:
	  {
	    struct cl_decoded_option decoded;
	    /* We only need to specify the option number.
	       aarch64_handle_option will know which mask to apply.  */
	    decoded.opt_index = p_attr->opt_num;
	    decoded.value = !invert;
	    aarch64_handle_option (&global_options, &global_options_set,
				   &decoded, input_location);
	    break;
	  }
	/* Use the option setting machinery to set an option to an enum.  */
	case aarch64_attr_enum:
	  {
	    gcc_assert (arg);
	    bool valid;
	    int value;
	    valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
					   &value, CL_TARGET);
	    if (valid)
	      set_option (&global_options, NULL, p_attr->opt_num, value,
			  NULL, DK_UNSPECIFIED, input_location,
			  global_dc);
	    else
	      error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
	    break;
	  }
	default:
	  gcc_unreachable ();
	}
    }

  /* If we reached here we either have found an attribute and validated
     it or didn't match any.  If we matched an attribute but its arguments
     were malformed we will have returned false already.  */
  return found;
}
/* Count how many times the character C appears in
   NULL-terminated string STR.  */

static unsigned int
num_occurences_in_str (char c, char *str)
{
  unsigned int res = 0;
  while (*str != '\0')
    {
      if (*str == c)
	res++;

      str++;
    }

  return res;
}
/* Parse the tree in ARGS that contains the target attribute information
   and update the global target options space.  */

bool
aarch64_process_target_attr (tree args)
{
  if (TREE_CODE (args) == TREE_LIST)
    {
      do
	{
	  tree head = TREE_VALUE (args);
	  if (head)
	    {
	      if (!aarch64_process_target_attr (head))
		return false;
	    }
	  args = TREE_CHAIN (args);
	} while (args);

      return true;
    }

  if (TREE_CODE (args) != STRING_CST)
    {
      error ("attribute %<target%> argument not a string");
      return false;
    }

  size_t len = strlen (TREE_STRING_POINTER (args));
  char *str_to_check = (char *) alloca (len + 1);
  strcpy (str_to_check, TREE_STRING_POINTER (args));

  if (len == 0)
    {
      error ("malformed %<target()%> pragma or attribute");
      return false;
    }

  /* Used to catch empty spaces between commas i.e.
     attribute ((target ("attr1,,attr2"))).  */
  unsigned int num_commas = num_occurences_in_str (',', str_to_check);

  /* Handle multiple target attributes separated by ','.  */
  char *token = strtok_r (str_to_check, ",", &str_to_check);

  unsigned int num_attrs = 0;
  while (token)
    {
      num_attrs++;
      if (!aarch64_process_one_target_attr (token))
	{
	  error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
	  return false;
	}

      token = strtok_r (NULL, ",", &str_to_check);
    }

  if (num_attrs != num_commas + 1)
    {
      error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
      return false;
    }

  return true;
}
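/* Illustration only (not part of GCC itself): a single attribute string may
   carry several comma-separated entries, e.g.

     __attribute__ ((target ("cpu=cortex-a72,strict-align")))
     void f (void);

   Each token is validated in turn; a string such as
   "cpu=cortex-a72,,strict-align" fails the num_attrs != num_commas + 1
   check above because of the empty token between the commas.  */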
/* Implement TARGET_OPTION_VALID_ATTRIBUTE_P.  This is used to
   process attribute ((target ("..."))).  */

static bool
aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
{
  struct cl_target_option cur_target;
  bool ret;
  tree old_optimize;
  tree new_target, new_optimize;
  tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);

  /* If what we're processing is the current pragma string then the
     target option node is already stored in target_option_current_node
     by aarch64_pragma_target_parse in aarch64-c.c.  Use that to avoid
     having to re-parse the string.  This is especially useful to keep
     arm_neon.h compile times down since that header contains a lot
     of intrinsics enclosed in pragmas.  */
  if (!existing_target && args == current_target_pragma)
    {
      DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
      return true;
    }
  tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);

  old_optimize = build_optimization_node (&global_options);
  func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);

  /* If the function changed the optimization levels as well as setting
     target options, start with the optimizations specified.  */
  if (func_optimize && func_optimize != old_optimize)
    cl_optimization_restore (&global_options,
			     TREE_OPTIMIZATION (func_optimize));

  /* Save the current target options to restore at the end.  */
  cl_target_option_save (&cur_target, &global_options);

  /* If fndecl already has some target attributes applied to it, unpack
     them so that we add this attribute on top of them, rather than
     overwriting them.  */
  if (existing_target)
    {
      struct cl_target_option *existing_options
	= TREE_TARGET_OPTION (existing_target);

      if (existing_options)
	cl_target_option_restore (&global_options, existing_options);
    }
  else
    cl_target_option_restore (&global_options,
			      TREE_TARGET_OPTION (target_option_current_node));

  ret = aarch64_process_target_attr (args);

  /* Set up any additional state.  */
  if (ret)
    {
      aarch64_override_options_internal (&global_options);
      /* Initialize SIMD builtins if we haven't already.
	 Set current_target_pragma to NULL for the duration so that
	 the builtin initialization code doesn't try to tag the functions
	 being built with the attributes specified by any current pragma, thus
	 going into an infinite recursion.  */
      if (TARGET_SIMD)
	{
	  tree saved_current_target_pragma = current_target_pragma;
	  current_target_pragma = NULL;
	  aarch64_init_simd_builtins ();
	  current_target_pragma = saved_current_target_pragma;
	}
      new_target = build_target_option_node (&global_options);
    }
  else
    new_target = NULL;

  new_optimize = build_optimization_node (&global_options);

  if (fndecl && ret)
    {
      DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;

      if (old_optimize != new_optimize)
	DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
    }

  cl_target_option_restore (&global_options, &cur_target);

  if (old_optimize != new_optimize)
    cl_optimization_restore (&global_options,
			     TREE_OPTIMIZATION (old_optimize));
  return ret;
}
/* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
   tri-bool options (yes, no, don't care) and the default value is
   DEF, determine whether to reject inlining.  */

static bool
aarch64_tribools_ok_for_inlining_p (int caller, int callee,
				    int dont_care, int def)
{
  /* If the callee doesn't care, always allow inlining.  */
  if (callee == dont_care)
    return true;

  /* If the caller doesn't care, always allow inlining.  */
  if (caller == dont_care)
    return true;

  /* Otherwise, allow inlining if either the callee and caller values
     agree, or if the callee is using the default value.  */
  return (callee == caller || callee == def);
}
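/* Illustration only (not part of GCC itself): with dont_care == 2 and
   def == TARGET_FIX_ERR_A53_835769_DEFAULT, a callee compiled without any
   explicit -mfix-cortex-a53-835769 setting (callee == 2) can always be
   inlined, whereas a callee that explicitly enables the workaround
   (callee == 1) can only be inlined into a caller that also enables it,
   does not care, or matches the default.  */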
/* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
   to inline CALLEE into CALLER based on target-specific info.
   Make sure that the caller and callee have compatible architectural
   features.  Then go through the other possible target attributes
   and see if they can block inlining.  Try not to reject always_inline
   callees unless they are incompatible architecturally.  */

static bool
aarch64_can_inline_p (tree caller, tree callee)
{
  tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
  tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);

  struct cl_target_option *caller_opts
    = TREE_TARGET_OPTION (caller_tree ? caller_tree
				      : target_option_default_node);

  struct cl_target_option *callee_opts
    = TREE_TARGET_OPTION (callee_tree ? callee_tree
				      : target_option_default_node);

  /* Callee's ISA flags should be a subset of the caller's.  */
  if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
      != callee_opts->x_aarch64_isa_flags)
    return false;

  /* Allow non-strict aligned functions inlining into strict
     aligned ones.  */
  if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
       != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
      && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
	   && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
    return false;

  bool always_inline = lookup_attribute ("always_inline",
					 DECL_ATTRIBUTES (callee));

  /* If the architectural features match up and the callee is always_inline
     then the other attributes don't matter.  */
  if (always_inline)
    return true;

  if (caller_opts->x_aarch64_cmodel_var
      != callee_opts->x_aarch64_cmodel_var)
    return false;

  if (caller_opts->x_aarch64_tls_dialect
      != callee_opts->x_aarch64_tls_dialect)
    return false;

  /* Honour explicit requests to workaround errata.  */
  if (!aarch64_tribools_ok_for_inlining_p (
	  caller_opts->x_aarch64_fix_a53_err835769,
	  callee_opts->x_aarch64_fix_a53_err835769,
	  2, TARGET_FIX_ERR_A53_835769_DEFAULT))
    return false;

  if (!aarch64_tribools_ok_for_inlining_p (
	  caller_opts->x_aarch64_fix_a53_err843419,
	  callee_opts->x_aarch64_fix_a53_err843419,
	  2, TARGET_FIX_ERR_A53_843419))
    return false;

  /* If the user explicitly specified -momit-leaf-frame-pointer for the
     caller and callee and they don't match up, reject inlining.  */
  if (!aarch64_tribools_ok_for_inlining_p (
	  caller_opts->x_flag_omit_leaf_frame_pointer,
	  callee_opts->x_flag_omit_leaf_frame_pointer,
	  2, 1))
    return false;

  /* If the callee has specific tuning overrides, respect them.  */
  if (callee_opts->x_aarch64_override_tune_string != NULL
      && caller_opts->x_aarch64_override_tune_string == NULL)
    return false;

  /* If the user specified tuning override strings for the
     caller and callee and they don't match up, reject inlining.
     We just do a string compare here, we don't analyze the meaning
     of the string, as it would be too costly for little gain.  */
  if (callee_opts->x_aarch64_override_tune_string
      && caller_opts->x_aarch64_override_tune_string
      && (strcmp (callee_opts->x_aarch64_override_tune_string,
		  caller_opts->x_aarch64_override_tune_string) != 0))
    return false;

  return true;
}
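/* Illustration only (not part of GCC itself): the ISA-subset test above
   means that

     __attribute__ ((target ("+sve")))
     static inline int callee (int x) { return x + 1; }

     int caller (int x) { return callee (x); }   // caller built without +sve

   rejects inlining, because the callee's ISA flags include SVE while the
   caller's do not; the reverse direction (caller enables +sve, callee does
   not) is allowed.  */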
/* Return true if SYMBOL_REF X binds locally.  */

static bool
aarch64_symbol_binds_local_p (const_rtx x)
{
  return (SYMBOL_REF_DECL (x)
	  ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
	  : SYMBOL_REF_LOCAL_P (x));
}

/* Return true if SYMBOL_REF X is thread local.  */
static bool
aarch64_tls_symbol_p (rtx x)
{
  if (! TARGET_HAVE_TLS)
    return false;

  if (GET_CODE (x) != SYMBOL_REF)
    return false;

  return SYMBOL_REF_TLS_MODEL (x) != 0;
}
/* Classify a TLS symbol into one of the TLS kinds.  */
enum aarch64_symbol_type
aarch64_classify_tls_symbol (rtx x)
{
  enum tls_model tls_kind = tls_symbolic_operand_type (x);

  switch (tls_kind)
    {
    case TLS_MODEL_GLOBAL_DYNAMIC:
    case TLS_MODEL_LOCAL_DYNAMIC:
      return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;

    case TLS_MODEL_INITIAL_EXEC:
      switch (aarch64_cmodel)
	{
	case AARCH64_CMODEL_TINY:
	case AARCH64_CMODEL_TINY_PIC:
	  return SYMBOL_TINY_TLSIE;
	default:
	  return SYMBOL_SMALL_TLSIE;
	}

    case TLS_MODEL_LOCAL_EXEC:
      if (aarch64_tls_size == 12)
	return SYMBOL_TLSLE12;
      else if (aarch64_tls_size == 24)
	return SYMBOL_TLSLE24;
      else if (aarch64_tls_size == 32)
	return SYMBOL_TLSLE32;
      else if (aarch64_tls_size == 48)
	return SYMBOL_TLSLE48;
      else
	gcc_unreachable ();

    case TLS_MODEL_EMULATED:
    case TLS_MODEL_NONE:
      return SYMBOL_FORCE_TO_MEM;

    default:
      gcc_unreachable ();
    }
}
/* Return the correct method for accessing X + OFFSET, where X is either
   a SYMBOL_REF or LABEL_REF.  */

enum aarch64_symbol_type
aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
{
  if (GET_CODE (x) == LABEL_REF)
    {
      switch (aarch64_cmodel)
	{
	case AARCH64_CMODEL_LARGE:
	  return SYMBOL_FORCE_TO_MEM;

	case AARCH64_CMODEL_TINY_PIC:
	case AARCH64_CMODEL_TINY:
	  return SYMBOL_TINY_ABSOLUTE;

	case AARCH64_CMODEL_SMALL_SPIC:
	case AARCH64_CMODEL_SMALL_PIC:
	case AARCH64_CMODEL_SMALL:
	  return SYMBOL_SMALL_ABSOLUTE;

	default:
	  gcc_unreachable ();
	}
    }

  if (GET_CODE (x) == SYMBOL_REF)
    {
      if (aarch64_tls_symbol_p (x))
	return aarch64_classify_tls_symbol (x);

      switch (aarch64_cmodel)
	{
	case AARCH64_CMODEL_TINY:
	  /* When we retrieve symbol + offset address, we have to make sure
	     the offset does not cause overflow of the final address.  But
	     we have no way of knowing the address of symbol at compile time
	     so we can't accurately say if the distance between the PC and
	     symbol + offset is outside the addressable range of +/-1M in the
	     TINY code model.  So we rely on images not being greater than
	     1M and cap the offset at 1M; anything beyond 1M will have to
	     be loaded using an alternative mechanism.  Furthermore if the
	     symbol is a weak reference to something that isn't known to
	     resolve to a symbol in this module, then force to memory.  */
	  if ((SYMBOL_REF_WEAK (x)
	       && !aarch64_symbol_binds_local_p (x))
	      || !IN_RANGE (offset, -1048575, 1048575))
	    return SYMBOL_FORCE_TO_MEM;
	  return SYMBOL_TINY_ABSOLUTE;

	case AARCH64_CMODEL_SMALL:
	  /* Same reasoning as the tiny code model, but the offset cap here is
	     4G.  */
	  if ((SYMBOL_REF_WEAK (x)
	       && !aarch64_symbol_binds_local_p (x))
	      || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
			    HOST_WIDE_INT_C (4294967264)))
	    return SYMBOL_FORCE_TO_MEM;
	  return SYMBOL_SMALL_ABSOLUTE;

	case AARCH64_CMODEL_TINY_PIC:
	  if (!aarch64_symbol_binds_local_p (x))
	    return SYMBOL_TINY_GOT;
	  return SYMBOL_TINY_ABSOLUTE;

	case AARCH64_CMODEL_SMALL_SPIC:
	case AARCH64_CMODEL_SMALL_PIC:
	  if (!aarch64_symbol_binds_local_p (x))
	    return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
		    ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
	  return SYMBOL_SMALL_ABSOLUTE;

	case AARCH64_CMODEL_LARGE:
	  /* This is alright even in PIC code as the constant
	     pool reference is always PC relative and within
	     the same translation unit.  */
	  if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
	    return SYMBOL_SMALL_ABSOLUTE;
	  else
	    return SYMBOL_FORCE_TO_MEM;

	default:
	  gcc_unreachable ();
	}
    }

  /* By default push everything into the constant pool.  */
  return SYMBOL_FORCE_TO_MEM;
}
bool
aarch64_constant_address_p (rtx x)
{
  return (CONSTANT_P (x) && memory_address_p (DImode, x));
}

bool
aarch64_legitimate_pic_operand_p (rtx x)
{
  if (GET_CODE (x) == SYMBOL_REF
      || (GET_CODE (x) == CONST
	  && GET_CODE (XEXP (x, 0)) == PLUS
	  && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
    return false;

  return true;
}
/* Implement TARGET_LEGITIMATE_CONSTANT_P hook.  Return true for constants
   that should be rematerialized rather than spilled.  */

static bool
aarch64_legitimate_constant_p (machine_mode mode, rtx x)
{
  /* Support CSE and rematerialization of common constants.  */
  if (CONST_INT_P (x)
      || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
      || GET_CODE (x) == CONST_VECTOR)
    return true;

  /* Do not allow vector struct mode constants for Advanced SIMD.
     We could support 0 and -1 easily, but they need support in
     aarch64-simd.md.  */
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
    return false;

  /* Only accept variable-length vector constants if they can be
     handled directly.

     ??? It would be possible to handle rematerialization of other
     constants via secondary reloads.  */
  if (vec_flags & VEC_ANY_SVE)
    return aarch64_simd_valid_immediate (x, NULL);

  if (GET_CODE (x) == HIGH)
    x = XEXP (x, 0);

  /* Accept polynomial constants that can be calculated by using the
     destination of a move as the sole temporary.  Constants that
     require a second temporary cannot be rematerialized (they can't be
     forced to memory and also aren't legitimate constants).  */
  poly_int64 offset;
  if (poly_int_rtx_p (x, &offset))
    return aarch64_offset_temporaries (false, offset) <= 1;

  /* If an offset is being added to something else, we need to allow the
     base to be moved into the destination register, meaning that there
     are no free temporaries for the offset.  */
  x = strip_offset (x, &offset);
  if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
    return false;

  /* Do not allow const (plus (anchor_symbol, const_int)).  */
  if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
    return false;

  /* Treat symbols as constants.  Avoid TLS symbols as they are complex,
     so spilling them is better than rematerialization.  */
  if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
    return true;

  /* Label references are always constant.  */
  if (GET_CODE (x) == LABEL_REF)
    return true;

  return false;
}
static rtx
aarch64_load_tp (rtx target)
{
  if (!target
      || GET_MODE (target) != Pmode
      || !register_operand (target, Pmode))
    target = gen_reg_rtx (Pmode);

  /* Can return in any reg.  */
  emit_insn (gen_aarch64_load_tp_hard (target));
  return target;
}
/* On AAPCS systems, this is the "struct __va_list".  */
static GTY(()) tree va_list_type;

/* Implement TARGET_BUILD_BUILTIN_VA_LIST.
   Return the type to use as __builtin_va_list.

   AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:

   struct __va_list
   {
     void *__stack;
     void *__gr_top;
     void *__vr_top;
     int   __gr_offs;
     int   __vr_offs;
   };  */

static tree
aarch64_build_builtin_va_list (void)
{
  tree va_list_name;
  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;

  /* Create the type.  */
  va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
  /* Give it the required name.  */
  va_list_name = build_decl (BUILTINS_LOCATION,
			     TYPE_DECL,
			     get_identifier ("__va_list"),
			     va_list_type);
  DECL_ARTIFICIAL (va_list_name) = 1;
  TYPE_NAME (va_list_type) = va_list_name;
  TYPE_STUB_DECL (va_list_type) = va_list_name;

  /* Create the fields.  */
  f_stack = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__stack"),
			ptr_type_node);
  f_grtop = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__gr_top"),
			ptr_type_node);
  f_vrtop = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__vr_top"),
			ptr_type_node);
  f_groff = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__gr_offs"),
			integer_type_node);
  f_vroff = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__vr_offs"),
			integer_type_node);

  /* Tell tree-stdarg pass about our internal offset fields.
     NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
     purposes, to identify whether the code is updating va_list internal
     offset fields through an irregular way.  */
  va_list_gpr_counter_field = f_groff;
  va_list_fpr_counter_field = f_vroff;

  DECL_ARTIFICIAL (f_stack) = 1;
  DECL_ARTIFICIAL (f_grtop) = 1;
  DECL_ARTIFICIAL (f_vrtop) = 1;
  DECL_ARTIFICIAL (f_groff) = 1;
  DECL_ARTIFICIAL (f_vroff) = 1;

  DECL_FIELD_CONTEXT (f_stack) = va_list_type;
  DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
  DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
  DECL_FIELD_CONTEXT (f_groff) = va_list_type;
  DECL_FIELD_CONTEXT (f_vroff) = va_list_type;

  TYPE_FIELDS (va_list_type) = f_stack;
  DECL_CHAIN (f_stack) = f_grtop;
  DECL_CHAIN (f_grtop) = f_vrtop;
  DECL_CHAIN (f_vrtop) = f_groff;
  DECL_CHAIN (f_groff) = f_vroff;

  /* Compute its layout.  */
  layout_type (va_list_type);

  return va_list_type;
}
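/* Illustration only (not part of GCC itself): for a variadic callee such as

     int f (int n, ...);

   where only N is passed in x0, va_start below typically leaves
   __gr_offs == -56 (seven unused x1-x7 slots, 8 bytes each) and
   __vr_offs == -128 (eight unused q0-q7 slots, 16 bytes each), with
   __gr_top and __vr_top pointing just past the respective register save
   areas.  The exact constants depend on how many named arguments were
   passed in registers and on whether FP registers are available.  */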
/* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
static void
aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
{
  const CUMULATIVE_ARGS *cum;
  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
  tree stack, grtop, vrtop, groff, vroff;
  tree t;
  int gr_save_area_size = cfun->va_list_gpr_size;
  int vr_save_area_size = cfun->va_list_fpr_size;
  int vr_offset;

  cum = &crtl->args.info;
  if (cfun->va_list_gpr_size)
    gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
			     cfun->va_list_gpr_size);
  if (cfun->va_list_fpr_size)
    vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
			     * UNITS_PER_VREG, cfun->va_list_fpr_size);

  if (!TARGET_FLOAT)
    {
      gcc_assert (cum->aapcs_nvrn == 0);
      vr_save_area_size = 0;
    }

  f_stack = TYPE_FIELDS (va_list_type_node);
  f_grtop = DECL_CHAIN (f_stack);
  f_vrtop = DECL_CHAIN (f_grtop);
  f_groff = DECL_CHAIN (f_vrtop);
  f_vroff = DECL_CHAIN (f_groff);

  stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
		  NULL_TREE);
  grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
		  NULL_TREE);
  vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
		  NULL_TREE);
  groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
		  NULL_TREE);
  vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
		  NULL_TREE);

  /* Emit code to initialize STACK, which points to the next varargs stack
     argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
     by named arguments.  STACK is 8-byte aligned.  */
  t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
  if (cum->aapcs_stack_size > 0)
    t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
  t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Emit code to initialize GRTOP, the top of the GR save area.
     virtual_incoming_args_rtx should have been 16 byte aligned.  */
  t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
  t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Emit code to initialize VRTOP, the top of the VR save area.
     This address is gr_save_area_bytes below GRTOP, rounded
     down to the next 16-byte boundary.  */
  t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
  vr_offset = ROUND_UP (gr_save_area_size,
			STACK_BOUNDARY / BITS_PER_UNIT);

  if (vr_offset)
    t = fold_build_pointer_plus_hwi (t, -vr_offset);
  t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Emit code to initialize GROFF, the offset from GRTOP of the
     next GPR argument.  */
  t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
	      build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Likewise emit code to initialize VROFF, the offset from FTOP
     of the next VR argument.  */
  t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
	      build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
}
/* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */

static tree
aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
			      gimple_seq *post_p ATTRIBUTE_UNUSED)
{
  tree addr;
  bool indirect_p;
  bool is_ha;		/* is HFA or HVA.  */
  bool dw_align;	/* double-word align.  */
  machine_mode ag_mode = VOIDmode;
  int nregs;
  machine_mode mode;

  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
  tree stack, f_top, f_off, off, arg, roundup, on_stack;
  HOST_WIDE_INT size, rsize, adjust, align;
  tree t, u, cond1, cond2;

  indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
  if (indirect_p)
    type = build_pointer_type (type);

  mode = TYPE_MODE (type);

  f_stack = TYPE_FIELDS (va_list_type_node);
  f_grtop = DECL_CHAIN (f_stack);
  f_vrtop = DECL_CHAIN (f_grtop);
  f_groff = DECL_CHAIN (f_vrtop);
  f_vroff = DECL_CHAIN (f_groff);

  stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
		  f_stack, NULL_TREE);
  size = int_size_in_bytes (type);

  bool abi_break;
  align
    = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;

  dw_align = false;
  adjust = 0;
  if (aarch64_vfp_is_call_or_return_candidate (mode,
					       type,
					       &ag_mode,
					       &nregs,
					       &is_ha))
    {
      /* No frontends can create types with variable-sized modes, so we
	 shouldn't be asked to pass or return them.  */
      unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();

      /* TYPE passed in fp/simd registers.  */
      if (!TARGET_FLOAT)
	aarch64_err_no_fpadvsimd (mode);

      f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
		      unshare_expr (valist), f_vrtop, NULL_TREE);
      f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
		      unshare_expr (valist), f_vroff, NULL_TREE);

      rsize = nregs * UNITS_PER_VREG;

      if (is_ha)
	{
	  if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
	    adjust = UNITS_PER_VREG - ag_size;
	}
      else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
	       && size < UNITS_PER_VREG)
	{
	  adjust = UNITS_PER_VREG - size;
	}
    }
  else
    {
      /* TYPE passed in general registers.  */
      f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
		      unshare_expr (valist), f_grtop, NULL_TREE);
      f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
		      unshare_expr (valist), f_groff, NULL_TREE);
      rsize = ROUND_UP (size, UNITS_PER_WORD);
      nregs = rsize / UNITS_PER_WORD;

      if (align > 8)
	{
	  if (abi_break && warn_psabi)
	    inform (input_location, "parameter passing for argument of type "
		    "%qT changed in GCC 9.1", type);
	  dw_align = true;
	}

      if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
	  && size < UNITS_PER_WORD)
	{
	  adjust = UNITS_PER_WORD - size;
	}
    }

  /* Get a local temporary for the field value.  */
  off = get_initialized_tmp_var (f_off, pre_p, NULL);

  /* Emit code to branch if off >= 0.  */
  t = build2 (GE_EXPR, boolean_type_node, off,
	      build_int_cst (TREE_TYPE (off), 0));
  cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);

  if (dw_align)
    {
      /* Emit: offs = (offs + 15) & -16.  */
      t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
		  build_int_cst (TREE_TYPE (off), 15));
      t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
		  build_int_cst (TREE_TYPE (off), -16));
      roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
    }
  else
    roundup = NULL;

  /* Update ap.__[g|v]r_offs  */
  t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
	      build_int_cst (TREE_TYPE (off), rsize));
  t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);

  /* String up.  */
  if (roundup)
    t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);

  /* [cond2] if (ap.__[g|v]r_offs > 0)  */
  u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
	      build_int_cst (TREE_TYPE (f_off), 0));
  cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);

  /* String up: make sure the assignment happens before the use.  */
  t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
  COND_EXPR_ELSE (cond1) = t;

  /* Prepare the trees handling the argument that is passed on the stack;
     the top level node will store in ON_STACK.  */
  arg = get_initialized_tmp_var (stack, pre_p, NULL);
  if (align > 8)
    {
      /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
      t = fold_build_pointer_plus_hwi (arg, 15);
      t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
		  build_int_cst (TREE_TYPE (t), -16));
      roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
    }
  else
    roundup = NULL;
  /* Advance ap.__stack  */
  t = fold_build_pointer_plus_hwi (arg, size + 7);
  t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
	      build_int_cst (TREE_TYPE (t), -8));
  t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
  /* String up roundup and advance.  */
  if (roundup)
    t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
  /* String up with arg  */
  on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
  /* Big-endianness related address adjustment.  */
  if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
      && size < UNITS_PER_WORD)
    {
      t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
		  size_int (UNITS_PER_WORD - size));
      on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
    }

  COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
  COND_EXPR_THEN (cond2) = unshare_expr (on_stack);

  /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
  t = off;
  if (adjust)
    t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
		build_int_cst (TREE_TYPE (off), adjust));

  t = fold_convert (sizetype, t);
  t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);

  if (is_ha)
    {
      /* type ha; // treat as "struct {ftype field[n];}"
	 ... [computing offs]
	 for (i = 0; i < nregs; ++i, offs += 16)
	   ha.field[i] = *((ftype *)(ap.__vr_top + offs));
	 return ha;  */
      int i;
      tree tmp_ha, field_t, field_ptr_t;

      /* Declare a local variable.  */
      tmp_ha = create_tmp_var_raw (type, "ha");
      gimple_add_tmp_var (tmp_ha);

      /* Establish the base type.  */
      switch (ag_mode)
	{
	case E_SFmode:
	  field_t = float_type_node;
	  field_ptr_t = float_ptr_type_node;
	  break;
	case E_DFmode:
	  field_t = double_type_node;
	  field_ptr_t = double_ptr_type_node;
	  break;
	case E_TFmode:
	  field_t = long_double_type_node;
	  field_ptr_t = long_double_ptr_type_node;
	  break;
	case E_HFmode:
	  field_t = aarch64_fp16_type_node;
	  field_ptr_t = aarch64_fp16_ptr_type_node;
	  break;
	case E_V2SImode:
	case E_V4SImode:
	  {
	    tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
	    field_t = build_vector_type_for_mode (innertype, ag_mode);
	    field_ptr_t = build_pointer_type (field_t);
	  }
	  break;
	default:
	  gcc_unreachable ();
	}

      /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area  */
      tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
      addr = t;
      t = fold_convert (field_ptr_t, addr);
      t = build2 (MODIFY_EXPR, field_t,
		  build1 (INDIRECT_REF, field_t, tmp_ha),
		  build1 (INDIRECT_REF, field_t, t));

      /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
      for (i = 1; i < nregs; ++i)
	{
	  addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
	  u = fold_convert (field_ptr_t, addr);
	  u = build2 (MODIFY_EXPR, field_t,
		      build2 (MEM_REF, field_t, tmp_ha,
			      build_int_cst (field_ptr_t,
					     (i *
					      int_size_in_bytes (field_t)))),
		      build1 (INDIRECT_REF, field_t, u));
	  t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
	}

      u = fold_convert (TREE_TYPE (f_top), tmp_ha);
      t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
    }

  COND_EXPR_ELSE (cond2) = t;
  addr = fold_convert (build_pointer_type (type), cond1);
  addr = build_va_arg_indirect_ref (addr);

  if (indirect_p)
    addr = build_va_arg_indirect_ref (addr);

  return addr;
}
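/* Illustration only (not part of GCC itself): for va_arg (ap, double) the
   tree built above corresponds roughly to the following C:

     int off = ap.__vr_offs;
     double *p;
     if (off >= 0)
       goto on_stack;
     ap.__vr_offs = off + 16;
     if (ap.__vr_offs > 0)
       goto on_stack;
     p = (double *) ((char *) ap.__vr_top + off);
     goto done;
   on_stack:
     p = (double *) ap.__stack;
     ap.__stack = (char *) ap.__stack + 8;
   done:
     result = *p;

   with additional rounding for 16-byte-aligned types, big-endian address
   adjustments, and an element-by-element copy for homogeneous FP
   aggregates.  */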
/* Implement TARGET_SETUP_INCOMING_VARARGS.  */

static void
aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
				tree type, int *pretend_size ATTRIBUTE_UNUSED,
				int no_rtl)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
  CUMULATIVE_ARGS local_cum;
  int gr_saved = cfun->va_list_gpr_size;
  int vr_saved = cfun->va_list_fpr_size;

  /* The caller has advanced CUM up to, but not beyond, the last named
     argument.  Advance a local copy of CUM past the last "real" named
     argument, to find out how many registers are left over.  */
  local_cum = *cum;
  aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);

  /* Find out how many registers we need to save.
     Honor tree-stdvar analysis results.  */
  if (cfun->va_list_gpr_size)
    gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
		    cfun->va_list_gpr_size / UNITS_PER_WORD);
  if (cfun->va_list_fpr_size)
    vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
		    cfun->va_list_fpr_size / UNITS_PER_VREG);

  if (!TARGET_FLOAT)
    {
      gcc_assert (local_cum.aapcs_nvrn == 0);
      vr_saved = 0;
    }

  if (!no_rtl)
    {
      if (gr_saved > 0)
	{
	  rtx ptr, mem;

	  /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
	  ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
			       - gr_saved * UNITS_PER_WORD);
	  mem = gen_frame_mem (BLKmode, ptr);
	  set_mem_alias_set (mem, get_varargs_alias_set ());

	  move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
			       mem, gr_saved);
	}
      if (vr_saved > 0)
	{
	  /* We can't use move_block_from_reg, because it will use
	     the wrong mode, storing D regs only.  */
	  machine_mode mode = TImode;
	  int off, i, vr_start;

	  /* Set OFF to the offset from virtual_incoming_args_rtx of
	     the first vector register.  The VR save area lies below
	     the GR one, and is aligned to 16 bytes.  */
	  off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
			   STACK_BOUNDARY / BITS_PER_UNIT);
	  off -= vr_saved * UNITS_PER_VREG;

	  vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
	  for (i = 0; i < vr_saved; ++i)
	    {
	      rtx ptr, mem;

	      ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
	      mem = gen_frame_mem (mode, ptr);
	      set_mem_alias_set (mem, get_varargs_alias_set ());
	      aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
	      off += UNITS_PER_VREG;
	    }
	}
    }

  /* We don't save the size into *PRETEND_SIZE because we want to avoid
     any complication of having crtl->args.pretend_args_size changed.  */
  cfun->machine->frame.saved_varargs_size
    = (ROUND_UP (gr_saved * UNITS_PER_WORD,
		 STACK_BOUNDARY / BITS_PER_UNIT)
       + vr_saved * UNITS_PER_VREG);
}
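/* Illustration only (not part of GCC itself): for a variadic function whose
   only named argument lives in x0, the code above saves x1-x7 and q0-q7
   below virtual_incoming_args_rtx, giving roughly this layout (higher
   addresses first):

     incoming stack args ...      <-- virtual_incoming_args_rtx (== __gr_top)
     x7 ... x1  (7 * 8 bytes, padded up to 16-byte alignment)
     q7 ... q0  (8 * 16 bytes)    <-- bottom of the VR save area (== __vr_top - 128)

   saved_varargs_size is then ROUND_UP (56, 16) + 128 == 192 bytes.  */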
static void
aarch64_conditional_register_usage (void)
{
  int i;
  if (!TARGET_FLOAT)
    {
      for (i = V0_REGNUM; i <= V31_REGNUM; i++)
	{
	  fixed_regs[i] = 1;
	  call_used_regs[i] = 1;
	}
    }
  if (!TARGET_SVE)
    for (i = P0_REGNUM; i <= P15_REGNUM; i++)
      {
	fixed_regs[i] = 1;
	call_used_regs[i] = 1;
      }

  /* When tracking speculation, we need a couple of call-clobbered registers
     to track the speculation state.  It would be nice to just use
     IP0 and IP1, but currently there are numerous places that just
     assume these registers are free for other uses (eg pointer
     authentication).  */
  if (aarch64_track_speculation)
    {
      fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
      call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
      fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
      call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
    }
}
/* Walk down the type tree of TYPE counting consecutive base elements.
   If *MODEP is VOIDmode, then set it to the first valid floating point
   type.  If a non-floating point type is found, or if a floating point
   type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
   otherwise return the count in the sub-tree.  */

static int
aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
{
  machine_mode mode;
  HOST_WIDE_INT size;

  switch (TREE_CODE (type))
    {
    case REAL_TYPE:
      mode = TYPE_MODE (type);
      if (mode != DFmode && mode != SFmode
	  && mode != TFmode && mode != HFmode)
	return -1;

      if (*modep == VOIDmode)
	*modep = mode;

      if (*modep == mode)
	return 1;

      break;

    case COMPLEX_TYPE:
      mode = TYPE_MODE (TREE_TYPE (type));
      if (mode != DFmode && mode != SFmode
	  && mode != TFmode && mode != HFmode)
	return -1;

      if (*modep == VOIDmode)
	*modep = mode;

      if (*modep == mode)
	return 2;

      break;

    case VECTOR_TYPE:
      /* Use V2SImode and V4SImode as representatives of all 64-bit
	 and 128-bit vector types.  */
      size = int_size_in_bytes (type);
      switch (size)
	{
	case 8:
	  mode = V2SImode;
	  break;
	case 16:
	  mode = V4SImode;
	  break;
	default:
	  return -1;
	}

      if (*modep == VOIDmode)
	*modep = mode;

      /* Vector modes are considered to be opaque: two vectors are
	 equivalent for the purposes of being homogeneous aggregates
	 if they are the same size.  */
      if (*modep == mode)
	return 1;

      break;

    case ARRAY_TYPE:
      {
	int count;
	tree index = TYPE_DOMAIN (type);

	/* Can't handle incomplete types nor sizes that are not
	   fixed.  */
	if (!COMPLETE_TYPE_P (type)
	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
	  return -1;

	count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
	if (count == -1
	    || !index
	    || !TYPE_MAX_VALUE (index)
	    || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
	    || !TYPE_MIN_VALUE (index)
	    || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
	    || count < 0)
	  return -1;

	count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
		      - tree_to_uhwi (TYPE_MIN_VALUE (index)));

	/* There must be no padding.  */
	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
		      count * GET_MODE_BITSIZE (*modep)))
	  return -1;

	return count;
      }

    case RECORD_TYPE:
      {
	int count = 0;
	int sub_count;
	tree field;

	/* Can't handle incomplete types nor sizes that are not
	   fixed.  */
	if (!COMPLETE_TYPE_P (type)
	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
	  return -1;

	for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
	  {
	    if (TREE_CODE (field) != FIELD_DECL)
	      continue;

	    sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
	    if (sub_count < 0)
	      return -1;
	    count += sub_count;
	  }

	/* There must be no padding.  */
	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
		      count * GET_MODE_BITSIZE (*modep)))
	  return -1;

	return count;
      }

    case UNION_TYPE:
    case QUAL_UNION_TYPE:
      {
	/* These aren't very interesting except in a degenerate case.  */
	int count = 0;
	int sub_count;
	tree field;

	/* Can't handle incomplete types nor sizes that are not
	   fixed.  */
	if (!COMPLETE_TYPE_P (type)
	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
	  return -1;

	for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
	  {
	    if (TREE_CODE (field) != FIELD_DECL)
	      continue;

	    sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
	    if (sub_count < 0)
	      return -1;
	    count = count > sub_count ? count : sub_count;
	  }

	/* There must be no padding.  */
	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
		      count * GET_MODE_BITSIZE (*modep)))
	  return -1;

	return count;
      }

    default:
      break;
    }

  return -1;
}
/* Return TRUE if the type, as described by TYPE and MODE, is a short vector
   type as described in AAPCS64 \S 4.1.2.

   See the comment above aarch64_composite_type_p for the notes on MODE.  */

static bool
aarch64_short_vector_p (const_tree type,
			machine_mode mode)
{
  poly_int64 size = -1;

  if (type && TREE_CODE (type) == VECTOR_TYPE)
    size = int_size_in_bytes (type);
  else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
	   || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
    size = GET_MODE_SIZE (mode);

  return known_eq (size, 8) || known_eq (size, 16);
}
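
/* For example (illustrative only): a 16-byte vector such as int32x4_t and an
   8-byte "int __attribute__ ((vector_size (8)))" both count as short vectors,
   whereas a 32-byte generic GCC vector does not.  */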
/* Return TRUE if the type, as described by TYPE and MODE, is a composite
   type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
   array types.  The C99 floating-point complex types are also considered
   as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
   types, which are GCC extensions and out of the scope of AAPCS64, are
   treated as composite types here as well.

   Note that MODE itself is not sufficient in determining whether a type
   is such a composite type or not.  This is because
   stor-layout.c:compute_record_mode may have already changed the MODE
   (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
   structure with only one field may have its MODE set to the mode of the
   field.  Also an integer mode whose size matches the size of the
   RECORD_TYPE type may be used to substitute the original mode
   (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
   solely relied on.  */

static bool
aarch64_composite_type_p (const_tree type,
			  machine_mode mode)
{
  if (aarch64_short_vector_p (type, mode))
    return false;

  if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
    return true;

  if (mode == BLKmode
      || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
      || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
    return true;

  return false;
}
/* Return TRUE if an argument, whose type is described by TYPE and MODE,
   shall be passed or returned in simd/fp register(s) (providing these
   parameter passing registers are available).

   Upon successful return, *COUNT returns the number of needed registers,
   *BASE_MODE returns the mode of the individual register and when IS_HA
   is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
   floating-point aggregate or a homogeneous short-vector aggregate.  */

static bool
aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
					 const_tree type,
					 machine_mode *base_mode,
					 int *count,
					 bool *is_ha)
{
  machine_mode new_mode = VOIDmode;
  bool composite_p = aarch64_composite_type_p (type, mode);

  if (is_ha != NULL) *is_ha = false;

  if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
      || aarch64_short_vector_p (type, mode))
    {
      *count = 1;
      new_mode = mode;
    }
  else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
    {
      if (is_ha != NULL) *is_ha = true;
      *count = 2;
      new_mode = GET_MODE_INNER (mode);
    }
  else if (type && composite_p)
    {
      int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);

      if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
	{
	  if (is_ha != NULL) *is_ha = true;
	  *count = ag_count;
	}
      else
	return false;
    }
  else
    return false;

  *base_mode = new_mode;
  return true;
}
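
/* Illustrative examples (not exhaustive): a _Complex double argument sets
   *IS_HA, *COUNT == 2 and *BASE_MODE == DFmode; a struct of three floats is
   a homogeneous floating-point aggregate with *COUNT == 3 and
   *BASE_MODE == SFmode; a struct mixing an int and a float is rejected
   because aapcs_vfp_sub_candidate returns -1 for it.  */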
/* Implement TARGET_STRUCT_VALUE_RTX.  */

static rtx
aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
			  int incoming ATTRIBUTE_UNUSED)
{
  return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
}

/* Implements target hook vector_mode_supported_p.  */

static bool
aarch64_vector_mode_supported_p (machine_mode mode)
{
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
}
14619 /* Return the full-width SVE vector mode for element mode MODE, if one
14622 aarch64_full_sve_mode (scalar_mode mode
)
14639 return VNx16QImode
;
14641 return opt_machine_mode ();
14645 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
14648 aarch64_vq_mode (scalar_mode mode
)
14667 return opt_machine_mode ();
/* Return appropriate SIMD container
   for MODE within a vector of WIDTH bits.  */
static machine_mode
aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
{
  if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
    return aarch64_full_sve_mode (mode).else_mode (word_mode);

  gcc_assert (known_eq (width, 64) || known_eq (width, 128));
  if (TARGET_SIMD)
    {
      if (known_eq (width, 128))
	return aarch64_vq_mode (mode).else_mode (word_mode);
      else
	switch (mode)
	  {
	  case E_SFmode:
	    return V2SFmode;
	  case E_HFmode:
	    return V4HFmode;
	  case E_SImode:
	    return V2SImode;
	  case E_HImode:
	    return V4HImode;
	  case E_QImode:
	    return V8QImode;
	  default:
	    break;
	  }
    }
  return word_mode;
}

/* Return 128-bit container as the preferred SIMD mode for MODE.  */
static machine_mode
aarch64_preferred_simd_mode (scalar_mode mode)
{
  poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
  return aarch64_simd_container_mode (mode, bits);
}
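
/* For example (illustrative only): with SVE enabled, SFmode prefers the
   variable-length container VNx4SFmode; without SVE the preferred container
   is the 128-bit Advanced SIMD mode V4SFmode.  */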
/* Return a list of possible vector sizes for the vectorizer
   to iterate over.  */
static void
aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
{
  if (TARGET_SVE)
    sizes->safe_push (BYTES_PER_SVE_VECTOR);
  sizes->safe_push (16);
  sizes->safe_push (8);
}
/* Implement TARGET_MANGLE_TYPE.  */

static const char *
aarch64_mangle_type (const_tree type)
{
  /* The AArch64 ABI documents say that "__va_list" has to be
     mangled as if it is in the "std" namespace.  */
  if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
    return "St9__va_list";

  /* Half-precision float.  */
  if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
    return "Dh";

  /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
     builtin types.  */
  if (TYPE_NAME (type) != NULL)
    return aarch64_mangle_builtin_type (type);

  /* Use the default mangling.  */
  return NULL;
}
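
/* For example (illustrative only): the AArch64 __fp16 type mangles as "Dh"
   and __va_list as "St9__va_list"; the arm_neon.h builtin vector types are
   handled by aarch64_mangle_builtin_type.  */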
/* Find the first rtx_insn before insn that will generate an assembly
   instruction.  */

static rtx_insn *
aarch64_prev_real_insn (rtx_insn *insn)
{
  if (!insn)
    return NULL;

  do
    {
      insn = prev_real_insn (insn);
    }
  while (insn && recog_memoized (insn) < 0);

  return insn;
}
static bool
is_madd_op (enum attr_type t1)
{
  unsigned int i;
  /* A number of these may be AArch32 only.  */
  enum attr_type mlatypes[] = {
    TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
    TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
    TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
  };

  for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
    {
      if (t1 == mlatypes[i])
	return true;
    }

  return false;
}
/* Check if there is a register dependency between a load and the insn
   for which we hold recog_data.  */

static bool
dep_between_memop_and_curr (rtx memop)
{
  rtx load_reg;
  int opno;

  gcc_assert (GET_CODE (memop) == SET);

  if (!REG_P (SET_DEST (memop)))
    return false;

  load_reg = SET_DEST (memop);
  for (opno = 1; opno < recog_data.n_operands; opno++)
    {
      rtx operand = recog_data.operand[opno];
      if (REG_P (operand)
	  && reg_overlap_mentioned_p (load_reg, operand))
	return true;
    }

  return false;
}
/* When working around the Cortex-A53 erratum 835769,
   given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
   instruction and has a preceding memory instruction such that a NOP
   should be inserted between them.  */

bool
aarch64_madd_needs_nop (rtx_insn *insn)
{
  enum attr_type attr_type;
  rtx_insn *prev;
  rtx body;

  if (!TARGET_FIX_ERR_A53_835769)
    return false;

  if (!INSN_P (insn) || recog_memoized (insn) < 0)
    return false;

  attr_type = get_attr_type (insn);
  if (!is_madd_op (attr_type))
    return false;

  prev = aarch64_prev_real_insn (insn);
  /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
     Restore recog state to INSN to avoid state corruption.  */
  extract_constrain_insn_cached (insn);

  if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
    return false;

  body = single_set (prev);

  /* If the previous insn is a memory op and there is no dependency between
     it and the DImode madd, emit a NOP between them.  If body is NULL then we
     have a complex memory operation, probably a load/store pair.
     Be conservative for now and emit a NOP.  */
  if (GET_MODE (recog_data.operand[0]) == DImode
      && (!body || !dep_between_memop_and_curr (body)))
    return true;

  return false;
}
/* Implement FINAL_PRESCAN_INSN.  */

void
aarch64_final_prescan_insn (rtx_insn *insn)
{
  if (aarch64_madd_needs_nop (insn))
    fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
}
/* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
   instruction.  */

bool
aarch64_sve_index_immediate_p (rtx base_or_step)
{
  return (CONST_INT_P (base_or_step)
	  && IN_RANGE (INTVAL (base_or_step), -16, 15));
}
/* Return true if X is a valid immediate for the SVE ADD and SUB
   instructions.  Negate X first if NEGATE_P is true.  */

bool
aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
{
  rtx elt;

  if (!const_vec_duplicate_p (x, &elt)
      || !CONST_INT_P (elt))
    return false;

  HOST_WIDE_INT val = INTVAL (elt);
  if (negate_p)
    val = -val;
  val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));

  if (val & 0xff)
    return IN_RANGE (val, 0, 0xff);
  return IN_RANGE (val, 0, 0xff00);
}
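
/* Worked examples (illustrative only): a duplicated element of 3 satisfies
   IN_RANGE (val, 0, 0xff) and maps to ADD #3; 0x300 has a zero low byte and
   satisfies IN_RANGE (val, 0, 0xff00), matching ADD #3, LSL #8; 0x104 fails
   both tests and is not a valid SVE ADD/SUB immediate.  */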
14898 /* Return true if X is a valid immediate operand for an SVE logical
14899 instruction such as AND. */
14902 aarch64_sve_bitmask_immediate_p (rtx x
)
14906 return (const_vec_duplicate_p (x
, &elt
)
14907 && CONST_INT_P (elt
)
14908 && aarch64_bitmask_imm (INTVAL (elt
),
14909 GET_MODE_INNER (GET_MODE (x
))));
14912 /* Return true if X is a valid immediate for the SVE DUP and CPY
14916 aarch64_sve_dup_immediate_p (rtx x
)
14920 if (!const_vec_duplicate_p (x
, &elt
)
14921 || !CONST_INT_P (elt
))
14924 HOST_WIDE_INT val
= INTVAL (elt
);
14926 return IN_RANGE (val
, -0x80, 0x7f);
14927 return IN_RANGE (val
, -0x8000, 0x7f00);
14930 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
14931 SIGNED_P says whether the operand is signed rather than unsigned. */
14934 aarch64_sve_cmp_immediate_p (rtx x
, bool signed_p
)
14938 return (const_vec_duplicate_p (x
, &elt
)
14939 && CONST_INT_P (elt
)
14941 ? IN_RANGE (INTVAL (elt
), -16, 15)
14942 : IN_RANGE (INTVAL (elt
), 0, 127)));
14945 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
14946 instruction. Negate X first if NEGATE_P is true. */
14949 aarch64_sve_float_arith_immediate_p (rtx x
, bool negate_p
)
14954 if (!const_vec_duplicate_p (x
, &elt
)
14955 || GET_CODE (elt
) != CONST_DOUBLE
)
14958 r
= *CONST_DOUBLE_REAL_VALUE (elt
);
14961 r
= real_value_negate (&r
);
14963 if (real_equal (&r
, &dconst1
))
14965 if (real_equal (&r
, &dconsthalf
))
14970 /* Return true if X is a valid immediate operand for an SVE FMUL
14974 aarch64_sve_float_mul_immediate_p (rtx x
)
14978 /* GCC will never generate a multiply with an immediate of 2, so there is no
14979 point testing for it (even though it is a valid constant). */
14980 return (const_vec_duplicate_p (x
, &elt
)
14981 && GET_CODE (elt
) == CONST_DOUBLE
14982 && real_equal (CONST_DOUBLE_REAL_VALUE (elt
), &dconsthalf
));
14985 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
14986 for the Advanced SIMD operation described by WHICH and INSN. If INFO
14987 is nonnull, use it to describe valid immediates. */
14989 aarch64_advsimd_valid_immediate_hs (unsigned int val32
,
14990 simd_immediate_info
*info
,
14991 enum simd_immediate_check which
,
14992 simd_immediate_info::insn_type insn
)
14994 /* Try a 4-byte immediate with LSL. */
14995 for (unsigned int shift
= 0; shift
< 32; shift
+= 8)
14996 if ((val32
& (0xff << shift
)) == val32
)
14999 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
15000 simd_immediate_info::LSL
, shift
);
15004 /* Try a 2-byte immediate with LSL. */
15005 unsigned int imm16
= val32
& 0xffff;
15006 if (imm16
== (val32
>> 16))
15007 for (unsigned int shift
= 0; shift
< 16; shift
+= 8)
15008 if ((imm16
& (0xff << shift
)) == imm16
)
15011 *info
= simd_immediate_info (HImode
, imm16
>> shift
, insn
,
15012 simd_immediate_info::LSL
, shift
);
15016 /* Try a 4-byte immediate with MSL, except for cases that MVN
15018 if (which
== AARCH64_CHECK_MOV
)
15019 for (unsigned int shift
= 8; shift
< 24; shift
+= 8)
15021 unsigned int low
= (1 << shift
) - 1;
15022 if (((val32
& (0xff << shift
)) | low
) == val32
)
15025 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
15026 simd_immediate_info::MSL
, shift
);
15034 /* Return true if replicating VAL64 is a valid immediate for the
15035 Advanced SIMD operation described by WHICH. If INFO is nonnull,
15036 use it to describe valid immediates. */
15038 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64
,
15039 simd_immediate_info
*info
,
15040 enum simd_immediate_check which
)
15042 unsigned int val32
= val64
& 0xffffffff;
15043 unsigned int val16
= val64
& 0xffff;
15044 unsigned int val8
= val64
& 0xff;
15046 if (val32
== (val64
>> 32))
15048 if ((which
& AARCH64_CHECK_ORR
) != 0
15049 && aarch64_advsimd_valid_immediate_hs (val32
, info
, which
,
15050 simd_immediate_info::MOV
))
15053 if ((which
& AARCH64_CHECK_BIC
) != 0
15054 && aarch64_advsimd_valid_immediate_hs (~val32
, info
, which
,
15055 simd_immediate_info::MVN
))
15058 /* Try using a replicated byte. */
15059 if (which
== AARCH64_CHECK_MOV
15060 && val16
== (val32
>> 16)
15061 && val8
== (val16
>> 8))
15064 *info
= simd_immediate_info (QImode
, val8
);
15069 /* Try using a bit-to-bytemask. */
15070 if (which
== AARCH64_CHECK_MOV
)
15073 for (i
= 0; i
< 64; i
+= 8)
15075 unsigned char byte
= (val64
>> i
) & 0xff;
15076 if (byte
!= 0 && byte
!= 0xff)
15082 *info
= simd_immediate_info (DImode
, val64
);
15089 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
15090 instruction. If INFO is nonnull, use it to describe valid immediates. */
15093 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64
,
15094 simd_immediate_info
*info
)
15096 scalar_int_mode mode
= DImode
;
15097 unsigned int val32
= val64
& 0xffffffff;
15098 if (val32
== (val64
>> 32))
15101 unsigned int val16
= val32
& 0xffff;
15102 if (val16
== (val32
>> 16))
15105 unsigned int val8
= val16
& 0xff;
15106 if (val8
== (val16
>> 8))
15110 HOST_WIDE_INT val
= trunc_int_for_mode (val64
, mode
);
15111 if (IN_RANGE (val
, -0x80, 0x7f))
15113 /* DUP with no shift. */
15115 *info
= simd_immediate_info (mode
, val
);
15118 if ((val
& 0xff) == 0 && IN_RANGE (val
, -0x8000, 0x7f00))
15120 /* DUP with LSL #8. */
15122 *info
= simd_immediate_info (mode
, val
);
15125 if (aarch64_bitmask_imm (val64
, mode
))
15129 *info
= simd_immediate_info (mode
, val
);
15135 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
15136 it to describe valid immediates. */
15139 aarch64_sve_pred_valid_immediate (rtx x
, simd_immediate_info
*info
)
15141 if (x
== CONST0_RTX (GET_MODE (x
)))
15144 *info
= simd_immediate_info (DImode
, 0);
15148 /* Analyze the value as a VNx16BImode. This should be relatively
15149 efficient, since rtx_vector_builder has enough built-in capacity
15150 to store all VLA predicate constants without needing the heap. */
15151 rtx_vector_builder builder
;
15152 if (!aarch64_get_sve_pred_bits (builder
, x
))
15155 unsigned int elt_size
= aarch64_widest_sve_pred_elt_size (builder
);
15156 if (int vl
= aarch64_partial_ptrue_length (builder
, elt_size
))
15158 machine_mode mode
= aarch64_sve_pred_mode (elt_size
).require ();
15159 aarch64_svpattern pattern
= aarch64_svpattern_for_vl (mode
, vl
);
15160 if (pattern
!= AARCH64_NUM_SVPATTERNS
)
15164 scalar_int_mode int_mode
= aarch64_sve_element_int_mode (mode
);
15165 *info
= simd_immediate_info (int_mode
, pattern
);
15173 /* Return true if OP is a valid SIMD immediate for the operation
15174 described by WHICH. If INFO is nonnull, use it to describe valid
15177 aarch64_simd_valid_immediate (rtx op
, simd_immediate_info
*info
,
15178 enum simd_immediate_check which
)
15180 machine_mode mode
= GET_MODE (op
);
15181 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
15182 if (vec_flags
== 0 || vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
15185 if (vec_flags
& VEC_SVE_PRED
)
15186 return aarch64_sve_pred_valid_immediate (op
, info
);
15188 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
15190 unsigned int n_elts
;
15191 if (GET_CODE (op
) == CONST_VECTOR
15192 && CONST_VECTOR_DUPLICATE_P (op
))
15193 n_elts
= CONST_VECTOR_NPATTERNS (op
);
15194 else if ((vec_flags
& VEC_SVE_DATA
)
15195 && const_vec_series_p (op
, &base
, &step
))
15197 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
);
15198 if (!aarch64_sve_index_immediate_p (base
)
15199 || !aarch64_sve_index_immediate_p (step
))
15203 *info
= simd_immediate_info (elt_mode
, base
, step
);
15206 else if (GET_CODE (op
) == CONST_VECTOR
15207 && CONST_VECTOR_NUNITS (op
).is_constant (&n_elts
))
15208 /* N_ELTS set above. */;
15212 scalar_float_mode elt_float_mode
;
15214 && is_a
<scalar_float_mode
> (elt_mode
, &elt_float_mode
))
15216 rtx elt
= CONST_VECTOR_ENCODED_ELT (op
, 0);
15217 if (aarch64_float_const_zero_rtx_p (elt
)
15218 || aarch64_float_const_representable_p (elt
))
15221 *info
= simd_immediate_info (elt_float_mode
, elt
);
15226 unsigned int elt_size
= GET_MODE_SIZE (elt_mode
);
15230 scalar_int_mode elt_int_mode
= int_mode_for_mode (elt_mode
).require ();
15232 /* Expand the vector constant out into a byte vector, with the least
15233 significant byte of the register first. */
15234 auto_vec
<unsigned char, 16> bytes
;
15235 bytes
.reserve (n_elts
* elt_size
);
15236 for (unsigned int i
= 0; i
< n_elts
; i
++)
15238 /* The vector is provided in gcc endian-neutral fashion.
15239 For aarch64_be Advanced SIMD, it must be laid out in the vector
15240 register in reverse order. */
15241 bool swap_p
= ((vec_flags
& VEC_ADVSIMD
) != 0 && BYTES_BIG_ENDIAN
);
15242 rtx elt
= CONST_VECTOR_ELT (op
, swap_p
? (n_elts
- 1 - i
) : i
);
15244 if (elt_mode
!= elt_int_mode
)
15245 elt
= gen_lowpart (elt_int_mode
, elt
);
15247 if (!CONST_INT_P (elt
))
15250 unsigned HOST_WIDE_INT elt_val
= INTVAL (elt
);
15251 for (unsigned int byte
= 0; byte
< elt_size
; byte
++)
15253 bytes
.quick_push (elt_val
& 0xff);
15254 elt_val
>>= BITS_PER_UNIT
;
15258 /* The immediate must repeat every eight bytes. */
15259 unsigned int nbytes
= bytes
.length ();
15260 for (unsigned i
= 8; i
< nbytes
; ++i
)
15261 if (bytes
[i
] != bytes
[i
- 8])
15264 /* Get the repeating 8-byte value as an integer. No endian correction
15265 is needed here because bytes is already in lsb-first order. */
15266 unsigned HOST_WIDE_INT val64
= 0;
15267 for (unsigned int i
= 0; i
< 8; i
++)
15268 val64
|= ((unsigned HOST_WIDE_INT
) bytes
[i
% nbytes
]
15269 << (i
* BITS_PER_UNIT
));
15271 if (vec_flags
& VEC_SVE_DATA
)
15272 return aarch64_sve_valid_immediate (val64
, info
);
15274 return aarch64_advsimd_valid_immediate (val64
, info
, which
);
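
/* Worked example of the byte expansion above (illustrative constant only):
   the V4SImode constant { 0x01020304, 0x01020304, 0x01020304, 0x01020304 }
   produces the repeating little-endian byte sequence 04 03 02 01, so VAL64
   becomes 0x0102030401020304 before the SVE/Advanced SIMD checks run.  */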
15277 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
15278 has a step in the range of INDEX. Return the index expression if so,
15279 otherwise return null. */
15281 aarch64_check_zero_based_sve_index_immediate (rtx x
)
15284 if (const_vec_series_p (x
, &base
, &step
)
15285 && base
== const0_rtx
15286 && aarch64_sve_index_immediate_p (step
))
/* Check if immediate shift constants are within range.  */
bool
aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
{
  int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
  if (left)
    return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
  else
    return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
}
/* Return the bitmask CONST_INT to select the bits required by a zero extract
   operation of width WIDTH at bit position POS.  */

static rtx
aarch64_mask_from_zextract_ops (rtx width, rtx pos)
{
  gcc_assert (CONST_INT_P (width));
  gcc_assert (CONST_INT_P (pos));

  unsigned HOST_WIDE_INT mask
    = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
  return GEN_INT (mask << UINTVAL (pos));
}
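
/* Worked example (illustrative values only): WIDTH == 8 and POS == 16 give
   the mask 0xff << 16, i.e. 0xff0000, selecting bits 16..23.  */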
15317 aarch64_mov_operand_p (rtx x
, machine_mode mode
)
15319 if (GET_CODE (x
) == HIGH
15320 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
15323 if (CONST_INT_P (x
))
15326 if (VECTOR_MODE_P (GET_MODE (x
)))
15328 /* Require predicate constants to be VNx16BI before RA, so that we
15329 force everything to have a canonical form. */
15330 if (!lra_in_progress
15331 && !reload_completed
15332 && GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_BOOL
15333 && GET_MODE (x
) != VNx16BImode
)
15336 return aarch64_simd_valid_immediate (x
, NULL
);
15339 if (GET_CODE (x
) == SYMBOL_REF
&& mode
== DImode
&& CONSTANT_ADDRESS_P (x
))
15342 if (aarch64_sve_cnt_immediate_p (x
))
15345 return aarch64_classify_symbolic_expression (x
)
15346 == SYMBOL_TINY_ABSOLUTE
;
/* Return a const_int vector of VAL.  */
rtx
aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
{
  rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
  return gen_const_vec_duplicate (mode, c);
}
15357 /* Check OP is a legal scalar immediate for the MOVI instruction. */
15360 aarch64_simd_scalar_immediate_valid_for_move (rtx op
, scalar_int_mode mode
)
15362 machine_mode vmode
;
15364 vmode
= aarch64_simd_container_mode (mode
, 64);
15365 rtx op_v
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (op
));
15366 return aarch64_simd_valid_immediate (op_v
, NULL
);
15369 /* Construct and return a PARALLEL RTX vector with elements numbering the
15370 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
15371 the vector - from the perspective of the architecture. This does not
15372 line up with GCC's perspective on lane numbers, so we end up with
15373 different masks depending on our target endian-ness. The diagram
15374 below may help. We must draw the distinction when building masks
15375 which select one half of the vector. An instruction selecting
15376 architectural low-lanes for a big-endian target, must be described using
15377 a mask selecting GCC high-lanes.
15379 Big-Endian Little-Endian
15381 GCC 0 1 2 3 3 2 1 0
15382 | x | x | x | x | | x | x | x | x |
15383 Architecture 3 2 1 0 3 2 1 0
15385 Low Mask: { 2, 3 } { 0, 1 }
15386 High Mask: { 0, 1 } { 2, 3 }
15388 MODE Is the mode of the vector and NUNITS is the number of units in it. */
15391 aarch64_simd_vect_par_cnst_half (machine_mode mode
, int nunits
, bool high
)
15393 rtvec v
= rtvec_alloc (nunits
/ 2);
15394 int high_base
= nunits
/ 2;
15400 if (BYTES_BIG_ENDIAN
)
15401 base
= high
? low_base
: high_base
;
15403 base
= high
? high_base
: low_base
;
15405 for (i
= 0; i
< nunits
/ 2; i
++)
15406 RTVEC_ELT (v
, i
) = GEN_INT (base
+ i
);
15408 t1
= gen_rtx_PARALLEL (mode
, v
);
15412 /* Check OP for validity as a PARALLEL RTX vector with elements
15413 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
15414 from the perspective of the architecture. See the diagram above
15415 aarch64_simd_vect_par_cnst_half for more details. */
15418 aarch64_simd_check_vect_par_cnst_half (rtx op
, machine_mode mode
,
15422 if (!VECTOR_MODE_P (mode
) || !GET_MODE_NUNITS (mode
).is_constant (&nelts
))
15425 rtx ideal
= aarch64_simd_vect_par_cnst_half (mode
, nelts
, high
);
15426 HOST_WIDE_INT count_op
= XVECLEN (op
, 0);
15427 HOST_WIDE_INT count_ideal
= XVECLEN (ideal
, 0);
15430 if (count_op
!= count_ideal
)
15433 for (i
= 0; i
< count_ideal
; i
++)
15435 rtx elt_op
= XVECEXP (op
, 0, i
);
15436 rtx elt_ideal
= XVECEXP (ideal
, 0, i
);
15438 if (!CONST_INT_P (elt_op
)
15439 || INTVAL (elt_ideal
) != INTVAL (elt_op
))
15445 /* Return a PARALLEL containing NELTS elements, with element I equal
15446 to BASE + I * STEP. */
15449 aarch64_gen_stepped_int_parallel (unsigned int nelts
, int base
, int step
)
15451 rtvec vec
= rtvec_alloc (nelts
);
15452 for (unsigned int i
= 0; i
< nelts
; ++i
)
15453 RTVEC_ELT (vec
, i
) = gen_int_mode (base
+ i
* step
, DImode
);
15454 return gen_rtx_PARALLEL (VOIDmode
, vec
);
15457 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
15458 series with step STEP. */
15461 aarch64_stepped_int_parallel_p (rtx op
, int step
)
15463 if (GET_CODE (op
) != PARALLEL
|| !CONST_INT_P (XVECEXP (op
, 0, 0)))
15466 unsigned HOST_WIDE_INT base
= UINTVAL (XVECEXP (op
, 0, 0));
15467 for (int i
= 1; i
< XVECLEN (op
, 0); ++i
)
15468 if (!CONST_INT_P (XVECEXP (op
, 0, i
))
15469 || UINTVAL (XVECEXP (op
, 0, i
)) != base
+ i
* step
)
15475 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
15476 HIGH (exclusive). */
15478 aarch64_simd_lane_bounds (rtx operand
, HOST_WIDE_INT low
, HOST_WIDE_INT high
,
15481 HOST_WIDE_INT lane
;
15482 gcc_assert (CONST_INT_P (operand
));
15483 lane
= INTVAL (operand
);
15485 if (lane
< low
|| lane
>= high
)
15488 error ("%Klane %wd out of range %wd - %wd", exp
, lane
, low
, high
- 1);
15490 error ("lane %wd out of range %wd - %wd", lane
, low
, high
- 1);
/* Perform endian correction on lane number N, which indexes a vector
   of mode MODE, and return the result as an SImode rtx.  */

rtx
aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
{
  return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
}

/* Return TRUE if OP is a valid vector addressing mode.  */

bool
aarch64_simd_mem_operand_p (rtx op)
{
  return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
			|| REG_P (XEXP (op, 0)));
}
15512 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
15515 aarch64_sve_ld1r_operand_p (rtx op
)
15517 struct aarch64_address_info addr
;
15521 && is_a
<scalar_mode
> (GET_MODE (op
), &mode
)
15522 && aarch64_classify_address (&addr
, XEXP (op
, 0), mode
, false)
15523 && addr
.type
== ADDRESS_REG_IMM
15524 && offset_6bit_unsigned_scaled_p (mode
, addr
.const_offset
));
15527 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
15529 aarch64_sve_ld1rq_operand_p (rtx op
)
15531 struct aarch64_address_info addr
;
15532 scalar_mode elem_mode
= GET_MODE_INNER (GET_MODE (op
));
15534 || !aarch64_classify_address (&addr
, XEXP (op
, 0), elem_mode
, false))
15537 if (addr
.type
== ADDRESS_REG_IMM
)
15538 return offset_4bit_signed_scaled_p (TImode
, addr
.const_offset
);
15540 if (addr
.type
== ADDRESS_REG_REG
)
15541 return (1U << addr
.shift
) == GET_MODE_SIZE (elem_mode
);
15546 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
15547 The conditions for STR are the same. */
15549 aarch64_sve_ldr_operand_p (rtx op
)
15551 struct aarch64_address_info addr
;
15554 && aarch64_classify_address (&addr
, XEXP (op
, 0), GET_MODE (op
),
15555 false, ADDR_QUERY_ANY
)
15556 && addr
.type
== ADDRESS_REG_IMM
);
15559 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
15560 We need to be able to access the individual pieces, so the range
15561 is different from LD[234] and ST[234]. */
15563 aarch64_sve_struct_memory_operand_p (rtx op
)
15568 machine_mode mode
= GET_MODE (op
);
15569 struct aarch64_address_info addr
;
15570 if (!aarch64_classify_address (&addr
, XEXP (op
, 0), SVE_BYTE_MODE
, false,
15572 || addr
.type
!= ADDRESS_REG_IMM
)
15575 poly_int64 first
= addr
.const_offset
;
15576 poly_int64 last
= first
+ GET_MODE_SIZE (mode
) - BYTES_PER_SVE_VECTOR
;
15577 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, first
)
15578 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, last
));
15581 /* Emit a register copy from operand to operand, taking care not to
15582 early-clobber source registers in the process.
15584 COUNT is the number of components into which the copy needs to be
15587 aarch64_simd_emit_reg_reg_move (rtx
*operands
, machine_mode mode
,
15588 unsigned int count
)
15591 int rdest
= REGNO (operands
[0]);
15592 int rsrc
= REGNO (operands
[1]);
15594 if (!reg_overlap_mentioned_p (operands
[0], operands
[1])
15596 for (i
= 0; i
< count
; i
++)
15597 emit_move_insn (gen_rtx_REG (mode
, rdest
+ i
),
15598 gen_rtx_REG (mode
, rsrc
+ i
));
15600 for (i
= 0; i
< count
; i
++)
15601 emit_move_insn (gen_rtx_REG (mode
, rdest
+ count
- i
- 1),
15602 gen_rtx_REG (mode
, rsrc
+ count
- i
- 1));
/* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
   one of VSTRUCT modes: OI, CI, or XI.  */
int
aarch64_simd_attr_length_rglist (machine_mode mode)
{
  /* This is only used (and only meaningful) for Advanced SIMD, not SVE.  */
  return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
}
/* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
   alignment of a vector to 128 bits.  SVE predicates have an alignment of
   16 bits.  */
static HOST_WIDE_INT
aarch64_simd_vector_alignment (const_tree type)
{
  if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
    /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
       be set for non-predicate vectors of booleans.  Modes are the most
       direct way we have of identifying real SVE predicate types.  */
    return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
  return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
}
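
/* For example (illustrative only): a 64-bit V8QI vector type is aligned to
   64 bits, any fixed-size vector of 128 bits or more is capped at 128 bits,
   and an SVE predicate type gets 16-bit alignment.  */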
15628 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
15630 aarch64_vectorize_preferred_vector_alignment (const_tree type
)
15632 if (aarch64_sve_data_mode_p (TYPE_MODE (type
)))
15634 /* If the length of the vector is fixed, try to align to that length,
15635 otherwise don't try to align at all. */
15636 HOST_WIDE_INT result
;
15637 if (!BITS_PER_SVE_VECTOR
.is_constant (&result
))
15638 result
= TYPE_ALIGN (TREE_TYPE (type
));
15641 return TYPE_ALIGN (type
);
15644 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
15646 aarch64_simd_vector_alignment_reachable (const_tree type
, bool is_packed
)
15651 /* For fixed-length vectors, check that the vectorizer will aim for
15652 full-vector alignment. This isn't true for generic GCC vectors
15653 that are wider than the ABI maximum of 128 bits. */
15654 poly_uint64 preferred_alignment
=
15655 aarch64_vectorize_preferred_vector_alignment (type
);
15656 if (TREE_CODE (TYPE_SIZE (type
)) == INTEGER_CST
15657 && maybe_ne (wi::to_widest (TYPE_SIZE (type
)),
15658 preferred_alignment
))
15661 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
15665 /* Return true if the vector misalignment factor is supported by the
15668 aarch64_builtin_support_vector_misalignment (machine_mode mode
,
15669 const_tree type
, int misalignment
,
15672 if (TARGET_SIMD
&& STRICT_ALIGNMENT
)
15674 /* Return if movmisalign pattern is not supported for this mode. */
15675 if (optab_handler (movmisalign_optab
, mode
) == CODE_FOR_nothing
)
15678 /* Misalignment factor is unknown at compile time. */
15679 if (misalignment
== -1)
15682 return default_builtin_support_vector_misalignment (mode
, type
, misalignment
,
15686 /* If VALS is a vector constant that can be loaded into a register
15687 using DUP, generate instructions to do so and return an RTX to
15688 assign to the register. Otherwise return NULL_RTX. */
15690 aarch64_simd_dup_constant (rtx vals
)
15692 machine_mode mode
= GET_MODE (vals
);
15693 machine_mode inner_mode
= GET_MODE_INNER (mode
);
15696 if (!const_vec_duplicate_p (vals
, &x
))
15699 /* We can load this constant by using DUP and a constant in a
15700 single ARM register. This will be cheaper than a vector
15702 x
= copy_to_mode_reg (inner_mode
, x
);
15703 return gen_vec_duplicate (mode
, x
);
15707 /* Generate code to load VALS, which is a PARALLEL containing only
15708 constants (for vec_init) or CONST_VECTOR, efficiently into a
15709 register. Returns an RTX to copy into the register, or NULL_RTX
15710 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
15712 aarch64_simd_make_constant (rtx vals
)
15714 machine_mode mode
= GET_MODE (vals
);
15716 rtx const_vec
= NULL_RTX
;
15720 if (GET_CODE (vals
) == CONST_VECTOR
)
15722 else if (GET_CODE (vals
) == PARALLEL
)
15724 /* A CONST_VECTOR must contain only CONST_INTs and
15725 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
15726 Only store valid constants in a CONST_VECTOR. */
15727 int n_elts
= XVECLEN (vals
, 0);
15728 for (i
= 0; i
< n_elts
; ++i
)
15730 rtx x
= XVECEXP (vals
, 0, i
);
15731 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
15734 if (n_const
== n_elts
)
15735 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0));
15738 gcc_unreachable ();
15740 if (const_vec
!= NULL_RTX
15741 && aarch64_simd_valid_immediate (const_vec
, NULL
))
15742 /* Load using MOVI/MVNI. */
15744 else if ((const_dup
= aarch64_simd_dup_constant (vals
)) != NULL_RTX
)
15745 /* Loaded using DUP. */
15747 else if (const_vec
!= NULL_RTX
)
15748 /* Load from constant pool. We cannot take advantage of single-cycle
15749 LD1 because we need a PC-relative addressing mode. */
15752 /* A PARALLEL containing something not valid inside CONST_VECTOR.
15753 We cannot construct an initializer. */
15757 /* Expand a vector initialisation sequence, such that TARGET is
15758 initialised to contain VALS. */
15761 aarch64_expand_vector_init (rtx target
, rtx vals
)
15763 machine_mode mode
= GET_MODE (target
);
15764 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
15765 /* The number of vector elements. */
15766 int n_elts
= XVECLEN (vals
, 0);
15767 /* The number of vector elements which are not constant. */
15769 rtx any_const
= NULL_RTX
;
15770 /* The first element of vals. */
15771 rtx v0
= XVECEXP (vals
, 0, 0);
15772 bool all_same
= true;
15774 /* This is a special vec_init<M><N> where N is not an element mode but a
15775 vector mode with half the elements of M. We expect to find two entries
15776 of mode N in VALS and we must put their concatentation into TARGET. */
15777 if (XVECLEN (vals
, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals
, 0, 0))))
15779 gcc_assert (known_eq (GET_MODE_SIZE (mode
),
15780 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals
, 0, 0)))));
15781 rtx lo
= XVECEXP (vals
, 0, 0);
15782 rtx hi
= XVECEXP (vals
, 0, 1);
15783 machine_mode narrow_mode
= GET_MODE (lo
);
15784 gcc_assert (GET_MODE_INNER (narrow_mode
) == inner_mode
);
15785 gcc_assert (narrow_mode
== GET_MODE (hi
));
15787 /* When we want to concatenate a half-width vector with zeroes we can
15788 use the aarch64_combinez[_be] patterns. Just make sure that the
15789 zeroes are in the right half. */
15790 if (BYTES_BIG_ENDIAN
15791 && aarch64_simd_imm_zero (lo
, narrow_mode
)
15792 && general_operand (hi
, narrow_mode
))
15793 emit_insn (gen_aarch64_combinez_be (narrow_mode
, target
, hi
, lo
));
15794 else if (!BYTES_BIG_ENDIAN
15795 && aarch64_simd_imm_zero (hi
, narrow_mode
)
15796 && general_operand (lo
, narrow_mode
))
15797 emit_insn (gen_aarch64_combinez (narrow_mode
, target
, lo
, hi
));
15800 /* Else create the two half-width registers and combine them. */
15802 lo
= force_reg (GET_MODE (lo
), lo
);
15804 hi
= force_reg (GET_MODE (hi
), hi
);
15806 if (BYTES_BIG_ENDIAN
)
15807 std::swap (lo
, hi
);
15808 emit_insn (gen_aarch64_simd_combine (narrow_mode
, target
, lo
, hi
));
15813 /* Count the number of variable elements to initialise. */
15814 for (int i
= 0; i
< n_elts
; ++i
)
15816 rtx x
= XVECEXP (vals
, 0, i
);
15817 if (!(CONST_INT_P (x
) || CONST_DOUBLE_P (x
)))
15822 all_same
&= rtx_equal_p (x
, v0
);
15825 /* No variable elements, hand off to aarch64_simd_make_constant which knows
15826 how best to handle this. */
15829 rtx constant
= aarch64_simd_make_constant (vals
);
15830 if (constant
!= NULL_RTX
)
15832 emit_move_insn (target
, constant
);
15837 /* Splat a single non-constant element if we can. */
15840 rtx x
= copy_to_mode_reg (inner_mode
, v0
);
15841 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
15845 enum insn_code icode
= optab_handler (vec_set_optab
, mode
);
15846 gcc_assert (icode
!= CODE_FOR_nothing
);
15848 /* If there are only variable elements, try to optimize
15849 the insertion using dup for the most common element
15850 followed by insertions. */
15852 /* The algorithm will fill matches[*][0] with the earliest matching element,
15853 and matches[X][1] with the count of duplicate elements (if X is the
15854 earliest element which has duplicates). */
15856 if (n_var
== n_elts
&& n_elts
<= 16)
15858 int matches
[16][2] = {0};
15859 for (int i
= 0; i
< n_elts
; i
++)
15861 for (int j
= 0; j
<= i
; j
++)
15863 if (rtx_equal_p (XVECEXP (vals
, 0, i
), XVECEXP (vals
, 0, j
)))
15871 int maxelement
= 0;
15873 for (int i
= 0; i
< n_elts
; i
++)
15874 if (matches
[i
][1] > maxv
)
15877 maxv
= matches
[i
][1];
15880 /* Create a duplicate of the most common element, unless all elements
15881 are equally useless to us, in which case just immediately set the
15882 vector register using the first element. */
15886 /* For vectors of two 64-bit elements, we can do even better. */
15888 && (inner_mode
== E_DImode
15889 || inner_mode
== E_DFmode
))
15892 rtx x0
= XVECEXP (vals
, 0, 0);
15893 rtx x1
= XVECEXP (vals
, 0, 1);
15894 /* Combine can pick up this case, but handling it directly
15895 here leaves clearer RTL.
15897 This is load_pair_lanes<mode>, and also gives us a clean-up
15898 for store_pair_lanes<mode>. */
15899 if (memory_operand (x0
, inner_mode
)
15900 && memory_operand (x1
, inner_mode
)
15901 && !STRICT_ALIGNMENT
15902 && rtx_equal_p (XEXP (x1
, 0),
15903 plus_constant (Pmode
,
15905 GET_MODE_SIZE (inner_mode
))))
15908 if (inner_mode
== DFmode
)
15909 t
= gen_load_pair_lanesdf (target
, x0
, x1
);
15911 t
= gen_load_pair_lanesdi (target
, x0
, x1
);
15916 /* The subreg-move sequence below will move into lane zero of the
15917 vector register. For big-endian we want that position to hold
15918 the last element of VALS. */
15919 maxelement
= BYTES_BIG_ENDIAN
? n_elts
- 1 : 0;
15920 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
15921 aarch64_emit_move (target
, lowpart_subreg (mode
, x
, inner_mode
));
15925 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
15926 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
15929 /* Insert the rest. */
15930 for (int i
= 0; i
< n_elts
; i
++)
15932 rtx x
= XVECEXP (vals
, 0, i
);
15933 if (matches
[i
][0] == maxelement
)
15935 x
= copy_to_mode_reg (inner_mode
, x
);
15936 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
15941 /* Initialise a vector which is part-variable. We want to first try
15942 to build those lanes which are constant in the most efficient way we
15944 if (n_var
!= n_elts
)
15946 rtx copy
= copy_rtx (vals
);
15948 /* Load constant part of vector. We really don't care what goes into the
15949 parts we will overwrite, but we're more likely to be able to load the
15950 constant efficiently if it has fewer, larger, repeating parts
15951 (see aarch64_simd_valid_immediate). */
15952 for (int i
= 0; i
< n_elts
; i
++)
15954 rtx x
= XVECEXP (vals
, 0, i
);
15955 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
15957 rtx subst
= any_const
;
15958 for (int bit
= n_elts
/ 2; bit
> 0; bit
/= 2)
15960 /* Look in the copied vector, as more elements are const. */
15961 rtx test
= XVECEXP (copy
, 0, i
^ bit
);
15962 if (CONST_INT_P (test
) || CONST_DOUBLE_P (test
))
15968 XVECEXP (copy
, 0, i
) = subst
;
15970 aarch64_expand_vector_init (target
, copy
);
15973 /* Insert the variable lanes directly. */
15974 for (int i
= 0; i
< n_elts
; i
++)
15976 rtx x
= XVECEXP (vals
, 0, i
);
15977 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
15979 x
= copy_to_mode_reg (inner_mode
, x
);
15980 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
15984 /* Emit RTL corresponding to:
15985 insr TARGET, ELEM. */
15988 emit_insr (rtx target
, rtx elem
)
15990 machine_mode mode
= GET_MODE (target
);
15991 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
15992 elem
= force_reg (elem_mode
, elem
);
15994 insn_code icode
= optab_handler (vec_shl_insert_optab
, mode
);
15995 gcc_assert (icode
!= CODE_FOR_nothing
);
15996 emit_insn (GEN_FCN (icode
) (target
, target
, elem
));
15999 /* Subroutine of aarch64_sve_expand_vector_init for handling
16000 trailing constants.
16001 This function works as follows:
16002 (a) Create a new vector consisting of trailing constants.
16003 (b) Initialize TARGET with the constant vector using emit_move_insn.
16004 (c) Insert remaining elements in TARGET using insr.
16005 NELTS is the total number of elements in original vector while
16006 while NELTS_REQD is the number of elements that are actually
16009 ??? The heuristic used is to do above only if number of constants
16010 is at least half the total number of elements. May need fine tuning. */
16013 aarch64_sve_expand_vector_init_handle_trailing_constants
16014 (rtx target
, const rtx_vector_builder
&builder
, int nelts
, int nelts_reqd
)
16016 machine_mode mode
= GET_MODE (target
);
16017 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
16018 int n_trailing_constants
= 0;
16020 for (int i
= nelts_reqd
- 1;
16021 i
>= 0 && aarch64_legitimate_constant_p (elem_mode
, builder
.elt (i
));
16023 n_trailing_constants
++;
16025 if (n_trailing_constants
>= nelts_reqd
/ 2)
16027 rtx_vector_builder
v (mode
, 1, nelts
);
16028 for (int i
= 0; i
< nelts
; i
++)
16029 v
.quick_push (builder
.elt (i
+ nelts_reqd
- n_trailing_constants
));
16030 rtx const_vec
= v
.build ();
16031 emit_move_insn (target
, const_vec
);
16033 for (int i
= nelts_reqd
- n_trailing_constants
- 1; i
>= 0; i
--)
16034 emit_insr (target
, builder
.elt (i
));
16042 /* Subroutine of aarch64_sve_expand_vector_init.
16044 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
16045 (b) Skip trailing elements from BUILDER, which are the same as
16046 element NELTS_REQD - 1.
16047 (c) Insert earlier elements in reverse order in TARGET using insr. */
16050 aarch64_sve_expand_vector_init_insert_elems (rtx target
,
16051 const rtx_vector_builder
&builder
,
16054 machine_mode mode
= GET_MODE (target
);
16055 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
16057 struct expand_operand ops
[2];
16058 enum insn_code icode
= optab_handler (vec_duplicate_optab
, mode
);
16059 gcc_assert (icode
!= CODE_FOR_nothing
);
16061 create_output_operand (&ops
[0], target
, mode
);
16062 create_input_operand (&ops
[1], builder
.elt (nelts_reqd
- 1), elem_mode
);
16063 expand_insn (icode
, 2, ops
);
16065 int ndups
= builder
.count_dups (nelts_reqd
- 1, -1, -1);
16066 for (int i
= nelts_reqd
- ndups
- 1; i
>= 0; i
--)
16067 emit_insr (target
, builder
.elt (i
));
16070 /* Subroutine of aarch64_sve_expand_vector_init to handle case
16071 when all trailing elements of builder are same.
16072 This works as follows:
16073 (a) Use expand_insn interface to broadcast last vector element in TARGET.
16074 (b) Insert remaining elements in TARGET using insr.
16076 ??? The heuristic used is to do above if number of same trailing elements
16077 is at least 3/4 of total number of elements, loosely based on
16078 heuristic from mostly_zeros_p. May need fine-tuning. */
16081 aarch64_sve_expand_vector_init_handle_trailing_same_elem
16082 (rtx target
, const rtx_vector_builder
&builder
, int nelts_reqd
)
16084 int ndups
= builder
.count_dups (nelts_reqd
- 1, -1, -1);
16085 if (ndups
>= (3 * nelts_reqd
) / 4)
16087 aarch64_sve_expand_vector_init_insert_elems (target
, builder
,
16088 nelts_reqd
- ndups
+ 1);
16095 /* Initialize register TARGET from BUILDER. NELTS is the constant number
16096 of elements in BUILDER.
16098 The function tries to initialize TARGET from BUILDER if it fits one
16099 of the special cases outlined below.
16101 Failing that, the function divides BUILDER into two sub-vectors:
16102 v_even = even elements of BUILDER;
16103 v_odd = odd elements of BUILDER;
16105 and recursively calls itself with v_even and v_odd.
16107 if (recursive call succeeded for v_even or v_odd)
16108 TARGET = zip (v_even, v_odd)
16110 The function returns true if it managed to build TARGET from BUILDER
16111 with one of the special cases, false otherwise.
16113 Example: {a, 1, b, 2, c, 3, d, 4}
16115 The vector gets divided into:
16116 v_even = {a, b, c, d}
16117 v_odd = {1, 2, 3, 4}
16119 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
16120 initialize tmp2 from constant vector v_odd using emit_move_insn.
16122 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
16123 4 elements, so we construct tmp1 from v_even using insr:
16130 TARGET = zip (tmp1, tmp2)
16131 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
16134 aarch64_sve_expand_vector_init (rtx target
, const rtx_vector_builder
&builder
,
16135 int nelts
, int nelts_reqd
)
16137 machine_mode mode
= GET_MODE (target
);
16139 /* Case 1: Vector contains trailing constants. */
16141 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16142 (target
, builder
, nelts
, nelts_reqd
))
16145 /* Case 2: Vector contains leading constants. */
16147 rtx_vector_builder
rev_builder (mode
, 1, nelts_reqd
);
16148 for (int i
= 0; i
< nelts_reqd
; i
++)
16149 rev_builder
.quick_push (builder
.elt (nelts_reqd
- i
- 1));
16150 rev_builder
.finalize ();
16152 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16153 (target
, rev_builder
, nelts
, nelts_reqd
))
16155 emit_insn (gen_aarch64_sve_rev (mode
, target
, target
));
16159 /* Case 3: Vector contains trailing same element. */
16161 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16162 (target
, builder
, nelts_reqd
))
16165 /* Case 4: Vector contains leading same element. */
16167 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16168 (target
, rev_builder
, nelts_reqd
) && nelts_reqd
== nelts
)
16170 emit_insn (gen_aarch64_sve_rev (mode
, target
, target
));
16174 /* Avoid recursing below 4-elements.
16175 ??? The threshold 4 may need fine-tuning. */
16177 if (nelts_reqd
<= 4)
16180 rtx_vector_builder
v_even (mode
, 1, nelts
);
16181 rtx_vector_builder
v_odd (mode
, 1, nelts
);
16183 for (int i
= 0; i
< nelts
* 2; i
+= 2)
16185 v_even
.quick_push (builder
.elt (i
));
16186 v_odd
.quick_push (builder
.elt (i
+ 1));
16189 v_even
.finalize ();
16192 rtx tmp1
= gen_reg_rtx (mode
);
16193 bool did_even_p
= aarch64_sve_expand_vector_init (tmp1
, v_even
,
16194 nelts
, nelts_reqd
/ 2);
16196 rtx tmp2
= gen_reg_rtx (mode
);
16197 bool did_odd_p
= aarch64_sve_expand_vector_init (tmp2
, v_odd
,
16198 nelts
, nelts_reqd
/ 2);
16200 if (!did_even_p
&& !did_odd_p
)
16203 /* Initialize v_even and v_odd using INSR if it didn't match any of the
16204 special cases and zip v_even, v_odd. */
16207 aarch64_sve_expand_vector_init_insert_elems (tmp1
, v_even
, nelts_reqd
/ 2);
16210 aarch64_sve_expand_vector_init_insert_elems (tmp2
, v_odd
, nelts_reqd
/ 2);
16212 rtvec v
= gen_rtvec (2, tmp1
, tmp2
);
16213 emit_set_insn (target
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
16217 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
16220 aarch64_sve_expand_vector_init (rtx target
, rtx vals
)
16222 machine_mode mode
= GET_MODE (target
);
16223 int nelts
= XVECLEN (vals
, 0);
16225 rtx_vector_builder
v (mode
, 1, nelts
);
16226 for (int i
= 0; i
< nelts
; i
++)
16227 v
.quick_push (XVECEXP (vals
, 0, i
));
16230 /* If neither sub-vectors of v could be initialized specially,
16231 then use INSR to insert all elements from v into TARGET.
16232 ??? This might not be optimal for vectors with large
16233 initializers like 16-element or above.
16234 For nelts < 4, it probably isn't useful to handle specially. */
16237 || !aarch64_sve_expand_vector_init (target
, v
, nelts
, nelts
))
16238 aarch64_sve_expand_vector_init_insert_elems (target
, v
, nelts
);
static unsigned HOST_WIDE_INT
aarch64_shift_truncation_mask (machine_mode mode)
{
  if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
    return 0;
  return GET_MODE_UNIT_BITSIZE (mode) - 1;
}
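
/* For example (illustrative only): scalar DImode shifts truncate their count
   to 6 bits (mask 63) and SImode shifts to 5 bits (mask 31), while vector
   data modes return 0 because the SIMD shifts do not truncate the count.  */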
16249 /* Select a format to encode pointers in exception handling data. */
16251 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED
, int global
)
16254 switch (aarch64_cmodel
)
16256 case AARCH64_CMODEL_TINY
:
16257 case AARCH64_CMODEL_TINY_PIC
:
16258 case AARCH64_CMODEL_SMALL
:
16259 case AARCH64_CMODEL_SMALL_PIC
:
16260 case AARCH64_CMODEL_SMALL_SPIC
:
16261 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
16263 type
= DW_EH_PE_sdata4
;
16266 /* No assumptions here. 8-byte relocs required. */
16267 type
= DW_EH_PE_sdata8
;
16270 return (global
? DW_EH_PE_indirect
: 0) | DW_EH_PE_pcrel
| type
;
16273 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
16276 aarch64_asm_output_variant_pcs (FILE *stream
, const tree decl
, const char* name
)
16278 if (aarch64_simd_decl_p (decl
))
16280 fprintf (stream
, "\t.variant_pcs\t");
16281 assemble_name (stream
, name
);
16282 fprintf (stream
, "\n");
16286 /* The last .arch and .tune assembly strings that we printed. */
16287 static std::string aarch64_last_printed_arch_string
;
16288 static std::string aarch64_last_printed_tune_string
;
16290 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
16291 by the function fndecl. */
16294 aarch64_declare_function_name (FILE *stream
, const char* name
,
16297 tree target_parts
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
16299 struct cl_target_option
*targ_options
;
16301 targ_options
= TREE_TARGET_OPTION (target_parts
);
16303 targ_options
= TREE_TARGET_OPTION (target_option_current_node
);
16304 gcc_assert (targ_options
);
16306 const struct processor
*this_arch
16307 = aarch64_get_arch (targ_options
->x_explicit_arch
);
16309 uint64_t isa_flags
= targ_options
->x_aarch64_isa_flags
;
16310 std::string extension
16311 = aarch64_get_extension_string_for_isa_flags (isa_flags
,
16313 /* Only update the assembler .arch string if it is distinct from the last
16314 such string we printed. */
16315 std::string to_print
= this_arch
->name
+ extension
;
16316 if (to_print
!= aarch64_last_printed_arch_string
)
16318 asm_fprintf (asm_out_file
, "\t.arch %s\n", to_print
.c_str ());
16319 aarch64_last_printed_arch_string
= to_print
;
16322 /* Print the cpu name we're tuning for in the comments, might be
16323 useful to readers of the generated asm. Do it only when it changes
16324 from function to function and verbose assembly is requested. */
16325 const struct processor
*this_tune
16326 = aarch64_get_tune_cpu (targ_options
->x_explicit_tune_core
);
16328 if (flag_debug_asm
&& aarch64_last_printed_tune_string
!= this_tune
->name
)
16330 asm_fprintf (asm_out_file
, "\t" ASM_COMMENT_START
".tune %s\n",
16332 aarch64_last_printed_tune_string
= this_tune
->name
;
16335 aarch64_asm_output_variant_pcs (stream
, fndecl
, name
);
16337 /* Don't forget the type directive for ELF. */
16338 ASM_OUTPUT_TYPE_DIRECTIVE (stream
, name
, "function");
16339 ASM_OUTPUT_LABEL (stream
, name
);
16342 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
16345 aarch64_asm_output_alias (FILE *stream
, const tree decl
, const tree target
)
16347 const char *name
= XSTR (XEXP (DECL_RTL (decl
), 0), 0);
16348 const char *value
= IDENTIFIER_POINTER (target
);
16349 aarch64_asm_output_variant_pcs (stream
, decl
, name
);
16350 ASM_OUTPUT_DEF (stream
, name
, value
);
16353 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
16354 function symbol references. */
16357 aarch64_asm_output_external (FILE *stream
, tree decl
, const char* name
)
16359 default_elf_asm_output_external (stream
, decl
, name
);
16360 aarch64_asm_output_variant_pcs (stream
, decl
, name
);
16363 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
16364 Used to output the .cfi_b_key_frame directive when signing the current
16365 function with the B key. */
16368 aarch64_post_cfi_startproc (FILE *f
, tree ignored ATTRIBUTE_UNUSED
)
16370 if (cfun
->machine
->frame
.laid_out
&& aarch64_return_address_signing_enabled ()
16371 && aarch64_ra_sign_key
== AARCH64_KEY_B
)
16372 asm_fprintf (f
, "\t.cfi_b_key_frame\n");
16375 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
16378 aarch64_start_file (void)
16380 struct cl_target_option
*default_options
16381 = TREE_TARGET_OPTION (target_option_default_node
);
16383 const struct processor
*default_arch
16384 = aarch64_get_arch (default_options
->x_explicit_arch
);
16385 uint64_t default_isa_flags
= default_options
->x_aarch64_isa_flags
;
16386 std::string extension
16387 = aarch64_get_extension_string_for_isa_flags (default_isa_flags
,
16388 default_arch
->flags
);
16390 aarch64_last_printed_arch_string
= default_arch
->name
+ extension
;
16391 aarch64_last_printed_tune_string
= "";
16392 asm_fprintf (asm_out_file
, "\t.arch %s\n",
16393 aarch64_last_printed_arch_string
.c_str ());
16395 default_file_start ();
/* Emit load exclusive.  */

static void
aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
			     rtx mem, rtx model_rtx)
{
  emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
}

/* Emit store exclusive.  */

static void
aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
			      rtx rval, rtx mem, rtx model_rtx)
{
  emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
}

/* Mark the previous jump instruction as unlikely.  */

static void
aarch64_emit_unlikely_jump (rtx insn)
{
  rtx_insn *jump = emit_jump_insn (insn);
  add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
}
16425 /* Expand a compare and swap pattern. */
16428 aarch64_expand_compare_and_swap (rtx operands
[])
16430 rtx bval
, rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
, x
, cc_reg
;
16431 machine_mode mode
, r_mode
;
16433 bval
= operands
[0];
16434 rval
= operands
[1];
16436 oldval
= operands
[3];
16437 newval
= operands
[4];
16438 is_weak
= operands
[5];
16439 mod_s
= operands
[6];
16440 mod_f
= operands
[7];
16441 mode
= GET_MODE (mem
);
16443 /* Normally the succ memory model must be stronger than fail, but in the
16444 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
16445 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
16446 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f
)))
16447 && is_mm_release (memmodel_from_int (INTVAL (mod_s
))))
16448 mod_s
= GEN_INT (MEMMODEL_ACQ_REL
);
16451 if (mode
== QImode
|| mode
== HImode
)
16454 rval
= gen_reg_rtx (r_mode
);
16459 /* The CAS insn requires oldval and rval overlap, but we need to
16460 have a copy of oldval saved across the operation to tell if
16461 the operation is successful. */
16462 if (reg_overlap_mentioned_p (rval
, oldval
))
16463 rval
= copy_to_mode_reg (r_mode
, oldval
);
16465 emit_move_insn (rval
, gen_lowpart (r_mode
, oldval
));
16467 emit_insn (gen_aarch64_compare_and_swap_lse (mode
, rval
, mem
,
16469 cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
16473 /* The oldval predicate varies by mode. Test it and force to reg. */
16474 insn_code code
= code_for_aarch64_compare_and_swap (mode
);
16475 if (!insn_data
[code
].operand
[2].predicate (oldval
, mode
))
16476 oldval
= force_reg (mode
, oldval
);
16478 emit_insn (GEN_FCN (code
) (rval
, mem
, oldval
, newval
,
16479 is_weak
, mod_s
, mod_f
));
16480 cc_reg
= gen_rtx_REG (CCmode
, CC_REGNUM
);
16483 if (r_mode
!= mode
)
16484 rval
= gen_lowpart (mode
, rval
);
16485 emit_move_insn (operands
[1], rval
);
16487 x
= gen_rtx_EQ (SImode
, cc_reg
, const0_rtx
);
16488 emit_insn (gen_rtx_SET (bval
, x
));
/* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
   sequence implementing an atomic operation.  */

static void
aarch64_emit_post_barrier (enum memmodel model)
{
  const enum memmodel base_model = memmodel_base (model);

  if (is_mm_sync (model)
      && (base_model == MEMMODEL_ACQUIRE
	  || base_model == MEMMODEL_ACQ_REL
	  || base_model == MEMMODEL_SEQ_CST))
    {
      emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
    }
}
16508 /* Split a compare and swap pattern. */
16511 aarch64_split_compare_and_swap (rtx operands
[])
16513 rtx rval
, mem
, oldval
, newval
, scratch
;
16516 rtx_code_label
*label1
, *label2
;
16518 enum memmodel model
;
16521 rval
= operands
[0];
16523 oldval
= operands
[2];
16524 newval
= operands
[3];
16525 is_weak
= (operands
[4] != const0_rtx
);
16526 model_rtx
= operands
[5];
16527 scratch
= operands
[7];
16528 mode
= GET_MODE (mem
);
16529 model
= memmodel_from_int (INTVAL (model_rtx
));
16531 /* When OLDVAL is zero and we want the strong version we can emit a tighter
16534 LD[A]XR rval, [mem]
16536 ST[L]XR scratch, newval, [mem]
16537 CBNZ scratch, .label1
16540 bool strong_zero_p
= !is_weak
&& oldval
== const0_rtx
;
16545 label1
= gen_label_rtx ();
16546 emit_label (label1
);
16548 label2
= gen_label_rtx ();
16550 /* The initial load can be relaxed for a __sync operation since a final
16551 barrier will be emitted to stop code hoisting. */
16552 if (is_mm_sync (model
))
16553 aarch64_emit_load_exclusive (mode
, rval
, mem
,
16554 GEN_INT (MEMMODEL_RELAXED
));
16556 aarch64_emit_load_exclusive (mode
, rval
, mem
, model_rtx
);
16560 if (aarch64_track_speculation
)
16562 /* Emit an explicit compare instruction, so that we can correctly
16563 track the condition codes. */
16564 rtx cc_reg
= aarch64_gen_compare_reg (NE
, rval
, const0_rtx
);
16565 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
16568 x
= gen_rtx_NE (VOIDmode
, rval
, const0_rtx
);
16570 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
16571 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
16572 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
16576 cond
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
16577 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
16578 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
16579 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
16580 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
16583 aarch64_emit_store_exclusive (mode
, scratch
, mem
, newval
, model_rtx
);
16587 if (aarch64_track_speculation
)
16589 /* Emit an explicit compare instruction, so that we can correctly
16590 track the condition codes. */
16591 rtx cc_reg
= aarch64_gen_compare_reg (NE
, scratch
, const0_rtx
);
16592 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
16595 x
= gen_rtx_NE (VOIDmode
, scratch
, const0_rtx
);
16597 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
16598 gen_rtx_LABEL_REF (Pmode
, label1
), pc_rtx
);
16599 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
16603 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
16604 x
= gen_rtx_COMPARE (CCmode
, scratch
, const0_rtx
);
16605 emit_insn (gen_rtx_SET (cond
, x
));
16608 emit_label (label2
);
16609 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
16610 to set the condition flags. If this is not used it will be removed by
16614 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
16615 x
= gen_rtx_COMPARE (CCmode
, rval
, const0_rtx
);
16616 emit_insn (gen_rtx_SET (cond
, x
));
16618 /* Emit any final barrier needed for a __sync operation. */
16619 if (is_mm_sync (model
))
16620 aarch64_emit_post_barrier (model
);
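/* Illustrative sketch (excluded from the build, and not part of the back
   end): the split sequence above corresponds, at the source level, to a
   compare-and-swap retry loop such as the one below.  The strong form
   keeps retrying until the store-exclusive succeeds; the weak form may
   fail spuriously and simply report failure.  This uses only standard C11
   atomics and is an analogy, not the RTL actually emitted.  */
#if 0
#include <stdatomic.h>
#include <stdbool.h>

static bool
cas_strong_example (_Atomic int *mem, int *expected, int desired)
{
  /* Typically compiles to a LDAXR/CMP/BNE/STLXR/CBNZ loop (or a single
     CAS instruction when LSE is available), which is the shape produced
     by aarch64_split_compare_and_swap.  */
  return atomic_compare_exchange_strong (mem, expected, desired);
}
#endif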
16623 /* Split an atomic operation. */
16626 aarch64_split_atomic_op (enum rtx_code code
, rtx old_out
, rtx new_out
, rtx mem
,
16627 rtx value
, rtx model_rtx
, rtx cond
)
16629 machine_mode mode
= GET_MODE (mem
);
16630 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
16631 const enum memmodel model
= memmodel_from_int (INTVAL (model_rtx
));
16632 const bool is_sync
= is_mm_sync (model
);
16633 rtx_code_label
*label
;
16636 /* Split the atomic operation into a sequence. */
16637 label
= gen_label_rtx ();
16638 emit_label (label
);
16641 new_out
= gen_lowpart (wmode
, new_out
);
16643 old_out
= gen_lowpart (wmode
, old_out
);
16646 value
= simplify_gen_subreg (wmode
, value
, mode
, 0);
16648 /* The initial load can be relaxed for a __sync operation since a final
16649 barrier will be emitted to stop code hoisting. */
16651 aarch64_emit_load_exclusive (mode
, old_out
, mem
,
16652 GEN_INT (MEMMODEL_RELAXED
));
16654 aarch64_emit_load_exclusive (mode
, old_out
, mem
, model_rtx
);
16663 x
= gen_rtx_AND (wmode
, old_out
, value
);
16664 emit_insn (gen_rtx_SET (new_out
, x
));
16665 x
= gen_rtx_NOT (wmode
, new_out
);
16666 emit_insn (gen_rtx_SET (new_out
, x
));
16670 if (CONST_INT_P (value
))
16672 value
= GEN_INT (-INTVAL (value
));
16675 /* Fall through. */
16678 x
= gen_rtx_fmt_ee (code
, wmode
, old_out
, value
);
16679 emit_insn (gen_rtx_SET (new_out
, x
));
16683 aarch64_emit_store_exclusive (mode
, cond
, mem
,
16684 gen_lowpart (mode
, new_out
), model_rtx
);
16686 if (aarch64_track_speculation
)
16688 /* Emit an explicit compare instruction, so that we can correctly
16689 track the condition codes. */
16690 rtx cc_reg
= aarch64_gen_compare_reg (NE
, cond
, const0_rtx
);
16691 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
16694 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
16696 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
16697 gen_rtx_LABEL_REF (Pmode
, label
), pc_rtx
);
16698 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
16700 /* Emit any final barrier needed for a __sync operation. */
16702 aarch64_emit_post_barrier (model
);
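/* Illustrative sketch (excluded from the build): the atomic-op split above
   implements a load-exclusive / operate / store-exclusive retry loop.  In
   plain C the same observable effect is obtained with a compare-exchange
   loop; the snippet below models the BIC-style case (new = ~(old & value))
   and assumes nothing beyond standard C11 atomics.  */
#if 0
#include <stdatomic.h>

static unsigned int
atomic_and_not_example (_Atomic unsigned int *mem, unsigned int value)
{
  unsigned int old = atomic_load_explicit (mem, memory_order_relaxed);
  /* Retry until the update lands, mirroring the CBNZ-on-scratch loop.  */
  while (!atomic_compare_exchange_weak_explicit (mem, &old, ~(old & value),
						 memory_order_seq_cst,
						 memory_order_relaxed))
    ;
  return old;
}
#endif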
static void
aarch64_init_libfuncs (void)
{
  /* Half-precision float operations.  The compiler handles all operations
     with NULL libfuncs by converting to SFmode.  */

  /* Conversions.  */
  set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
  set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");

  /* Arithmetic.  */
  set_optab_libfunc (add_optab, HFmode, NULL);
  set_optab_libfunc (sdiv_optab, HFmode, NULL);
  set_optab_libfunc (smul_optab, HFmode, NULL);
  set_optab_libfunc (neg_optab, HFmode, NULL);
  set_optab_libfunc (sub_optab, HFmode, NULL);

  /* Comparisons.  */
  set_optab_libfunc (eq_optab, HFmode, NULL);
  set_optab_libfunc (ne_optab, HFmode, NULL);
  set_optab_libfunc (lt_optab, HFmode, NULL);
  set_optab_libfunc (le_optab, HFmode, NULL);
  set_optab_libfunc (ge_optab, HFmode, NULL);
  set_optab_libfunc (gt_optab, HFmode, NULL);
  set_optab_libfunc (unord_optab, HFmode, NULL);
}
/* Target hook for c_mode_for_suffix.  */
static machine_mode
aarch64_c_mode_for_suffix (char suffix)
{
  if (suffix == 'q')
    return TFmode;

  return VOIDmode;
}

/* We can only represent floating point constants which will fit in
   "quarter-precision" values.  These values are characterised by
   a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
   by:

     (-1)^s * (n/16) * 2^r

   Where:
     's' is the sign bit.
     'n' is an integer in the range 16 <= n <= 31.
     'r' is an integer in the range -3 <= r <= 4.  */
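/* Illustrative sketch (excluded from the build): a standalone check of the
   quarter-precision constraint described above, using only <math.h> on a
   host double.  The helper name is hypothetical; the routine that follows
   works on GCC's REAL_VALUE_TYPE instead, but the arithmetic is the same
   decomposition into (n/16) * 2^r.  */
#if 0
#include <math.h>
#include <stdbool.h>

static bool
quarter_precision_representable_example (double d)
{
  d = fabs (d);
  if (d == 0.0 || !isfinite (d))
    return false;

  /* Decompose d as (n/16) * 2^r with 16 <= n <= 31 and -3 <= r <= 4.  */
  int exp2;
  double frac = frexp (d, &exp2);	/* d = frac * 2^exp2, frac in [0.5, 1).  */
  double n = frac * 32.0;		/* so d = (n/16) * 2^(exp2 - 1).  */
  int r = exp2 - 1;

  return n == floor (n) && n >= 16.0 && n <= 31.0 && r >= -3 && r <= 4;
}
#endif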
/* Return true iff X can be represented by a quarter-precision
   floating point immediate operand X.  Note, we cannot represent 0.0.  */

bool
aarch64_float_const_representable_p (rtx x)
{
  /* This represents our current view of how many bits
     make up the mantissa.  */
  int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
  int exponent;
  unsigned HOST_WIDE_INT mantissa, mask;
  REAL_VALUE_TYPE r, m;
  bool fail;

  if (!CONST_DOUBLE_P (x))
    return false;

  if (GET_MODE (x) == VOIDmode
      || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
    return false;

  r = *CONST_DOUBLE_REAL_VALUE (x);

  /* We cannot represent infinities, NaNs or +/-zero.  We won't
     know if we have +zero until we analyse the mantissa, but we
     can reject the other invalid values.  */
  if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
      || REAL_VALUE_MINUS_ZERO (r))
    return false;

  /* Extract exponent.  */
  r = real_value_abs (&r);
  exponent = REAL_EXP (&r);

  /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
     highest (sign) bit, with a fixed binary point at bit point_pos.
     m1 holds the low part of the mantissa, m2 the high part.
     WARNING: If we ever have a representation using more than 2 * H_W_I - 1
     bits for the mantissa, this can fail (low bits will be lost).  */
  real_ldexp (&m, &r, point_pos - exponent);
  wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);

  /* If the low part of the mantissa has bits set we cannot represent
     the value.  */
  if (w.ulow () != 0)
    return false;
  /* We have rejected the lower HOST_WIDE_INT, so update our
     understanding of how many bits lie in the mantissa and
     look only at the high HOST_WIDE_INT.  */
  mantissa = w.elt (1);
  point_pos -= HOST_BITS_PER_WIDE_INT;

  /* We can only represent values with a mantissa of the form 1.xxxx.  */
  mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
  if ((mantissa & mask) != 0)
    return false;

  /* Having filtered unrepresentable values, we may now remove all
     but the highest 5 bits.  */
  mantissa >>= point_pos - 5;

  /* We cannot represent the value 0.0, so reject it.  This is handled
     elsewhere.  */
  if (mantissa == 0)
    return false;

  /* Then, as bit 4 is always set, we can mask it off, leaving
     the mantissa in the range [0, 15].  */
  mantissa &= ~(1 << 4);
  gcc_assert (mantissa <= 15);

  /* GCC internally does not use IEEE754-like encoding (where normalized
     significands are in the range [1, 2).  GCC uses [0.5, 1) (see real.c).
     Our mantissa values are shifted 4 places to the left relative to
     normalized IEEE754 so we must modify the exponent returned by REAL_EXP
     by 5 places to correct for GCC's representation.  */
  exponent = 5 - exponent;

  return (exponent >= 0 && exponent <= 7);
}
16834 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
16835 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
16836 output MOVI/MVNI, ORR or BIC immediate. */
16838 aarch64_output_simd_mov_immediate (rtx const_vector
, unsigned width
,
16839 enum simd_immediate_check which
)
16842 static char templ
[40];
16843 const char *mnemonic
;
16844 const char *shift_op
;
16845 unsigned int lane_count
= 0;
16848 struct simd_immediate_info info
;
  /* This will return true to show const_vector is legal for use as either
     an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
     It will also update INFO to show how the immediate should be generated.
     WHICH selects whether to check for MOVI/MVNI, ORR or BIC.  */
16854 is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
, which
);
16855 gcc_assert (is_valid
);
16857 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
16858 lane_count
= width
/ GET_MODE_BITSIZE (info
.elt_mode
);
16860 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
16862 gcc_assert (info
.insn
== simd_immediate_info::MOV
16863 && info
.u
.mov
.shift
== 0);
16864 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
16865 move immediate path. */
16866 if (aarch64_float_const_zero_rtx_p (info
.u
.mov
.value
))
16867 info
.u
.mov
.value
= GEN_INT (0);
16870 const unsigned int buf_size
= 20;
16871 char float_buf
[buf_size
] = {'\0'};
16872 real_to_decimal_for_mode (float_buf
,
16873 CONST_DOUBLE_REAL_VALUE (info
.u
.mov
.value
),
16874 buf_size
, buf_size
, 1, info
.elt_mode
);
16876 if (lane_count
== 1)
16877 snprintf (templ
, sizeof (templ
), "fmov\t%%d0, %s", float_buf
);
16879 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%d%c, %s",
16880 lane_count
, element_char
, float_buf
);
16885 gcc_assert (CONST_INT_P (info
.u
.mov
.value
));
16887 if (which
== AARCH64_CHECK_MOV
)
16889 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "mvni" : "movi";
16890 shift_op
= (info
.u
.mov
.modifier
== simd_immediate_info::MSL
16892 if (lane_count
== 1)
16893 snprintf (templ
, sizeof (templ
), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX
,
16894 mnemonic
, UINTVAL (info
.u
.mov
.value
));
16895 else if (info
.u
.mov
.shift
)
16896 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
16897 HOST_WIDE_INT_PRINT_HEX
", %s %d", mnemonic
, lane_count
,
16898 element_char
, UINTVAL (info
.u
.mov
.value
), shift_op
,
16901 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
16902 HOST_WIDE_INT_PRINT_HEX
, mnemonic
, lane_count
,
16903 element_char
, UINTVAL (info
.u
.mov
.value
));
16907 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
16908 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "bic" : "orr";
16909 if (info
.u
.mov
.shift
)
16910 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
16911 HOST_WIDE_INT_PRINT_DEC
", %s #%d", mnemonic
, lane_count
,
16912 element_char
, UINTVAL (info
.u
.mov
.value
), "lsl",
16915 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
16916 HOST_WIDE_INT_PRINT_DEC
, mnemonic
, lane_count
,
16917 element_char
, UINTVAL (info
.u
.mov
.value
));
16923 aarch64_output_scalar_simd_mov_immediate (rtx immediate
, scalar_int_mode mode
)
16926 /* If a floating point number was passed and we desire to use it in an
16927 integer mode do the conversion to integer. */
16928 if (CONST_DOUBLE_P (immediate
) && GET_MODE_CLASS (mode
) == MODE_INT
)
16930 unsigned HOST_WIDE_INT ival
;
16931 if (!aarch64_reinterpret_float_as_int (immediate
, &ival
))
16932 gcc_unreachable ();
16933 immediate
= gen_int_mode (ival
, mode
);
16936 machine_mode vmode
;
16937 /* use a 64 bit mode for everything except for DI/DF mode, where we use
16938 a 128 bit vector mode. */
16939 int width
= GET_MODE_BITSIZE (mode
) == 64 ? 128 : 64;
16941 vmode
= aarch64_simd_container_mode (mode
, width
);
16942 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (immediate
));
16943 return aarch64_output_simd_mov_immediate (v_op
, width
);
16946 /* Return the output string to use for moving immediate CONST_VECTOR
16947 into an SVE register. */
16950 aarch64_output_sve_mov_immediate (rtx const_vector
)
16952 static char templ
[40];
16953 struct simd_immediate_info info
;
16956 bool is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
);
16957 gcc_assert (is_valid
);
16959 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
16961 machine_mode vec_mode
= GET_MODE (const_vector
);
16962 if (aarch64_sve_pred_mode_p (vec_mode
))
16964 static char buf
[sizeof ("ptrue\t%0.N, vlNNNNN")];
16965 if (info
.insn
== simd_immediate_info::MOV
)
16967 gcc_assert (info
.u
.mov
.value
== const0_rtx
);
16968 snprintf (buf
, sizeof (buf
), "pfalse\t%%0.b");
16972 gcc_assert (info
.insn
== simd_immediate_info::PTRUE
);
16973 unsigned int total_bytes
;
16974 if (info
.u
.pattern
== AARCH64_SV_ALL
16975 && BYTES_PER_SVE_VECTOR
.is_constant (&total_bytes
))
16976 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, vl%d", element_char
,
16977 total_bytes
/ GET_MODE_SIZE (info
.elt_mode
));
16979 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, %s", element_char
,
16980 svpattern_token (info
.u
.pattern
));
16985 if (info
.insn
== simd_immediate_info::INDEX
)
16987 snprintf (templ
, sizeof (templ
), "index\t%%0.%c, #"
16988 HOST_WIDE_INT_PRINT_DEC
", #" HOST_WIDE_INT_PRINT_DEC
,
16989 element_char
, INTVAL (info
.u
.index
.base
),
16990 INTVAL (info
.u
.index
.step
));
16994 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
16996 if (aarch64_float_const_zero_rtx_p (info
.u
.mov
.value
))
16997 info
.u
.mov
.value
= GEN_INT (0);
17000 const int buf_size
= 20;
17001 char float_buf
[buf_size
] = {};
17002 real_to_decimal_for_mode (float_buf
,
17003 CONST_DOUBLE_REAL_VALUE (info
.u
.mov
.value
),
17004 buf_size
, buf_size
, 1, info
.elt_mode
);
17006 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%c, #%s",
17007 element_char
, float_buf
);
17012 snprintf (templ
, sizeof (templ
), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC
,
17013 element_char
, INTVAL (info
.u
.mov
.value
));
17017 /* Split operands into moves from op[1] + op[2] into op[0]. */
17020 aarch64_split_combinev16qi (rtx operands
[3])
17022 unsigned int dest
= REGNO (operands
[0]);
17023 unsigned int src1
= REGNO (operands
[1]);
17024 unsigned int src2
= REGNO (operands
[2]);
17025 machine_mode halfmode
= GET_MODE (operands
[1]);
17026 unsigned int halfregs
= REG_NREGS (operands
[1]);
17027 rtx destlo
, desthi
;
17029 gcc_assert (halfmode
== V16QImode
);
17031 if (src1
== dest
&& src2
== dest
+ halfregs
)
17033 /* No-op move. Can't split to nothing; emit something. */
17034 emit_note (NOTE_INSN_DELETED
);
17038 /* Preserve register attributes for variable tracking. */
17039 destlo
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
, 0);
17040 desthi
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
+ halfregs
,
17041 GET_MODE_SIZE (halfmode
));
17043 /* Special case of reversed high/low parts. */
17044 if (reg_overlap_mentioned_p (operands
[2], destlo
)
17045 && reg_overlap_mentioned_p (operands
[1], desthi
))
17047 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
17048 emit_insn (gen_xorv16qi3 (operands
[2], operands
[1], operands
[2]));
17049 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
17051 else if (!reg_overlap_mentioned_p (operands
[2], destlo
))
17053 /* Try to avoid unnecessary moves if part of the result
17054 is in the right place already. */
17056 emit_move_insn (destlo
, operands
[1]);
17057 if (src2
!= dest
+ halfregs
)
17058 emit_move_insn (desthi
, operands
[2]);
17062 if (src2
!= dest
+ halfregs
)
17063 emit_move_insn (desthi
, operands
[2]);
17065 emit_move_insn (destlo
, operands
[1]);
17069 /* vec_perm support. */
17071 struct expand_vec_perm_d
17073 rtx target
, op0
, op1
;
17074 vec_perm_indices perm
;
17075 machine_mode vmode
;
17076 unsigned int vec_flags
;
17081 /* Generate a variable permutation. */
17084 aarch64_expand_vec_perm_1 (rtx target
, rtx op0
, rtx op1
, rtx sel
)
17086 machine_mode vmode
= GET_MODE (target
);
17087 bool one_vector_p
= rtx_equal_p (op0
, op1
);
17089 gcc_checking_assert (vmode
== V8QImode
|| vmode
== V16QImode
);
17090 gcc_checking_assert (GET_MODE (op0
) == vmode
);
17091 gcc_checking_assert (GET_MODE (op1
) == vmode
);
17092 gcc_checking_assert (GET_MODE (sel
) == vmode
);
17093 gcc_checking_assert (TARGET_SIMD
);
17097 if (vmode
== V8QImode
)
17099 /* Expand the argument to a V16QI mode by duplicating it. */
17100 rtx pair
= gen_reg_rtx (V16QImode
);
17101 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op0
));
17102 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
17106 emit_insn (gen_aarch64_tbl1v16qi (target
, op0
, sel
));
17113 if (vmode
== V8QImode
)
17115 pair
= gen_reg_rtx (V16QImode
);
17116 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op1
));
17117 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
17121 pair
= gen_reg_rtx (OImode
);
17122 emit_insn (gen_aarch64_combinev16qi (pair
, op0
, op1
));
17123 emit_insn (gen_aarch64_tbl2v16qi (target
, pair
, sel
));
/* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
   NELT is the number of elements in the vector.  */

void
aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
			 unsigned int nelt)
{
  machine_mode vmode = GET_MODE (target);
  bool one_vector_p = rtx_equal_p (op0, op1);
  rtx mask;

  /* The TBL instruction does not use a modulo index, so we must take care
     of that ourselves.  */
  mask = aarch64_simd_gen_const_vector_dup (vmode,
					    one_vector_p ? nelt - 1 : 2 * nelt - 1);
  sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);

  /* For big-endian, we also need to reverse the index within the vector
     (but not which vector).  */
  if (BYTES_BIG_ENDIAN)
    {
      /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
      if (!one_vector_p)
	mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
      sel = expand_simple_binop (vmode, XOR, sel, mask,
				 NULL, 0, OPTAB_LIB_WIDEN);
    }
  aarch64_expand_vec_perm_1 (target, op0, op1, sel);
}
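/* Illustrative sketch (excluded from the build): scalar model of the index
   adjustment performed above for Advanced SIMD TBL.  Indices are reduced
   with an AND because TBL has no modulo behaviour, and on big-endian
   targets the index is additionally reflected within the vector with an
   XOR.  The helper name is illustrative only.  */
#if 0
#include <stdbool.h>

static unsigned int
tbl_index_adjust_example (unsigned int idx, unsigned int nelt,
			  bool one_vector_p, bool big_endian_p)
{
  /* Wrap the index to the table size (nelt or 2 * nelt entries).  */
  idx &= one_vector_p ? nelt - 1 : 2 * nelt - 1;

  /* On big-endian, reverse the lane numbering within each input vector
     without changing which vector is selected.  */
  if (big_endian_p)
    idx ^= nelt - 1;

  return idx;
}
#endif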
/* Generate (set TARGET (unspec [OP0 OP1] CODE)).  */

static void
emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
{
  emit_insn (gen_rtx_SET (target,
			  gen_rtx_UNSPEC (GET_MODE (target),
					  gen_rtvec (2, op0, op1), code)));
}

/* Expand an SVE vec_perm with the given operands.  */

void
aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
{
  machine_mode data_mode = GET_MODE (target);
  machine_mode sel_mode = GET_MODE (sel);
  /* Enforced by the pattern condition.  */
  int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();

  /* Note: vec_perm indices are supposed to wrap when they go beyond the
     size of the two value vectors, i.e. the upper bits of the indices
     are effectively ignored.  SVE TBL instead produces 0 for any
     out-of-range indices, so we need to modulo all the vec_perm indices
     to ensure they are all in range.  */
  rtx sel_reg = force_reg (sel_mode, sel);

  /* Check if the sel only references the first values vector.  */
  if (GET_CODE (sel) == CONST_VECTOR
      && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
    {
      emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
      return;
    }

  /* Check if the two values vectors are the same.  */
  if (rtx_equal_p (op0, op1))
    {
      rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
      rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
					 NULL, 0, OPTAB_DIRECT);
      emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
      return;
    }

  /* Run TBL on for each value vector and combine the results.  */

  rtx res0 = gen_reg_rtx (data_mode);
  rtx res1 = gen_reg_rtx (data_mode);
  rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
  if (GET_CODE (sel) != CONST_VECTOR
      || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
    {
      rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
						       2 * nunits - 1);
      sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
				     NULL, 0, OPTAB_DIRECT);
    }
  emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
  rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
				     NULL, 0, OPTAB_DIRECT);
  emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
  if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
    emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
  else
    emit_unspec2 (target, UNSPEC_IORF, res0, res1);
}
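/* Illustrative sketch (excluded from the build): element-wise model of the
   two-TBL fallback above.  Each input vector is consulted with the wrapped
   selector; out-of-range indices yield zero, as SVE TBL does, so ORing the
   two partial results reconstructs the full permutation.  Array-based and
   purely illustrative.  */
#if 0
static void
sve_two_tbl_example (const unsigned char *op0, const unsigned char *op1,
		     const unsigned int *sel, unsigned char *target,
		     unsigned int nunits)
{
  for (unsigned int i = 0; i < nunits; i++)
    {
      unsigned int idx = sel[i] & (2 * nunits - 1);	/* Wrap the index.  */
      /* First TBL: first operand, zero if the index points past it.  */
      unsigned char res0 = idx < nunits ? op0[idx] : 0;
      /* Second TBL: second operand, indexed by (idx - nunits).  */
      unsigned char res1 = idx >= nunits ? op1[idx - nunits] : 0;
      target[i] = res0 | res1;				/* Combine with OR.  */
    }
}
#endif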
17226 /* Recognize patterns suitable for the TRN instructions. */
17228 aarch64_evpc_trn (struct expand_vec_perm_d
*d
)
17231 poly_uint64 nelt
= d
->perm
.length ();
17232 rtx out
, in0
, in1
, x
;
17233 machine_mode vmode
= d
->vmode
;
17235 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
17238 /* Note that these are little-endian tests.
17239 We correct for big-endian later. */
17240 if (!d
->perm
[0].is_constant (&odd
)
17241 || (odd
!= 0 && odd
!= 1)
17242 || !d
->perm
.series_p (0, 2, odd
, 2)
17243 || !d
->perm
.series_p (1, 2, nelt
+ odd
, 2))
17252 /* We don't need a big-endian lane correction for SVE; see the comment
17253 at the head of aarch64-sve.md for details. */
17254 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
17256 x
= in0
, in0
= in1
, in1
= x
;
17261 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
17262 odd
? UNSPEC_TRN2
: UNSPEC_TRN1
));
17266 /* Recognize patterns suitable for the UZP instructions. */
17268 aarch64_evpc_uzp (struct expand_vec_perm_d
*d
)
17271 rtx out
, in0
, in1
, x
;
17272 machine_mode vmode
= d
->vmode
;
17274 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
17277 /* Note that these are little-endian tests.
17278 We correct for big-endian later. */
17279 if (!d
->perm
[0].is_constant (&odd
)
17280 || (odd
!= 0 && odd
!= 1)
17281 || !d
->perm
.series_p (0, 1, odd
, 2))
17290 /* We don't need a big-endian lane correction for SVE; see the comment
17291 at the head of aarch64-sve.md for details. */
17292 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
17294 x
= in0
, in0
= in1
, in1
= x
;
17299 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
17300 odd
? UNSPEC_UZP2
: UNSPEC_UZP1
));
17304 /* Recognize patterns suitable for the ZIP instructions. */
17306 aarch64_evpc_zip (struct expand_vec_perm_d
*d
)
17309 poly_uint64 nelt
= d
->perm
.length ();
17310 rtx out
, in0
, in1
, x
;
17311 machine_mode vmode
= d
->vmode
;
17313 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
17316 /* Note that these are little-endian tests.
17317 We correct for big-endian later. */
17318 poly_uint64 first
= d
->perm
[0];
17319 if ((maybe_ne (first
, 0U) && maybe_ne (first
* 2, nelt
))
17320 || !d
->perm
.series_p (0, 2, first
, 1)
17321 || !d
->perm
.series_p (1, 2, first
+ nelt
, 1))
17323 high
= maybe_ne (first
, 0U);
17331 /* We don't need a big-endian lane correction for SVE; see the comment
17332 at the head of aarch64-sve.md for details. */
17333 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
17335 x
= in0
, in0
= in1
, in1
= x
;
17340 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
17341 high
? UNSPEC_ZIP2
: UNSPEC_ZIP1
));
17345 /* Recognize patterns for the EXT insn. */
17348 aarch64_evpc_ext (struct expand_vec_perm_d
*d
)
17350 HOST_WIDE_INT location
;
17353 /* The first element always refers to the first vector.
17354 Check if the extracted indices are increasing by one. */
17355 if (d
->vec_flags
== VEC_SVE_PRED
17356 || !d
->perm
[0].is_constant (&location
)
17357 || !d
->perm
.series_p (0, 1, location
, 1))
17364 /* The case where (location == 0) is a no-op for both big- and little-endian,
17365 and is removed by the mid-end at optimization levels -O1 and higher.
17367 We don't need a big-endian lane correction for SVE; see the comment
17368 at the head of aarch64-sve.md for details. */
17369 if (BYTES_BIG_ENDIAN
&& location
!= 0 && d
->vec_flags
== VEC_ADVSIMD
)
17371 /* After setup, we want the high elements of the first vector (stored
17372 at the LSB end of the register), and the low elements of the second
17373 vector (stored at the MSB end of the register). So swap. */
17374 std::swap (d
->op0
, d
->op1
);
17375 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
17376 to_constant () is safe since this is restricted to Advanced SIMD
17378 location
= d
->perm
.length ().to_constant () - location
;
17381 offset
= GEN_INT (location
);
17382 emit_set_insn (d
->target
,
17383 gen_rtx_UNSPEC (d
->vmode
,
17384 gen_rtvec (3, d
->op0
, d
->op1
, offset
),
17389 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
17390 within each 64-bit, 32-bit or 16-bit granule. */
17393 aarch64_evpc_rev_local (struct expand_vec_perm_d
*d
)
17395 HOST_WIDE_INT diff
;
17396 unsigned int i
, size
, unspec
;
17397 machine_mode pred_mode
;
17399 if (d
->vec_flags
== VEC_SVE_PRED
17400 || !d
->one_vector_p
17401 || !d
->perm
[0].is_constant (&diff
))
17404 size
= (diff
+ 1) * GET_MODE_UNIT_SIZE (d
->vmode
);
17407 unspec
= UNSPEC_REV64
;
17408 pred_mode
= VNx2BImode
;
17410 else if (size
== 4)
17412 unspec
= UNSPEC_REV32
;
17413 pred_mode
= VNx4BImode
;
17415 else if (size
== 2)
17417 unspec
= UNSPEC_REV16
;
17418 pred_mode
= VNx8BImode
;
17423 unsigned int step
= diff
+ 1;
17424 for (i
= 0; i
< step
; ++i
)
17425 if (!d
->perm
.series_p (i
, step
, diff
- i
, step
))
17432 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), unspec
);
17433 if (d
->vec_flags
== VEC_SVE_DATA
)
17435 rtx pred
= aarch64_ptrue_reg (pred_mode
);
17436 src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (2, pred
, src
),
17437 UNSPEC_MERGE_PTRUE
);
17439 emit_set_insn (d
->target
, src
);
17443 /* Recognize patterns for the REV insn, which reverses elements within
17447 aarch64_evpc_rev_global (struct expand_vec_perm_d
*d
)
17449 poly_uint64 nelt
= d
->perm
.length ();
17451 if (!d
->one_vector_p
|| d
->vec_flags
!= VEC_SVE_DATA
)
17454 if (!d
->perm
.series_p (0, 1, nelt
- 1, -1))
17461 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), UNSPEC_REV
);
17462 emit_set_insn (d
->target
, src
);
17467 aarch64_evpc_dup (struct expand_vec_perm_d
*d
)
17469 rtx out
= d
->target
;
17472 machine_mode vmode
= d
->vmode
;
17475 if (d
->vec_flags
== VEC_SVE_PRED
17476 || d
->perm
.encoding ().encoded_nelts () != 1
17477 || !d
->perm
[0].is_constant (&elt
))
17480 if (d
->vec_flags
== VEC_SVE_DATA
&& elt
>= 64 * GET_MODE_UNIT_SIZE (vmode
))
17487 /* The generic preparation in aarch64_expand_vec_perm_const_1
17488 swaps the operand order and the permute indices if it finds
17489 d->perm[0] to be in the second operand. Thus, we can always
17490 use d->op0 and need not do any extra arithmetic to get the
17491 correct lane number. */
17493 lane
= GEN_INT (elt
); /* The pattern corrects for big-endian. */
17495 rtx parallel
= gen_rtx_PARALLEL (vmode
, gen_rtvec (1, lane
));
17496 rtx select
= gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode
), in0
, parallel
);
17497 emit_set_insn (out
, gen_rtx_VEC_DUPLICATE (vmode
, select
));
17502 aarch64_evpc_tbl (struct expand_vec_perm_d
*d
)
17504 rtx rperm
[MAX_COMPILE_TIME_VEC_BYTES
], sel
;
17505 machine_mode vmode
= d
->vmode
;
17507 /* Make sure that the indices are constant. */
17508 unsigned int encoded_nelts
= d
->perm
.encoding ().encoded_nelts ();
17509 for (unsigned int i
= 0; i
< encoded_nelts
; ++i
)
17510 if (!d
->perm
[i
].is_constant ())
17516 /* Generic code will try constant permutation twice. Once with the
17517 original mode and again with the elements lowered to QImode.
17518 So wait and don't do the selector expansion ourselves. */
17519 if (vmode
!= V8QImode
&& vmode
!= V16QImode
)
17522 /* to_constant is safe since this routine is specific to Advanced SIMD
17524 unsigned int nelt
= d
->perm
.length ().to_constant ();
17525 for (unsigned int i
= 0; i
< nelt
; ++i
)
17526 /* If big-endian and two vectors we end up with a weird mixed-endian
17527 mode on NEON. Reverse the index within each word but not the word
17528 itself. to_constant is safe because we checked is_constant above. */
17529 rperm
[i
] = GEN_INT (BYTES_BIG_ENDIAN
17530 ? d
->perm
[i
].to_constant () ^ (nelt
- 1)
17531 : d
->perm
[i
].to_constant ());
17533 sel
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
17534 sel
= force_reg (vmode
, sel
);
17536 aarch64_expand_vec_perm_1 (d
->target
, d
->op0
, d
->op1
, sel
);
17540 /* Try to implement D using an SVE TBL instruction. */
17543 aarch64_evpc_sve_tbl (struct expand_vec_perm_d
*d
)
17545 unsigned HOST_WIDE_INT nelt
;
17547 /* Permuting two variable-length vectors could overflow the
17549 if (!d
->one_vector_p
&& !d
->perm
.length ().is_constant (&nelt
))
17555 machine_mode sel_mode
= mode_for_int_vector (d
->vmode
).require ();
17556 rtx sel
= vec_perm_indices_to_rtx (sel_mode
, d
->perm
);
17557 if (d
->one_vector_p
)
17558 emit_unspec2 (d
->target
, UNSPEC_TBL
, d
->op0
, force_reg (sel_mode
, sel
));
17560 aarch64_expand_sve_vec_perm (d
->target
, d
->op0
, d
->op1
, sel
);
17565 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d
*d
)
17567 /* The pattern matching functions above are written to look for a small
17568 number to begin the sequence (0, 1, N/2). If we begin with an index
17569 from the second operand, we can swap the operands. */
17570 poly_int64 nelt
= d
->perm
.length ();
17571 if (known_ge (d
->perm
[0], nelt
))
17573 d
->perm
.rotate_inputs (1);
17574 std::swap (d
->op0
, d
->op1
);
17577 if ((d
->vec_flags
== VEC_ADVSIMD
17578 || d
->vec_flags
== VEC_SVE_DATA
17579 || d
->vec_flags
== VEC_SVE_PRED
)
17580 && known_gt (nelt
, 1))
17582 if (aarch64_evpc_rev_local (d
))
17584 else if (aarch64_evpc_rev_global (d
))
17586 else if (aarch64_evpc_ext (d
))
17588 else if (aarch64_evpc_dup (d
))
17590 else if (aarch64_evpc_zip (d
))
17592 else if (aarch64_evpc_uzp (d
))
17594 else if (aarch64_evpc_trn (d
))
17596 if (d
->vec_flags
== VEC_SVE_DATA
)
17597 return aarch64_evpc_sve_tbl (d
);
17598 else if (d
->vec_flags
== VEC_ADVSIMD
)
17599 return aarch64_evpc_tbl (d
);
17604 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
17607 aarch64_vectorize_vec_perm_const (machine_mode vmode
, rtx target
, rtx op0
,
17608 rtx op1
, const vec_perm_indices
&sel
)
17610 struct expand_vec_perm_d d
;
17612 /* Check whether the mask can be applied to a single vector. */
17613 if (sel
.ninputs () == 1
17614 || (op0
&& rtx_equal_p (op0
, op1
)))
17615 d
.one_vector_p
= true;
17616 else if (sel
.all_from_input_p (0))
17618 d
.one_vector_p
= true;
17621 else if (sel
.all_from_input_p (1))
17623 d
.one_vector_p
= true;
17627 d
.one_vector_p
= false;
17629 d
.perm
.new_vector (sel
.encoding (), d
.one_vector_p
? 1 : 2,
17630 sel
.nelts_per_input ());
17632 d
.vec_flags
= aarch64_classify_vector_mode (d
.vmode
);
17636 d
.testing_p
= !target
;
17639 return aarch64_expand_vec_perm_const_1 (&d
);
17641 rtx_insn
*last
= get_last_insn ();
17642 bool ret
= aarch64_expand_vec_perm_const_1 (&d
);
17643 gcc_assert (last
== get_last_insn ());
/* Generate a byte permute mask for a register of mode MODE,
   which has NUNITS units.  */

rtx
aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
{
  /* We have to reverse each vector because we don't have
     a permuted load that can reverse-load according to ABI rules.  */
  rtx mask;
  rtvec v = rtvec_alloc (16);
  unsigned int i, j;
  unsigned int usize = GET_MODE_UNIT_SIZE (mode);

  gcc_assert (BYTES_BIG_ENDIAN);
  gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));

  for (i = 0; i < nunits; i++)
    for (j = 0; j < usize; j++)
      RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
  mask = gen_rtx_CONST_VECTOR (V16QImode, v);
  return force_reg (V16QImode, mask);
}
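/* Illustrative sketch (excluded from the build): the byte values placed in
   the permute mask above.  For a vector of NUNITS elements of USIZE bytes
   each, byte J of element I selects source byte (I + 1) * USIZE - 1 - J,
   i.e. the bytes of each element are reversed in place while the element
   order is kept.  */
#if 0
static void
reverse_mask_bytes_example (unsigned char mask[16],
			    unsigned int nunits, unsigned int usize)
{
  for (unsigned int i = 0; i < nunits; i++)
    for (unsigned int j = 0; j < usize; j++)
      mask[i * usize + j] = (unsigned char) ((i + 1) * usize - 1 - j);
}
#endif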
17671 /* Return true if X is a valid second operand for the SVE instruction
17672 that implements integer comparison OP_CODE. */
17675 aarch64_sve_cmp_operand_p (rtx_code op_code
, rtx x
)
17677 if (register_operand (x
, VOIDmode
))
17686 return aarch64_sve_cmp_immediate_p (x
, false);
17693 return aarch64_sve_cmp_immediate_p (x
, true);
17695 gcc_unreachable ();
17699 /* Use predicated SVE instructions to implement the equivalent of:
17703 given that PTRUE is an all-true predicate of the appropriate mode. */
17706 aarch64_emit_sve_ptrue_op (rtx target
, rtx ptrue
, rtx op
)
17708 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (target
),
17709 gen_rtvec (2, ptrue
, op
),
17710 UNSPEC_MERGE_PTRUE
);
17711 rtx_insn
*insn
= emit_set_insn (target
, unspec
);
17712 set_unique_reg_note (insn
, REG_EQUAL
, copy_rtx (op
));
17715 /* Likewise, but also clobber the condition codes. */
17718 aarch64_emit_sve_ptrue_op_cc (rtx target
, rtx ptrue
, rtx op
)
17720 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (target
),
17721 gen_rtvec (2, ptrue
, op
),
17722 UNSPEC_MERGE_PTRUE
);
17723 rtx_insn
*insn
= emit_insn (gen_set_clobber_cc_nzc (target
, unspec
));
17724 set_unique_reg_note (insn
, REG_EQUAL
, copy_rtx (op
));
17727 /* Return the UNSPEC_COND_* code for comparison CODE. */
17729 static unsigned int
17730 aarch64_unspec_cond_code (rtx_code code
)
17735 return UNSPEC_COND_FCMNE
;
17737 return UNSPEC_COND_FCMEQ
;
17739 return UNSPEC_COND_FCMLT
;
17741 return UNSPEC_COND_FCMGT
;
17743 return UNSPEC_COND_FCMLE
;
17745 return UNSPEC_COND_FCMGE
;
17747 gcc_unreachable ();
17753 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
17755 where <X> is the operation associated with comparison CODE. This form
17756 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
17757 semantics, such as when PRED might not be all-true and when comparing
17758 inactive lanes could have side effects. */
17761 aarch64_emit_sve_predicated_cond (rtx target
, rtx_code code
,
17762 rtx pred
, rtx op0
, rtx op1
)
17764 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (pred
),
17765 gen_rtvec (3, pred
, op0
, op1
),
17766 aarch64_unspec_cond_code (code
));
17767 emit_set_insn (target
, unspec
);
17770 /* Expand an SVE integer comparison using the SVE equivalent of:
17772 (set TARGET (CODE OP0 OP1)). */
17775 aarch64_expand_sve_vec_cmp_int (rtx target
, rtx_code code
, rtx op0
, rtx op1
)
17777 machine_mode pred_mode
= GET_MODE (target
);
17778 machine_mode data_mode
= GET_MODE (op0
);
17780 if (!aarch64_sve_cmp_operand_p (code
, op1
))
17781 op1
= force_reg (data_mode
, op1
);
17783 rtx ptrue
= aarch64_ptrue_reg (pred_mode
);
17784 rtx cond
= gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
);
17785 aarch64_emit_sve_ptrue_op_cc (target
, ptrue
, cond
);
17788 /* Emit the SVE equivalent of:
17790 (set TMP1 (CODE1 OP0 OP1))
17791 (set TMP2 (CODE2 OP0 OP1))
17792 (set TARGET (ior:PRED_MODE TMP1 TMP2))
17794 PTRUE is an all-true predicate with the same mode as TARGET. */
17797 aarch64_emit_sve_or_conds (rtx target
, rtx_code code1
, rtx_code code2
,
17798 rtx ptrue
, rtx op0
, rtx op1
)
17800 machine_mode pred_mode
= GET_MODE (ptrue
);
17801 rtx tmp1
= gen_reg_rtx (pred_mode
);
17802 aarch64_emit_sve_ptrue_op (tmp1
, ptrue
,
17803 gen_rtx_fmt_ee (code1
, pred_mode
, op0
, op1
));
17804 rtx tmp2
= gen_reg_rtx (pred_mode
);
17805 aarch64_emit_sve_ptrue_op (tmp2
, ptrue
,
17806 gen_rtx_fmt_ee (code2
, pred_mode
, op0
, op1
));
17807 aarch64_emit_binop (target
, ior_optab
, tmp1
, tmp2
);
17810 /* Emit the SVE equivalent of:
17812 (set TMP (CODE OP0 OP1))
17813 (set TARGET (not TMP))
17815 PTRUE is an all-true predicate with the same mode as TARGET. */
17818 aarch64_emit_sve_inverted_cond (rtx target
, rtx ptrue
, rtx_code code
,
17821 machine_mode pred_mode
= GET_MODE (ptrue
);
17822 rtx tmp
= gen_reg_rtx (pred_mode
);
17823 aarch64_emit_sve_ptrue_op (tmp
, ptrue
,
17824 gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
));
17825 aarch64_emit_unop (target
, one_cmpl_optab
, tmp
);
17828 /* Expand an SVE floating-point comparison using the SVE equivalent of:
17830 (set TARGET (CODE OP0 OP1))
17832 If CAN_INVERT_P is true, the caller can also handle inverted results;
17833 return true if the result is in fact inverted. */
17836 aarch64_expand_sve_vec_cmp_float (rtx target
, rtx_code code
,
17837 rtx op0
, rtx op1
, bool can_invert_p
)
17839 machine_mode pred_mode
= GET_MODE (target
);
17840 machine_mode data_mode
= GET_MODE (op0
);
17842 rtx ptrue
= aarch64_ptrue_reg (pred_mode
);
17846 /* UNORDERED has no immediate form. */
17847 op1
= force_reg (data_mode
, op1
);
17856 /* There is native support for the comparison. */
17857 rtx cond
= gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
);
17858 aarch64_emit_sve_ptrue_op (target
, ptrue
, cond
);
17863 /* This is a trapping operation (LT or GT). */
17864 aarch64_emit_sve_or_conds (target
, LT
, GT
, ptrue
, op0
, op1
);
17868 if (!flag_trapping_math
)
17870 /* This would trap for signaling NaNs. */
17871 op1
= force_reg (data_mode
, op1
);
17872 aarch64_emit_sve_or_conds (target
, UNORDERED
, EQ
, ptrue
, op0
, op1
);
17880 if (flag_trapping_math
)
17882 /* Work out which elements are ordered. */
17883 rtx ordered
= gen_reg_rtx (pred_mode
);
17884 op1
= force_reg (data_mode
, op1
);
17885 aarch64_emit_sve_inverted_cond (ordered
, ptrue
, UNORDERED
, op0
, op1
);
17887 /* Test the opposite condition for the ordered elements,
17888 then invert the result. */
17892 code
= reverse_condition_maybe_unordered (code
);
17895 aarch64_emit_sve_predicated_cond (target
, code
,
17896 ordered
, op0
, op1
);
17899 rtx tmp
= gen_reg_rtx (pred_mode
);
17900 aarch64_emit_sve_predicated_cond (tmp
, code
, ordered
, op0
, op1
);
17901 aarch64_emit_unop (target
, one_cmpl_optab
, tmp
);
17907 /* ORDERED has no immediate form. */
17908 op1
= force_reg (data_mode
, op1
);
17912 gcc_unreachable ();
17915 /* There is native support for the inverse comparison. */
17916 code
= reverse_condition_maybe_unordered (code
);
17919 rtx cond
= gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
);
17920 aarch64_emit_sve_ptrue_op (target
, ptrue
, cond
);
17923 aarch64_emit_sve_inverted_cond (target
, ptrue
, code
, op0
, op1
);
17927 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
17928 of the data being selected and CMP_MODE is the mode of the values being
17932 aarch64_expand_sve_vcond (machine_mode data_mode
, machine_mode cmp_mode
,
17935 machine_mode pred_mode
17936 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode
),
17937 GET_MODE_SIZE (cmp_mode
)).require ();
17938 rtx pred
= gen_reg_rtx (pred_mode
);
17939 if (FLOAT_MODE_P (cmp_mode
))
17941 if (aarch64_expand_sve_vec_cmp_float (pred
, GET_CODE (ops
[3]),
17942 ops
[4], ops
[5], true))
17943 std::swap (ops
[1], ops
[2]);
17946 aarch64_expand_sve_vec_cmp_int (pred
, GET_CODE (ops
[3]), ops
[4], ops
[5]);
17948 rtvec vec
= gen_rtvec (3, pred
, ops
[1], ops
[2]);
17949 emit_set_insn (ops
[0], gen_rtx_UNSPEC (data_mode
, vec
, UNSPEC_SEL
));
/* Implement TARGET_MODES_TIEABLE_P.  In principle we should always return
   true.  However due to issues with register allocation it is preferable
   to avoid tying integer scalar and FP scalar modes.  Executing integer
   operations in general registers is better than treating them as scalar
   vector operations.  This reduces latency and avoids redundant int<->FP
   moves.  So tie modes if they are either the same class, or vector modes
   with other vector modes, vector structs or any scalar mode.  */

static bool
aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
{
  if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
    return true;

  /* We specifically want to allow elements of "structure" modes to
     be tieable to the structure.  This more general condition allows
     other rarer situations too.  The reason we don't extend this to
     predicate modes is that there are no predicate structure modes
     nor any specific instructions for extracting part of a predicate
     register.  */
  if (aarch64_vector_data_mode_p (mode1)
      && aarch64_vector_data_mode_p (mode2))
    return true;

  /* Also allow any scalar modes with vectors.  */
  if (aarch64_vector_mode_supported_p (mode1)
      || aarch64_vector_mode_supported_p (mode2))
    return true;

  return false;
}
/* Return a new RTX holding the result of moving POINTER forward by
   AMOUNT bytes.  */

static rtx
aarch64_move_pointer (rtx pointer, poly_int64 amount)
{
  rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);

  return adjust_automodify_address (pointer, GET_MODE (pointer),
				    next, amount);
}

/* Return a new RTX holding the result of moving POINTER forward by the
   size of the mode it points to.  */

static rtx
aarch64_progress_pointer (rtx pointer)
{
  return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
}

/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
   MODE bytes.  */

static void
aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
					      machine_mode mode)
{
  rtx reg = gen_reg_rtx (mode);

  /* "Cast" the pointers to the correct mode.  */
  *src = adjust_address (*src, mode, 0);
  *dst = adjust_address (*dst, mode, 0);
  /* Emit the memcpy.  */
  emit_move_insn (reg, *src);
  emit_move_insn (*dst, reg);
  /* Move the pointers forward.  */
  *src = aarch64_progress_pointer (*src);
  *dst = aarch64_progress_pointer (*dst);
}
/* Expand cpymem, as if from a __builtin_memcpy.  Return true if
   we succeed, otherwise return false.  */

bool
aarch64_expand_cpymem (rtx *operands)
{
  int n, mode_bits;
  rtx dst = operands[0];
  rtx src = operands[1];
  rtx base;
  machine_mode cur_mode = BLKmode, next_mode;
  bool speed_p = !optimize_function_for_size_p (cfun);

  /* When optimizing for size, give a better estimate of the length of a
     memcpy call, but use the default otherwise.  Moves larger than 8 bytes
     will always require an even number of instructions to do now.  And each
     operation requires both a load+store, so divide the max number by 2.  */
  int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;

  /* We can't do anything smart if the amount to copy is not constant.  */
  if (!CONST_INT_P (operands[2]))
    return false;

  n = INTVAL (operands[2]);

  /* Try to keep the number of instructions low.  For all cases we will do at
     most two moves for the residual amount, since we'll always overlap the
     remainder.  */
  if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
    return false;

  base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
  dst = adjust_automodify_address (dst, VOIDmode, base, 0);

  base = copy_to_mode_reg (Pmode, XEXP (src, 0));
  src = adjust_automodify_address (src, VOIDmode, base, 0);

  /* Convert n to bits to make the rest of the code simpler.  */
  n = n * BITS_PER_UNIT;

  /* Maximum amount to copy in one go.  The AArch64 back-end has integer modes
     larger than TImode, but we should not use them for loads/stores here.  */
  const int copy_limit = GET_MODE_BITSIZE (TImode);

  while (n > 0)
    {
      /* Find the largest mode in which to do the copy in without over reading
	 or writing.  */
      opt_scalar_int_mode mode_iter;
      FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
	if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
	  cur_mode = mode_iter.require ();

      gcc_assert (cur_mode != BLKmode);

      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
      aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);

      n -= mode_bits;

      /* Do certain trailing copies as overlapping if it's going to be
	 cheaper.  i.e. less instructions to do so.  For instance doing a 15
	 byte copy it's more efficient to do two overlapping 8 byte copies than
	 8 + 6 + 1.  */
      if (n > 0 && n <= 8 * BITS_PER_UNIT)
	{
	  next_mode = smallest_mode_for_size (n, MODE_INT);
	  int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
	  src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
	  dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
	  n = n_bits;
	}
    }

  return true;
}
/* Split a DImode store of a CONST_INT SRC to MEM DST as two
   SImode stores.  Handle the case when the constant has identical
   bottom and top halves.  This is beneficial when the two stores can be
   merged into an STP and we avoid synthesising potentially expensive
   immediates twice.  Return true if such a split is possible.  */

bool
aarch64_split_dimode_const_store (rtx dst, rtx src)
{
  rtx lo = gen_lowpart (SImode, src);
  rtx hi = gen_highpart_mode (SImode, DImode, src);

  bool size_p = optimize_function_for_size_p (cfun);

  if (!rtx_equal_p (lo, hi))
    return false;

  unsigned int orig_cost
    = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
  unsigned int lo_cost
    = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);

  /* We want to transform:
     MOV	x1, 49370
     MOVK	x1, 0x140, lsl 16
     MOVK	x1, 0xc0da, lsl 32
     MOVK	x1, 0x140, lsl 48
     STR	x1, [x0]
   into:
     MOV	w1, 49370
     MOVK	w1, 0x140, lsl 16
     STP	w1, w1, [x0]
   So we want to perform this only when we save two instructions
   or more.  When optimizing for size, however, accept any code size
   savings we can.  */
  if (size_p && orig_cost <= lo_cost)
    return false;

  if (!size_p
      && (orig_cost <= lo_cost + 1))
    return false;

  rtx mem_lo = adjust_address (dst, SImode, 0);
  if (!aarch64_mem_pair_operand (mem_lo, SImode))
    return false;

  rtx tmp_reg = gen_reg_rtx (SImode);
  aarch64_expand_mov_immediate (tmp_reg, lo);
  rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
  /* Don't emit an explicit store pair as this may not be always profitable.
     Let the sched-fusion logic decide whether to merge them.  */
  emit_move_insn (mem_lo, tmp_reg);
  emit_move_insn (mem_hi, tmp_reg);

  return true;
}
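/* Illustrative sketch (excluded from the build): the "identical halves"
   test that gates the split above, applied to a host integer.  A 64-bit
   constant whose low and high 32-bit halves are equal can be synthesised
   once in a W register and stored twice, making it an STP candidate.  */
#if 0
#include <stdbool.h>
#include <stdint.h>

static bool
dimode_halves_identical_example (uint64_t val)
{
  uint32_t lo = (uint32_t) val;
  uint32_t hi = (uint32_t) (val >> 32);
  return lo == hi;	/* E.g. 0x0140c0da0140c0da qualifies.  */
}
#endif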
/* Generate RTL for a conditional branch with rtx comparison CODE in
   mode CC_MODE.  The destination of the unlikely conditional branch
   is LABEL_REF.  */

void
aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
			      rtx label_ref)
{
  rtx x;
  x = gen_rtx_fmt_ee (code, VOIDmode,
		      gen_rtx_REG (cc_mode, CC_REGNUM),
		      const0_rtx);

  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
			    gen_rtx_LABEL_REF (VOIDmode, label_ref),
			    pc_rtx);
  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
}
18178 /* Generate DImode scratch registers for 128-bit (TImode) addition.
18180 OP1 represents the TImode destination operand 1
18181 OP2 represents the TImode destination operand 2
18182 LOW_DEST represents the low half (DImode) of TImode operand 0
18183 LOW_IN1 represents the low half (DImode) of TImode operand 1
18184 LOW_IN2 represents the low half (DImode) of TImode operand 2
18185 HIGH_DEST represents the high half (DImode) of TImode operand 0
18186 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18187 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18190 aarch64_addti_scratch_regs (rtx op1
, rtx op2
, rtx
*low_dest
,
18191 rtx
*low_in1
, rtx
*low_in2
,
18192 rtx
*high_dest
, rtx
*high_in1
,
18195 *low_dest
= gen_reg_rtx (DImode
);
18196 *low_in1
= gen_lowpart (DImode
, op1
);
18197 *low_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
18198 subreg_lowpart_offset (DImode
, TImode
));
18199 *high_dest
= gen_reg_rtx (DImode
);
18200 *high_in1
= gen_highpart (DImode
, op1
);
18201 *high_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
18202 subreg_highpart_offset (DImode
, TImode
));
18205 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
18207 This function differs from 'arch64_addti_scratch_regs' in that
18208 OP1 can be an immediate constant (zero). We must call
18209 subreg_highpart_offset with DImode and TImode arguments, otherwise
18210 VOIDmode will be used for the const_int which generates an internal
18211 error from subreg_size_highpart_offset which does not expect a size of zero.
18213 OP1 represents the TImode destination operand 1
18214 OP2 represents the TImode destination operand 2
18215 LOW_DEST represents the low half (DImode) of TImode operand 0
18216 LOW_IN1 represents the low half (DImode) of TImode operand 1
18217 LOW_IN2 represents the low half (DImode) of TImode operand 2
18218 HIGH_DEST represents the high half (DImode) of TImode operand 0
18219 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18220 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18224 aarch64_subvti_scratch_regs (rtx op1
, rtx op2
, rtx
*low_dest
,
18225 rtx
*low_in1
, rtx
*low_in2
,
18226 rtx
*high_dest
, rtx
*high_in1
,
18229 *low_dest
= gen_reg_rtx (DImode
);
18230 *low_in1
= simplify_gen_subreg (DImode
, op1
, TImode
,
18231 subreg_lowpart_offset (DImode
, TImode
));
18233 *low_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
18234 subreg_lowpart_offset (DImode
, TImode
));
18235 *high_dest
= gen_reg_rtx (DImode
);
18237 *high_in1
= simplify_gen_subreg (DImode
, op1
, TImode
,
18238 subreg_highpart_offset (DImode
, TImode
));
18239 *high_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
18240 subreg_highpart_offset (DImode
, TImode
));
18243 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
18245 OP0 represents the TImode destination operand 0
18246 LOW_DEST represents the low half (DImode) of TImode operand 0
18247 LOW_IN1 represents the low half (DImode) of TImode operand 1
18248 LOW_IN2 represents the low half (DImode) of TImode operand 2
18249 HIGH_DEST represents the high half (DImode) of TImode operand 0
18250 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18251 HIGH_IN2 represents the high half (DImode) of TImode operand 2
18252 UNSIGNED_P is true if the operation is being performed on unsigned
18255 aarch64_expand_subvti (rtx op0
, rtx low_dest
, rtx low_in1
,
18256 rtx low_in2
, rtx high_dest
, rtx high_in1
,
18257 rtx high_in2
, bool unsigned_p
)
18259 if (low_in2
== const0_rtx
)
18261 low_dest
= low_in1
;
18262 high_in2
= force_reg (DImode
, high_in2
);
18264 emit_insn (gen_subdi3_compare1 (high_dest
, high_in1
, high_in2
));
18266 emit_insn (gen_subvdi_insn (high_dest
, high_in1
, high_in2
));
18270 if (CONST_INT_P (low_in2
))
18272 high_in2
= force_reg (DImode
, high_in2
);
18273 emit_insn (gen_subdi3_compare1_imm (low_dest
, low_in1
, low_in2
,
18274 GEN_INT (-INTVAL (low_in2
))));
18277 emit_insn (gen_subdi3_compare1 (low_dest
, low_in1
, low_in2
));
18280 emit_insn (gen_usubdi3_carryinC (high_dest
, high_in1
, high_in2
));
18282 emit_insn (gen_subdi3_carryinV (high_dest
, high_in1
, high_in2
));
18285 emit_move_insn (gen_lowpart (DImode
, op0
), low_dest
);
18286 emit_move_insn (gen_highpart (DImode
, op0
), high_dest
);
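/* Illustrative sketch (excluded from the build): the double-word borrow
   chain implemented above (SUBS on the low halves, then a borrow-consuming
   subtract on the high halves), written with host integers.  The overflow
   flag computed by the real expansion is not modelled here.  */
#if 0
#include <stdint.h>

static void
subti_example (uint64_t a_lo, uint64_t a_hi, uint64_t b_lo, uint64_t b_hi,
	       uint64_t *res_lo, uint64_t *res_hi)
{
  uint64_t borrow = a_lo < b_lo;	/* Borrow produced by the low SUBS.  */
  *res_lo = a_lo - b_lo;
  *res_hi = a_hi - b_hi - borrow;	/* SBCS consumes the borrow.  */
}
#endif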
/* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */

static unsigned HOST_WIDE_INT
aarch64_asan_shadow_offset (void)
{
  if (TARGET_ILP32)
    return (HOST_WIDE_INT_1 << 29);
  else
    return (HOST_WIDE_INT_1 << 36);
}
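/* Illustrative sketch (excluded from the build): how AddressSanitizer uses
   the offset returned above.  With the usual shadow scale of 3, every
   8 bytes of application memory map to one shadow byte at
   (addr >> 3) + offset.  The scale value here is an assumption about the
   sanitizer runtime, not something this hook controls.  */
#if 0
#include <stdint.h>

static uintptr_t
asan_shadow_address_example (uintptr_t addr, uintptr_t shadow_offset)
{
  return (addr >> 3) + shadow_offset;	/* One shadow byte per 8-byte granule.  */
}
#endif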
18302 aarch64_gen_ccmp_first (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
,
18303 int code
, tree treeop0
, tree treeop1
)
18305 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
18307 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
18309 struct expand_operand ops
[4];
18312 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
18314 op_mode
= GET_MODE (op0
);
18315 if (op_mode
== VOIDmode
)
18316 op_mode
= GET_MODE (op1
);
18324 icode
= CODE_FOR_cmpsi
;
18329 icode
= CODE_FOR_cmpdi
;
18334 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
18335 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpesf
: CODE_FOR_fcmpsf
;
18340 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
18341 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpedf
: CODE_FOR_fcmpdf
;
18349 op0
= prepare_operand (icode
, op0
, 0, op_mode
, cmp_mode
, unsignedp
);
18350 op1
= prepare_operand (icode
, op1
, 1, op_mode
, cmp_mode
, unsignedp
);
18356 *prep_seq
= get_insns ();
18359 create_fixed_operand (&ops
[0], op0
);
18360 create_fixed_operand (&ops
[1], op1
);
18363 if (!maybe_expand_insn (icode
, 2, ops
))
18368 *gen_seq
= get_insns ();
18371 return gen_rtx_fmt_ee ((rtx_code
) code
, cc_mode
,
18372 gen_rtx_REG (cc_mode
, CC_REGNUM
), const0_rtx
);
18376 aarch64_gen_ccmp_next (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
, rtx prev
,
18377 int cmp_code
, tree treeop0
, tree treeop1
, int bit_code
)
18379 rtx op0
, op1
, target
;
18380 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
18381 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
18383 struct expand_operand ops
[6];
18386 push_to_sequence (*prep_seq
);
18387 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
18389 op_mode
= GET_MODE (op0
);
18390 if (op_mode
== VOIDmode
)
18391 op_mode
= GET_MODE (op1
);
18399 icode
= CODE_FOR_ccmpsi
;
18404 icode
= CODE_FOR_ccmpdi
;
18409 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
18410 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpesf
: CODE_FOR_fccmpsf
;
18415 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
18416 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpedf
: CODE_FOR_fccmpdf
;
18424 op0
= prepare_operand (icode
, op0
, 2, op_mode
, cmp_mode
, unsignedp
);
18425 op1
= prepare_operand (icode
, op1
, 3, op_mode
, cmp_mode
, unsignedp
);
18431 *prep_seq
= get_insns ();
18434 target
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
18435 aarch64_cond
= aarch64_get_condition_code_1 (cc_mode
, (rtx_code
) cmp_code
);
18437 if (bit_code
!= AND
)
18439 prev
= gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev
),
18440 GET_MODE (XEXP (prev
, 0))),
18441 VOIDmode
, XEXP (prev
, 0), const0_rtx
);
18442 aarch64_cond
= AARCH64_INVERSE_CONDITION_CODE (aarch64_cond
);
18445 create_fixed_operand (&ops
[0], XEXP (prev
, 0));
18446 create_fixed_operand (&ops
[1], target
);
18447 create_fixed_operand (&ops
[2], op0
);
18448 create_fixed_operand (&ops
[3], op1
);
18449 create_fixed_operand (&ops
[4], prev
);
18450 create_fixed_operand (&ops
[5], GEN_INT (aarch64_cond
));
18452 push_to_sequence (*gen_seq
);
18453 if (!maybe_expand_insn (icode
, 6, ops
))
18459 *gen_seq
= get_insns ();
18462 return gen_rtx_fmt_ee ((rtx_code
) cmp_code
, VOIDmode
, target
, const0_rtx
);
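/* Illustrative sketch (excluded from the build): the kind of source-level
   condition these two hooks turn into a conditional-compare chain.  With
   the hooks enabled, a conjunction such as the one below is typically
   evaluated without a branch between the two comparisons; the exact
   instruction sequence depends on tuning and is not guaranteed here.  */
#if 0
static int
ccmp_chain_example (int a, int b, long c, long d)
{
  /* Typically something like: cmp w0, w1; ccmp x2, x3, #<nzcv>, eq;
     cset w0, lt.  */
  return a == b && c < d;
}
#endif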
18465 #undef TARGET_GEN_CCMP_FIRST
18466 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
18468 #undef TARGET_GEN_CCMP_NEXT
18469 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
18471 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
18472 instruction fusion of some sort. */
18475 aarch64_macro_fusion_p (void)
18477 return aarch64_tune_params
.fusible_ops
!= AARCH64_FUSE_NOTHING
;
18481 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
18482 should be kept together during scheduling. */
18485 aarch_macro_fusion_pair_p (rtx_insn
*prev
, rtx_insn
*curr
)
18488 rtx prev_set
= single_set (prev
);
18489 rtx curr_set
= single_set (curr
);
18490 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
18491 bool simple_sets_p
= prev_set
&& curr_set
&& !any_condjump_p (curr
);
18493 if (!aarch64_macro_fusion_p ())
18496 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK
))
18498 /* We are trying to match:
18499 prev (mov) == (set (reg r0) (const_int imm16))
18500 curr (movk) == (set (zero_extract (reg r0)
18503 (const_int imm16_1)) */
18505 set_dest
= SET_DEST (curr_set
);
18507 if (GET_CODE (set_dest
) == ZERO_EXTRACT
18508 && CONST_INT_P (SET_SRC (curr_set
))
18509 && CONST_INT_P (SET_SRC (prev_set
))
18510 && CONST_INT_P (XEXP (set_dest
, 2))
18511 && INTVAL (XEXP (set_dest
, 2)) == 16
18512 && REG_P (XEXP (set_dest
, 0))
18513 && REG_P (SET_DEST (prev_set
))
18514 && REGNO (XEXP (set_dest
, 0)) == REGNO (SET_DEST (prev_set
)))
18520 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD
))
18523 /* We're trying to match:
18524 prev (adrp) == (set (reg r1)
18525 (high (symbol_ref ("SYM"))))
18526 curr (add) == (set (reg r0)
18528 (symbol_ref ("SYM"))))
18529 Note that r0 need not necessarily be the same as r1, especially
18530 during pre-regalloc scheduling. */
18532 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
18533 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
18535 if (GET_CODE (SET_SRC (curr_set
)) == LO_SUM
18536 && REG_P (XEXP (SET_SRC (curr_set
), 0))
18537 && REGNO (XEXP (SET_SRC (curr_set
), 0))
18538 == REGNO (SET_DEST (prev_set
))
18539 && rtx_equal_p (XEXP (SET_SRC (prev_set
), 0),
18540 XEXP (SET_SRC (curr_set
), 1)))
18545 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK
))
18548 /* We're trying to match:
18549 prev (movk) == (set (zero_extract (reg r0)
18552 (const_int imm16_1))
18553 curr (movk) == (set (zero_extract (reg r0)
18556 (const_int imm16_2)) */
18558 if (GET_CODE (SET_DEST (prev_set
)) == ZERO_EXTRACT
18559 && GET_CODE (SET_DEST (curr_set
)) == ZERO_EXTRACT
18560 && REG_P (XEXP (SET_DEST (prev_set
), 0))
18561 && REG_P (XEXP (SET_DEST (curr_set
), 0))
18562 && REGNO (XEXP (SET_DEST (prev_set
), 0))
18563 == REGNO (XEXP (SET_DEST (curr_set
), 0))
18564 && CONST_INT_P (XEXP (SET_DEST (prev_set
), 2))
18565 && CONST_INT_P (XEXP (SET_DEST (curr_set
), 2))
18566 && INTVAL (XEXP (SET_DEST (prev_set
), 2)) == 32
18567 && INTVAL (XEXP (SET_DEST (curr_set
), 2)) == 48
18568 && CONST_INT_P (SET_SRC (prev_set
))
18569 && CONST_INT_P (SET_SRC (curr_set
)))
18573 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR
))
18575 /* We're trying to match:
18576 prev (adrp) == (set (reg r0)
18577 (high (symbol_ref ("SYM"))))
18578 curr (ldr) == (set (reg r1)
18579 (mem (lo_sum (reg r0)
18580 (symbol_ref ("SYM")))))
18582 curr (ldr) == (set (reg r1)
18585 (symbol_ref ("SYM")))))) */
18586 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
18587 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
18589 rtx curr_src
= SET_SRC (curr_set
);
18591 if (GET_CODE (curr_src
) == ZERO_EXTEND
)
18592 curr_src
= XEXP (curr_src
, 0);
18594 if (MEM_P (curr_src
) && GET_CODE (XEXP (curr_src
, 0)) == LO_SUM
18595 && REG_P (XEXP (XEXP (curr_src
, 0), 0))
18596 && REGNO (XEXP (XEXP (curr_src
, 0), 0))
18597 == REGNO (SET_DEST (prev_set
))
18598 && rtx_equal_p (XEXP (XEXP (curr_src
, 0), 1),
18599 XEXP (SET_SRC (prev_set
), 0)))
18604 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH
)
18605 && any_condjump_p (curr
))
18607 unsigned int condreg1
, condreg2
;
18609 aarch64_fixed_condition_code_regs (&condreg1
, &condreg2
);
18610 cc_reg_1
= gen_rtx_REG (CCmode
, condreg1
);
18612 if (reg_referenced_p (cc_reg_1
, PATTERN (curr
))
18614 && modified_in_p (cc_reg_1
, prev
))
18616 enum attr_type prev_type
= get_attr_type (prev
);
	  /* FIXME: this misses some instructions that ThunderX considers
	     simple arithmetic.  Simple shifts are missed here.  */
18620 if (prev_type
== TYPE_ALUS_SREG
18621 || prev_type
== TYPE_ALUS_IMM
18622 || prev_type
== TYPE_LOGICS_REG
18623 || prev_type
== TYPE_LOGICS_IMM
)
18630 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH
)
18631 && any_condjump_p (curr
))
18633 /* We're trying to match:
18634 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
18635 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
18637 (label_ref ("SYM"))
18639 if (SET_DEST (curr_set
) == (pc_rtx
)
18640 && GET_CODE (SET_SRC (curr_set
)) == IF_THEN_ELSE
18641 && REG_P (XEXP (XEXP (SET_SRC (curr_set
), 0), 0))
18642 && REG_P (SET_DEST (prev_set
))
18643 && REGNO (SET_DEST (prev_set
))
18644 == REGNO (XEXP (XEXP (SET_SRC (curr_set
), 0), 0)))
18646 /* Fuse ALU operations followed by conditional branch instruction. */
18647 switch (get_attr_type (prev
))
18650 case TYPE_ALU_SREG
:
18653 case TYPE_ADCS_REG
:
18654 case TYPE_ADCS_IMM
:
18655 case TYPE_LOGIC_REG
:
18656 case TYPE_LOGIC_IMM
:
18660 case TYPE_SHIFT_REG
:
18661 case TYPE_SHIFT_IMM
:
/* Return true iff the instruction fusion described by OP is enabled.  */

static bool
aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
{
  return (aarch64_tune_params.fusible_ops & op) != 0;
}
18684 /* If MEM is in the form of [base+offset], extract the two parts
18685 of address and set to BASE and OFFSET, otherwise return false
18686 after clearing BASE and OFFSET. */
18689 extract_base_offset_in_addr (rtx mem
, rtx
*base
, rtx
*offset
)
18693 gcc_assert (MEM_P (mem
));
18695 addr
= XEXP (mem
, 0);
18700 *offset
= const0_rtx
;
18704 if (GET_CODE (addr
) == PLUS
18705 && REG_P (XEXP (addr
, 0)) && CONST_INT_P (XEXP (addr
, 1)))
18707 *base
= XEXP (addr
, 0);
18708 *offset
= XEXP (addr
, 1);
18713 *offset
= NULL_RTX
;
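
/* Usage sketch (illustrative only, not referenced elsewhere): feeding a MEM
   whose address is (plus (reg) (const_int 8)) through the helper above
   yields the REG as BASE and the CONST_INT as OFFSET, while a bare REG
   address yields the REG and const0_rtx.  */

bool
example_extract_base_offset (rtx mem)
{
  rtx base, offset;

  if (!extract_base_offset_in_addr (mem, &base, &offset))
    return false;

  /* On success both parts are available.  */
  return REG_P (base) && CONST_INT_P (offset);
}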
/* Types for scheduling fusion.  */
enum sched_fusion_type
{
  SCHED_FUSION_NONE = 0,
  SCHED_FUSION_LD_SIGN_EXTEND,
  SCHED_FUSION_LD_ZERO_EXTEND,
  SCHED_FUSION_LD,
  SCHED_FUSION_ST
};
18729 /* If INSN is a load or store of address in the form of [base+offset],
18730 extract the two parts and set to BASE and OFFSET. Return scheduling
18731 fusion type this INSN is. */
18733 static enum sched_fusion_type
18734 fusion_load_store (rtx_insn
*insn
, rtx
*base
, rtx
*offset
)
18737 enum sched_fusion_type fusion
= SCHED_FUSION_LD
;
18739 gcc_assert (INSN_P (insn
));
18740 x
= PATTERN (insn
);
18741 if (GET_CODE (x
) != SET
)
18742 return SCHED_FUSION_NONE
;
18745 dest
= SET_DEST (x
);
18747 machine_mode dest_mode
= GET_MODE (dest
);
18749 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode
))
18750 return SCHED_FUSION_NONE
;
18752 if (GET_CODE (src
) == SIGN_EXTEND
)
18754 fusion
= SCHED_FUSION_LD_SIGN_EXTEND
;
18755 src
= XEXP (src
, 0);
18756 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
18757 return SCHED_FUSION_NONE
;
18759 else if (GET_CODE (src
) == ZERO_EXTEND
)
18761 fusion
= SCHED_FUSION_LD_ZERO_EXTEND
;
18762 src
= XEXP (src
, 0);
18763 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
18764 return SCHED_FUSION_NONE
;
18767 if (GET_CODE (src
) == MEM
&& REG_P (dest
))
18768 extract_base_offset_in_addr (src
, base
, offset
);
18769 else if (GET_CODE (dest
) == MEM
&& (REG_P (src
) || src
== const0_rtx
))
18771 fusion
= SCHED_FUSION_ST
;
18772 extract_base_offset_in_addr (dest
, base
, offset
);
18775 return SCHED_FUSION_NONE
;
18777 if (*base
== NULL_RTX
|| *offset
== NULL_RTX
)
18778 fusion
= SCHED_FUSION_NONE
;
/* Implement the TARGET_SCHED_FUSION_PRIORITY hook.

   Currently we only support fusing ldr or str instructions, so FUSION_PRI
   and PRI are only calculated for these instructions.  For other
   instructions, FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the
   future, fusion of other instruction types can be added by returning
   different priorities.

   It's important that irrelevant instructions get the largest FUSION_PRI.  */
18793 aarch64_sched_fusion_priority (rtx_insn
*insn
, int max_pri
,
18794 int *fusion_pri
, int *pri
)
18798 enum sched_fusion_type fusion
;
18800 gcc_assert (INSN_P (insn
));
18803 fusion
= fusion_load_store (insn
, &base
, &offset
);
18804 if (fusion
== SCHED_FUSION_NONE
)
18811 /* Set FUSION_PRI according to fusion type and base register. */
18812 *fusion_pri
= tmp
- fusion
* FIRST_PSEUDO_REGISTER
- REGNO (base
);
18814 /* Calculate PRI. */
18817 /* INSN with smaller offset goes first. */
18818 off_val
= (int)(INTVAL (offset
));
18820 tmp
-= (off_val
& 0xfffff);
18822 tmp
+= ((- off_val
) & 0xfffff);
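
/* A minimal sketch of the priority scheme used above, assuming the running
   value has been initialised from MAX_PRI (that bookkeeping is elided
   here); all names are illustrative.  Loads or stores with the same fusion
   type and base register fall into the same FUSION_PRI bucket, and within
   a bucket the offset is used to order them, so candidates for ldp/stp end
   up adjacent in the ready list.  */

int
example_fusion_bucket (int max_pri, int fusion_type, int base_regno)
{
  /* One bucket per (fusion type, base register) pair.  */
  return max_pri - fusion_type * FIRST_PSEUDO_REGISTER - base_regno;
}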
18828 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
18829 Adjust priority of sha1h instructions so they are scheduled before
18830 other SHA1 instructions. */
18833 aarch64_sched_adjust_priority (rtx_insn
*insn
, int priority
)
18835 rtx x
= PATTERN (insn
);
18837 if (GET_CODE (x
) == SET
)
18841 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_SHA1H
)
18842 return priority
+ 10;
18848 /* Given OPERANDS of consecutive load/store, check if we can merge
18849 them into ldp/stp. LOAD is true if they are load instructions.
18850 MODE is the mode of memory operands. */
18853 aarch64_operands_ok_for_ldpstp (rtx
*operands
, bool load
,
18856 HOST_WIDE_INT offval_1
, offval_2
, msize
;
18857 enum reg_class rclass_1
, rclass_2
;
18858 rtx mem_1
, mem_2
, reg_1
, reg_2
, base_1
, base_2
, offset_1
, offset_2
;
18862 mem_1
= operands
[1];
18863 mem_2
= operands
[3];
18864 reg_1
= operands
[0];
18865 reg_2
= operands
[2];
18866 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
));
18867 if (REGNO (reg_1
) == REGNO (reg_2
))
18872 mem_1
= operands
[0];
18873 mem_2
= operands
[2];
18874 reg_1
= operands
[1];
18875 reg_2
= operands
[3];
18878 /* The mems cannot be volatile. */
18879 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
))
  /* If we have SImode and slow unaligned ldp,
     check the alignment to be at least 8 bytes.  */
18885 && (aarch64_tune_params
.extra_tuning_flags
18886 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
18888 && MEM_ALIGN (mem_1
) < 8 * BITS_PER_UNIT
)
18891 /* Check if the addresses are in the form of [base+offset]. */
18892 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
18893 if (base_1
== NULL_RTX
|| offset_1
== NULL_RTX
)
18895 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
18896 if (base_2
== NULL_RTX
|| offset_2
== NULL_RTX
)
18899 /* Check if the bases are same. */
18900 if (!rtx_equal_p (base_1
, base_2
))
18903 /* The operands must be of the same size. */
18904 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1
)),
18905 GET_MODE_SIZE (GET_MODE (mem_2
))));
18907 offval_1
= INTVAL (offset_1
);
18908 offval_2
= INTVAL (offset_2
);
18909 /* We should only be trying this for fixed-sized modes. There is no
18910 SVE LDP/STP instruction. */
18911 msize
= GET_MODE_SIZE (mode
).to_constant ();
18912 /* Check if the offsets are consecutive. */
18913 if (offval_1
!= (offval_2
+ msize
) && offval_2
!= (offval_1
+ msize
))
18916 /* Check if the addresses are clobbered by load. */
18919 if (reg_mentioned_p (reg_1
, mem_1
))
18922 /* In increasing order, the last load can clobber the address. */
18923 if (offval_1
> offval_2
&& reg_mentioned_p (reg_2
, mem_2
))
18927 /* One of the memory accesses must be a mempair operand.
18928 If it is not the first one, they need to be swapped by the
18930 if (!aarch64_mem_pair_operand (mem_1
, GET_MODE (mem_1
))
18931 && !aarch64_mem_pair_operand (mem_2
, GET_MODE (mem_2
)))
18934 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
18935 rclass_1
= FP_REGS
;
18937 rclass_1
= GENERAL_REGS
;
18939 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
18940 rclass_2
= FP_REGS
;
18942 rclass_2
= GENERAL_REGS
;
18944 /* Check if the registers are of same class. */
18945 if (rclass_1
!= rclass_2
)
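
/* Illustrative only (hypothetical function): a source-level pattern whose
   two adjacent loads satisfy the checks above - same base register,
   consecutive offsets, non-volatile, matching register classes - and can
   therefore be rewritten as a single LDP by the ldp/stp peepholes.  */

void
example_ldp_candidate (long *p, long *a, long *b)
{
  *a = p[0];
  *b = p[1];
}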
18951 /* Given OPERANDS of consecutive load/store that can be merged,
18952 swap them if they are not in ascending order. */
18954 aarch64_swap_ldrstr_operands (rtx
* operands
, bool load
)
18956 rtx mem_1
, mem_2
, base_1
, base_2
, offset_1
, offset_2
;
18957 HOST_WIDE_INT offval_1
, offval_2
;
18961 mem_1
= operands
[1];
18962 mem_2
= operands
[3];
18966 mem_1
= operands
[0];
18967 mem_2
= operands
[2];
18970 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
18971 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
18973 offval_1
= INTVAL (offset_1
);
18974 offval_2
= INTVAL (offset_2
);
18976 if (offval_1
> offval_2
)
18978 /* Irrespective of whether this is a load or a store,
18979 we do the same swap. */
18980 std::swap (operands
[0], operands
[2]);
18981 std::swap (operands
[1], operands
[3]);
/* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
   comparison between the two.  */

static int
aarch64_host_wide_int_compare (const void *x, const void *y)
{
  return wi::cmps (* ((const HOST_WIDE_INT *) x),
		   * ((const HOST_WIDE_INT *) y));
}
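
/* Usage sketch (illustrative only): the comparator above is written for
   qsort, as used later when checking whether four load/store offsets can
   be reordered into two LDP/STP pairs.  The array contents here are made
   up.  */

void
example_sort_offsets (void)
{
  HOST_WIDE_INT offs[4] = { 24, 8, 16, 0 };

  /* Sorts the offsets into ascending order: 0, 8, 16, 24.  */
  qsort (offs, 4, sizeof (HOST_WIDE_INT), aarch64_host_wide_int_compare);
}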
/* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
   other pointing to a REG rtx containing an offset, compare the offsets
   of the two and return:

     1 iff offset (X) > offset (Y)
     0 iff offset (X) == offset (Y)
    -1 iff offset (X) < offset (Y)  */

static int
aarch64_ldrstr_offset_compare (const void *x, const void *y)
{
  const rtx * operands_1 = (const rtx *) x;
  const rtx * operands_2 = (const rtx *) y;
  rtx mem_1, mem_2, base, offset_1, offset_2;

  if (MEM_P (operands_1[0]))
    mem_1 = operands_1[0];
  else
    mem_1 = operands_1[1];

  if (MEM_P (operands_2[0]))
    mem_2 = operands_2[0];
  else
    mem_2 = operands_2[1];

  /* Extract the offsets.  */
  extract_base_offset_in_addr (mem_1, &base, &offset_1);
  extract_base_offset_in_addr (mem_2, &base, &offset_2);

  gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);

  return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
}
19029 /* Given OPERANDS of consecutive load/store, check if we can merge
19030 them into ldp/stp by adjusting the offset. LOAD is true if they
19031 are load instructions. MODE is the mode of memory operands.
19033 Given below consecutive stores:
19035 str w1, [xb, 0x100]
19036 str w1, [xb, 0x104]
19037 str w1, [xb, 0x108]
19038 str w1, [xb, 0x10c]
19040 Though the offsets are out of the range supported by stp, we can
19041 still pair them after adjusting the offset, like:
19043 add scratch, xb, 0x100
19044 stp w1, w1, [scratch]
19045 stp w1, w1, [scratch, 0x8]
   The peephole patterns detecting this opportunity should guarantee
   the scratch register is available.  */
19051 aarch64_operands_adjust_ok_for_ldpstp (rtx
*operands
, bool load
,
19054 const int num_insns
= 4;
19055 enum reg_class rclass
;
19056 HOST_WIDE_INT offvals
[num_insns
], msize
;
19057 rtx mem
[num_insns
], reg
[num_insns
], base
[num_insns
], offset
[num_insns
];
19061 for (int i
= 0; i
< num_insns
; i
++)
19063 reg
[i
] = operands
[2 * i
];
19064 mem
[i
] = operands
[2 * i
+ 1];
19066 gcc_assert (REG_P (reg
[i
]));
19069 /* Do not attempt to merge the loads if the loads clobber each other. */
19070 for (int i
= 0; i
< 8; i
+= 2)
19071 for (int j
= i
+ 2; j
< 8; j
+= 2)
19072 if (reg_overlap_mentioned_p (operands
[i
], operands
[j
]))
19076 for (int i
= 0; i
< num_insns
; i
++)
19078 mem
[i
] = operands
[2 * i
];
19079 reg
[i
] = operands
[2 * i
+ 1];
19082 /* Skip if memory operand is by itself valid for ldp/stp. */
19083 if (!MEM_P (mem
[0]) || aarch64_mem_pair_operand (mem
[0], mode
))
19086 for (int i
= 0; i
< num_insns
; i
++)
19088 /* The mems cannot be volatile. */
19089 if (MEM_VOLATILE_P (mem
[i
]))
19092 /* Check if the addresses are in the form of [base+offset]. */
19093 extract_base_offset_in_addr (mem
[i
], base
+ i
, offset
+ i
);
19094 if (base
[i
] == NULL_RTX
|| offset
[i
] == NULL_RTX
)
19098 /* Check if the registers are of same class. */
19099 rclass
= REG_P (reg
[0]) && FP_REGNUM_P (REGNO (reg
[0]))
19100 ? FP_REGS
: GENERAL_REGS
;
19102 for (int i
= 1; i
< num_insns
; i
++)
19103 if (REG_P (reg
[i
]) && FP_REGNUM_P (REGNO (reg
[i
])))
19105 if (rclass
!= FP_REGS
)
19110 if (rclass
!= GENERAL_REGS
)
19114 /* Only the last register in the order in which they occur
19115 may be clobbered by the load. */
19116 if (rclass
== GENERAL_REGS
&& load
)
19117 for (int i
= 0; i
< num_insns
- 1; i
++)
19118 if (reg_mentioned_p (reg
[i
], mem
[i
]))
19121 /* Check if the bases are same. */
19122 for (int i
= 0; i
< num_insns
- 1; i
++)
19123 if (!rtx_equal_p (base
[i
], base
[i
+ 1]))
19126 for (int i
= 0; i
< num_insns
; i
++)
19127 offvals
[i
] = INTVAL (offset
[i
]);
19129 msize
= GET_MODE_SIZE (mode
);
19131 /* Check if the offsets can be put in the right order to do a ldp/stp. */
19132 qsort (offvals
, num_insns
, sizeof (HOST_WIDE_INT
),
19133 aarch64_host_wide_int_compare
);
19135 if (!(offvals
[1] == offvals
[0] + msize
19136 && offvals
[3] == offvals
[2] + msize
))
19139 /* Check that offsets are within range of each other. The ldp/stp
19140 instructions have 7 bit immediate offsets, so use 0x80. */
19141 if (offvals
[2] - offvals
[0] >= msize
* 0x80)
19144 /* The offsets must be aligned with respect to each other. */
19145 if (offvals
[0] % msize
!= offvals
[2] % msize
)
  /* If we have SImode and slow unaligned ldp,
     check the alignment to be at least 8 bytes.  */
19151 && (aarch64_tune_params
.extra_tuning_flags
19152 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
19154 && MEM_ALIGN (mem
[0]) < 8 * BITS_PER_UNIT
)
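
/* Illustrative only (hypothetical function and offsets): a run of four word
   stores like the str sequence in the comment above.  At byte offsets of
   0x100 and up the stores fall outside the signed 7-bit scaled STP
   immediate range, so the peephole that calls the function above rewrites
   them around a scratch base register as two STP instructions.  */

void
example_stp_adjust_candidate (unsigned int *xb, unsigned int w1)
{
  xb[64] = w1;
  xb[65] = w1;
  xb[66] = w1;
  xb[67] = w1;
}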
19160 /* Given OPERANDS of consecutive load/store, this function pairs them
19161 into LDP/STP after adjusting the offset. It depends on the fact
19162 that the operands can be sorted so the offsets are correct for STP.
19163 MODE is the mode of memory operands. CODE is the rtl operator
19164 which should be applied to all memory operands, it's SIGN_EXTEND,
19165 ZERO_EXTEND or UNKNOWN. */
19168 aarch64_gen_adjusted_ldpstp (rtx
*operands
, bool load
,
19169 scalar_mode mode
, RTX_CODE code
)
19171 rtx base
, offset_1
, offset_3
, t1
, t2
;
19172 rtx mem_1
, mem_2
, mem_3
, mem_4
;
19173 rtx temp_operands
[8];
19174 HOST_WIDE_INT off_val_1
, off_val_3
, base_off
, new_off_1
, new_off_3
,
19175 stp_off_upper_limit
, stp_off_lower_limit
, msize
;
19177 /* We make changes on a copy as we may still bail out. */
19178 for (int i
= 0; i
< 8; i
++)
19179 temp_operands
[i
] = operands
[i
];
19181 /* Sort the operands. */
19182 qsort (temp_operands
, 4, 2 * sizeof (rtx
*), aarch64_ldrstr_offset_compare
);
19184 /* Copy the memory operands so that if we have to bail for some
19185 reason the original addresses are unchanged. */
19188 mem_1
= copy_rtx (temp_operands
[1]);
19189 mem_2
= copy_rtx (temp_operands
[3]);
19190 mem_3
= copy_rtx (temp_operands
[5]);
19191 mem_4
= copy_rtx (temp_operands
[7]);
19195 mem_1
= copy_rtx (temp_operands
[0]);
19196 mem_2
= copy_rtx (temp_operands
[2]);
19197 mem_3
= copy_rtx (temp_operands
[4]);
19198 mem_4
= copy_rtx (temp_operands
[6]);
19199 gcc_assert (code
== UNKNOWN
);
19202 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
19203 extract_base_offset_in_addr (mem_3
, &base
, &offset_3
);
19204 gcc_assert (base
!= NULL_RTX
&& offset_1
!= NULL_RTX
19205 && offset_3
!= NULL_RTX
);
19207 /* Adjust offset so it can fit in LDP/STP instruction. */
19208 msize
= GET_MODE_SIZE (mode
);
19209 stp_off_upper_limit
= msize
* (0x40 - 1);
19210 stp_off_lower_limit
= - msize
* 0x40;
19212 off_val_1
= INTVAL (offset_1
);
19213 off_val_3
= INTVAL (offset_3
);
19215 /* The base offset is optimally half way between the two STP/LDP offsets. */
19217 base_off
= (off_val_1
+ off_val_3
) / 2;
  /* However, due to issues with negative LDP/STP offset generation for
     larger modes (DF, DI and vector modes), we must not use negative
     addresses smaller than 9 signed unadjusted bits can store.  This
     provides the most range in this case.  */
19223 base_off
= off_val_1
;
19225 /* Adjust the base so that it is aligned with the addresses but still
19227 if (base_off
% msize
!= off_val_1
% msize
)
19228 /* Fix the offset, bearing in mind we want to make it bigger not
19230 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
19231 else if (msize
<= 4)
19232 /* The negative range of LDP/STP is one larger than the positive range. */
19235 /* Check if base offset is too big or too small. We can attempt to resolve
19236 this issue by setting it to the maximum value and seeing if the offsets
19238 if (base_off
>= 0x1000)
19240 base_off
= 0x1000 - 1;
      /* We must still make sure that the base offset is aligned with respect
	 to the address.  But it may not be made any bigger.  */
19243 base_off
-= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
19246 /* Likewise for the case where the base is too small. */
19247 if (base_off
<= -0x1000)
19249 base_off
= -0x1000 + 1;
19250 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
19253 /* Offset of the first STP/LDP. */
19254 new_off_1
= off_val_1
- base_off
;
19256 /* Offset of the second STP/LDP. */
19257 new_off_3
= off_val_3
- base_off
;
19259 /* The offsets must be within the range of the LDP/STP instructions. */
19260 if (new_off_1
> stp_off_upper_limit
|| new_off_1
< stp_off_lower_limit
19261 || new_off_3
> stp_off_upper_limit
|| new_off_3
< stp_off_lower_limit
)
19264 replace_equiv_address_nv (mem_1
, plus_constant (Pmode
, operands
[8],
19266 replace_equiv_address_nv (mem_2
, plus_constant (Pmode
, operands
[8],
19267 new_off_1
+ msize
), true);
19268 replace_equiv_address_nv (mem_3
, plus_constant (Pmode
, operands
[8],
19270 replace_equiv_address_nv (mem_4
, plus_constant (Pmode
, operands
[8],
19271 new_off_3
+ msize
), true);
19273 if (!aarch64_mem_pair_operand (mem_1
, mode
)
19274 || !aarch64_mem_pair_operand (mem_3
, mode
))
19277 if (code
== ZERO_EXTEND
)
19279 mem_1
= gen_rtx_ZERO_EXTEND (DImode
, mem_1
);
19280 mem_2
= gen_rtx_ZERO_EXTEND (DImode
, mem_2
);
19281 mem_3
= gen_rtx_ZERO_EXTEND (DImode
, mem_3
);
19282 mem_4
= gen_rtx_ZERO_EXTEND (DImode
, mem_4
);
19284 else if (code
== SIGN_EXTEND
)
19286 mem_1
= gen_rtx_SIGN_EXTEND (DImode
, mem_1
);
19287 mem_2
= gen_rtx_SIGN_EXTEND (DImode
, mem_2
);
19288 mem_3
= gen_rtx_SIGN_EXTEND (DImode
, mem_3
);
19289 mem_4
= gen_rtx_SIGN_EXTEND (DImode
, mem_4
);
19294 operands
[0] = temp_operands
[0];
19295 operands
[1] = mem_1
;
19296 operands
[2] = temp_operands
[2];
19297 operands
[3] = mem_2
;
19298 operands
[4] = temp_operands
[4];
19299 operands
[5] = mem_3
;
19300 operands
[6] = temp_operands
[6];
19301 operands
[7] = mem_4
;
19305 operands
[0] = mem_1
;
19306 operands
[1] = temp_operands
[1];
19307 operands
[2] = mem_2
;
19308 operands
[3] = temp_operands
[3];
19309 operands
[4] = mem_3
;
19310 operands
[5] = temp_operands
[5];
19311 operands
[6] = mem_4
;
19312 operands
[7] = temp_operands
[7];
19315 /* Emit adjusting instruction. */
19316 emit_insn (gen_rtx_SET (operands
[8], plus_constant (DImode
, base
, base_off
)));
19317 /* Emit ldp/stp instructions. */
19318 t1
= gen_rtx_SET (operands
[0], operands
[1]);
19319 t2
= gen_rtx_SET (operands
[2], operands
[3]);
19320 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
19321 t1
= gen_rtx_SET (operands
[4], operands
[5]);
19322 t2
= gen_rtx_SET (operands
[6], operands
[7]);
19323 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
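
/* A standalone sketch of the base-offset alignment and clamping arithmetic
   applied above, with none of the RTL bookkeeping; the helper name and
   parameters are illustrative.  BASE_OFF is the candidate new base,
   OFF_VAL_1 the lowest original offset and MSIZE the access size; the
   result is a base the ADD immediate can materialise while keeping the
   rebased offsets aligned with the original accesses.  */

HOST_WIDE_INT
example_ldpstp_rebase (HOST_WIDE_INT base_off, HOST_WIDE_INT off_val_1,
		       HOST_WIDE_INT msize)
{
  /* Keep the base aligned with the accesses, only ever moving it up.  */
  if (base_off % msize != off_val_1 % msize)
    base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;

  /* Clamp to the range of a single ADD immediate, re-aligning downwards at
     the upper bound and upwards at the lower bound.  */
  if (base_off >= 0x1000)
    {
      base_off = 0x1000 - 1;
      base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
    }
  else if (base_off <= -0x1000)
    {
      base_off = -0x1000 + 1;
      base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
    }

  return base_off;
}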
19327 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
19328 it isn't worth branching around empty masked ops (including masked
19332 aarch64_empty_mask_is_expensive (unsigned)
/* Return 1 if pseudo register should be created and used to hold
   GOT address for PIC code.  */

static bool
aarch64_use_pseudo_pic_reg (void)
{
  return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
}
19346 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
19349 aarch64_unspec_may_trap_p (const_rtx x
, unsigned flags
)
19351 switch (XINT (x
, 1))
19353 case UNSPEC_GOTSMALLPIC
:
19354 case UNSPEC_GOTSMALLPIC28K
:
19355 case UNSPEC_GOTTINYPIC
:
19361 return default_unspec_may_trap_p (x
, flags
);
/* If X is a positive CONST_DOUBLE with a value that is a power of 2
   return the log2 of that value.  Otherwise return -1.  */

int
aarch64_fpconst_pow_of_2 (rtx x)
{
  const REAL_VALUE_TYPE *r;

  if (!CONST_DOUBLE_P (x))
    return -1;

  r = CONST_DOUBLE_REAL_VALUE (x);

  if (REAL_VALUE_NEGATIVE (*r)
      || REAL_VALUE_ISNAN (*r)
      || REAL_VALUE_ISINF (*r)
      || !real_isinteger (r, DFmode))
    return -1;

  return exact_log2 (real_to_integer (r));
}
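
/* A host-side analogue of the check above (illustrative only, operating on
   a plain double rather than a CONST_DOUBLE rtx): returns the log2 of a
   positive integral power of two and -1 otherwise.  */

int
example_fpconst_pow_of_2 (double d)
{
  /* Reject non-positive values and anything too large to convert safely.  */
  if (d <= 0.0 || d >= (double) (1LL << 62))
    return -1;

  long long v = (long long) d;

  /* Must be an exact integer with exactly one bit set.  */
  if ((double) v != d || (v & (v - 1)) != 0)
    return -1;

  return __builtin_ctzll (v);
}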
19387 /* If X is a vector of equal CONST_DOUBLE values and that value is
19388 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
19391 aarch64_vec_fpconst_pow_of_2 (rtx x
)
19394 if (GET_CODE (x
) != CONST_VECTOR
19395 || !CONST_VECTOR_NUNITS (x
).is_constant (&nelts
))
19398 if (GET_MODE_CLASS (GET_MODE (x
)) != MODE_VECTOR_FLOAT
)
19401 int firstval
= aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, 0));
19405 for (int i
= 1; i
< nelts
; i
++)
19406 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, i
)) != firstval
)
/* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
   to float.

   __fp16 always promotes through this hook.
   _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
   through the generic excess precision logic rather than here.  */

static tree
aarch64_promoted_type (const_tree t)
{
  if (SCALAR_FLOAT_TYPE_P (t)
      && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
    return float_type_node;

  return NULL_TREE;
}
19429 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
19432 aarch64_optab_supported_p (int op
, machine_mode mode1
, machine_mode
,
19433 optimization_type opt_type
)
19438 return opt_type
== OPTIMIZE_FOR_SPEED
&& use_rsqrt_p (mode1
);
19445 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
19447 static unsigned int
19448 aarch64_dwarf_poly_indeterminate_value (unsigned int i
, unsigned int *factor
,
19451 /* Polynomial invariant 1 == (VG / 2) - 1. */
19452 gcc_assert (i
== 1);
19455 return AARCH64_DWARF_VG
;
/* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
   if MODE is HFmode, and punt to the generic implementation otherwise.  */

static bool
aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
{
  return (mode == HFmode
	  ? true
	  : default_libgcc_floating_mode_supported_p (mode));
}

/* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
   if MODE is HFmode, and punt to the generic implementation otherwise.  */

static bool
aarch64_scalar_mode_supported_p (scalar_mode mode)
{
  return (mode == HFmode
	  ? true
	  : default_scalar_mode_supported_p (mode));
}
19480 /* Set the value of FLT_EVAL_METHOD.
19481 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
19483 0: evaluate all operations and constants, whose semantic type has at
19484 most the range and precision of type float, to the range and
19485 precision of float; evaluate all other operations and constants to
19486 the range and precision of the semantic type;
19488 N, where _FloatN is a supported interchange floating type
19489 evaluate all operations and constants, whose semantic type has at
19490 most the range and precision of _FloatN type, to the range and
19491 precision of the _FloatN type; evaluate all other operations and
19492 constants to the range and precision of the semantic type;
19494 If we have the ARMv8.2-A extensions then we support _Float16 in native
19495 precision, so we should set this to 16. Otherwise, we support the type,
19496 but want to evaluate expressions in float precision, so set this to
19499 static enum flt_eval_method
19500 aarch64_excess_precision (enum excess_precision_type type
)
19504 case EXCESS_PRECISION_TYPE_FAST
:
19505 case EXCESS_PRECISION_TYPE_STANDARD
:
19506 /* We can calculate either in 16-bit range and precision or
19507 32-bit range and precision. Make that decision based on whether
19508 we have native support for the ARMv8.2-A 16-bit floating-point
19509 instructions or not. */
19510 return (TARGET_FP_F16INST
19511 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
19512 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
);
19513 case EXCESS_PRECISION_TYPE_IMPLICIT
:
19514 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
;
19516 gcc_unreachable ();
19518 return FLT_EVAL_METHOD_UNPREDICTABLE
;
19521 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
19522 scheduled for speculative execution. Reject the long-running division
19523 and square-root instructions. */
19526 aarch64_sched_can_speculate_insn (rtx_insn
*insn
)
19528 switch (get_attr_type (insn
))
19536 case TYPE_NEON_FP_SQRT_S
:
19537 case TYPE_NEON_FP_SQRT_D
:
19538 case TYPE_NEON_FP_SQRT_S_Q
:
19539 case TYPE_NEON_FP_SQRT_D_Q
:
19540 case TYPE_NEON_FP_DIV_S
:
19541 case TYPE_NEON_FP_DIV_D
:
19542 case TYPE_NEON_FP_DIV_S_Q
:
19543 case TYPE_NEON_FP_DIV_D_Q
:
19550 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
19553 aarch64_compute_pressure_classes (reg_class
*classes
)
19556 classes
[i
++] = GENERAL_REGS
;
19557 classes
[i
++] = FP_REGS
;
19558 /* PR_REGS isn't a useful pressure class because many predicate pseudo
19559 registers need to go in PR_LO_REGS at some point during their
19560 lifetime. Splitting it into two halves has the effect of making
19561 all predicates count against PR_LO_REGS, so that we try whenever
19562 possible to restrict the number of live predicates to 8. This
19563 greatly reduces the amount of spilling in certain loops. */
19564 classes
[i
++] = PR_LO_REGS
;
19565 classes
[i
++] = PR_HI_REGS
;
19569 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
19572 aarch64_can_change_mode_class (machine_mode from
,
19573 machine_mode to
, reg_class_t
)
19575 if (BYTES_BIG_ENDIAN
)
19577 bool from_sve_p
= aarch64_sve_data_mode_p (from
);
19578 bool to_sve_p
= aarch64_sve_data_mode_p (to
);
19580 /* Don't allow changes between SVE data modes and non-SVE modes.
19581 See the comment at the head of aarch64-sve.md for details. */
19582 if (from_sve_p
!= to_sve_p
)
19585 /* Don't allow changes in element size: lane 0 of the new vector
19586 would not then be lane 0 of the old vector. See the comment
19587 above aarch64_maybe_expand_sve_subreg_move for a more detailed
19590 In the worst case, this forces a register to be spilled in
19591 one mode and reloaded in the other, which handles the
19592 endianness correctly. */
19593 if (from_sve_p
&& GET_MODE_UNIT_SIZE (from
) != GET_MODE_UNIT_SIZE (to
))
19599 /* Implement TARGET_EARLY_REMAT_MODES. */
19602 aarch64_select_early_remat_modes (sbitmap modes
)
19604 /* SVE values are not normally live across a call, so it should be
19605 worth doing early rematerialization even in VL-specific mode. */
19606 for (int i
= 0; i
< NUM_MACHINE_MODES
; ++i
)
19608 machine_mode mode
= (machine_mode
) i
;
19609 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
19610 if (vec_flags
& VEC_ANY_SVE
)
19611 bitmap_set_bit (modes
, i
);
19615 /* Override the default target speculation_safe_value. */
19617 aarch64_speculation_safe_value (machine_mode mode
,
19618 rtx result
, rtx val
, rtx failval
)
  /* Maybe we should warn if falling back to hard barriers.  They are
     likely to be noticeably more expensive than the alternative below.  */
19622 if (!aarch64_track_speculation
)
19623 return default_speculation_safe_value (mode
, result
, val
, failval
);
19626 val
= copy_to_mode_reg (mode
, val
);
19628 if (!aarch64_reg_or_zero (failval
, mode
))
19629 failval
= copy_to_mode_reg (mode
, failval
);
19631 emit_insn (gen_despeculate_copy (mode
, result
, val
, failval
));
19635 /* Implement TARGET_ESTIMATED_POLY_VALUE.
19636 Look into the tuning structure for an estimate.
19637 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
19638 Advanced SIMD 128 bits. */
19640 static HOST_WIDE_INT
19641 aarch64_estimated_poly_value (poly_int64 val
)
19643 enum aarch64_sve_vector_bits_enum width_source
19644 = aarch64_tune_params
.sve_width
;
19646 /* If we still don't have an estimate, use the default. */
19647 if (width_source
== SVE_SCALABLE
)
19648 return default_estimated_poly_value (val
);
19650 HOST_WIDE_INT over_128
= width_source
- 128;
19651 return val
.coeffs
[0] + val
.coeffs
[1] * over_128
/ 128;
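
/* Worked example (illustrative only): with a tuning that sets sve_width to
   256, a poly_int of 2 + 2x - e.g. the number of 64-bit doublewords in an
   SVE vector, where x counts 128-bit chunks beyond the first - is estimated
   as 2 + 2 * (256 - 128) / 128 == 4, i.e. the hook guesses a 256-bit vector
   even though the generated code stays VL-agnostic.  */

HOST_WIDE_INT
example_estimated_poly_value (void)
{
  poly_int64 val (2, 2);
  HOST_WIDE_INT over_128 = 256 - 128;

  return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
}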
/* Return true for types that could be supported as SIMD return or
   argument types.  */

static bool
supported_simd_type (tree t)
{
  if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
    {
      HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
      return s == 1 || s == 2 || s == 4 || s == 8;
    }
  return false;
}
/* Return true for types that currently are supported as SIMD return
   or argument types.  */

static bool
currently_supported_simd_type (tree t, tree b)
{
  if (COMPLEX_FLOAT_TYPE_P (t))
    return false;

  if (TYPE_SIZE (t) != TYPE_SIZE (b))
    return false;

  return supported_simd_type (t);
}
19684 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
19687 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node
*node
,
19688 struct cgraph_simd_clone
*clonei
,
19689 tree base_type
, int num
)
19691 tree t
, ret_type
, arg_type
;
19692 unsigned int elt_bits
, vec_bits
, count
;
19697 if (clonei
->simdlen
19698 && (clonei
->simdlen
< 2
19699 || clonei
->simdlen
> 1024
19700 || (clonei
->simdlen
& (clonei
->simdlen
- 1)) != 0))
19702 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
19703 "unsupported simdlen %d", clonei
->simdlen
);
19707 ret_type
= TREE_TYPE (TREE_TYPE (node
->decl
));
19708 if (TREE_CODE (ret_type
) != VOID_TYPE
19709 && !currently_supported_simd_type (ret_type
, base_type
))
19711 if (TYPE_SIZE (ret_type
) != TYPE_SIZE (base_type
))
19712 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
19713 "GCC does not currently support mixed size types "
19714 "for %<simd%> functions");
19715 else if (supported_simd_type (ret_type
))
19716 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
19717 "GCC does not currently support return type %qT "
19718 "for %<simd%> functions", ret_type
);
19720 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
19721 "unsupported return type %qT for %<simd%> functions",
19726 for (t
= DECL_ARGUMENTS (node
->decl
); t
; t
= DECL_CHAIN (t
))
19728 arg_type
= TREE_TYPE (t
);
19730 if (!currently_supported_simd_type (arg_type
, base_type
))
19732 if (TYPE_SIZE (arg_type
) != TYPE_SIZE (base_type
))
19733 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
19734 "GCC does not currently support mixed size types "
19735 "for %<simd%> functions");
19737 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
19738 "GCC does not currently support argument type %qT "
19739 "for %<simd%> functions", arg_type
);
19744 clonei
->vecsize_mangle
= 'n';
19745 clonei
->mask_mode
= VOIDmode
;
19746 elt_bits
= GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type
));
19747 if (clonei
->simdlen
== 0)
19750 vec_bits
= (num
== 0 ? 64 : 128);
19751 clonei
->simdlen
= vec_bits
/ elt_bits
;
19756 vec_bits
= clonei
->simdlen
* elt_bits
;
19757 if (vec_bits
!= 64 && vec_bits
!= 128)
19759 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
19760 "GCC does not currently support simdlen %d for type %qT",
19761 clonei
->simdlen
, base_type
);
19765 clonei
->vecsize_int
= vec_bits
;
19766 clonei
->vecsize_float
= vec_bits
;
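
/* Worked example (illustrative only): for a 'float' base type elt_bits is
   32, so when no simdlen clause is given the two Advanced SIMD clones use
   vec_bits of 64 and 128 and therefore simdlen 2 and 4; an explicit
   simdlen of 8 would give vec_bits of 256 and be rejected with the
   warning above.  */

unsigned int
example_simd_clone_simdlen (unsigned int elt_bits, int num)
{
  unsigned int vec_bits = (num == 0 ? 64 : 128);

  return vec_bits / elt_bits;
}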
19770 /* Implement TARGET_SIMD_CLONE_ADJUST. */
19773 aarch64_simd_clone_adjust (struct cgraph_node
*node
)
19775 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
19776 use the correct ABI. */
19778 tree t
= TREE_TYPE (node
->decl
);
19779 TYPE_ATTRIBUTES (t
) = make_attribute ("aarch64_vector_pcs", "default",
19780 TYPE_ATTRIBUTES (t
));
19783 /* Implement TARGET_SIMD_CLONE_USABLE. */
19786 aarch64_simd_clone_usable (struct cgraph_node
*node
)
19788 switch (node
->simdclone
->vecsize_mangle
)
19795 gcc_unreachable ();
/* Implement TARGET_COMP_TYPE_ATTRIBUTES.  */

static int
aarch64_comp_type_attributes (const_tree type1, const_tree type2)
{
  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
      != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
    return 0;
  return 1;
}

/* Implement TARGET_GET_MULTILIB_ABI_NAME.  */

static const char *
aarch64_get_multilib_abi_name (void)
{
  if (TARGET_BIG_END)
    return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
  return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
}

/* Implement TARGET_STACK_PROTECT_GUARD.  In case of a
   global variable based guard use the default else
   return a null tree.  */

static tree
aarch64_stack_protect_guard (void)
{
  if (aarch64_stack_protector_guard == SSP_GLOBAL)
    return default_stack_protect_guard ();

  return NULL_TREE;
}
/* Implement TARGET_ASM_FILE_END for AArch64.  This adds the AArch64 GNU NOTE
   section at the end if needed.  */
#define GNU_PROPERTY_AARCH64_FEATURE_1_AND	0xc0000000
#define GNU_PROPERTY_AARCH64_FEATURE_1_BTI	(1U << 0)
#define GNU_PROPERTY_AARCH64_FEATURE_1_PAC	(1U << 1)
19838 aarch64_file_end_indicate_exec_stack ()
19840 file_end_indicate_exec_stack ();
19842 unsigned feature_1_and
= 0;
19843 if (aarch64_bti_enabled ())
19844 feature_1_and
|= GNU_PROPERTY_AARCH64_FEATURE_1_BTI
;
19846 if (aarch64_ra_sign_scope
!= AARCH64_FUNCTION_NONE
)
19847 feature_1_and
|= GNU_PROPERTY_AARCH64_FEATURE_1_PAC
;
19851 /* Generate .note.gnu.property section. */
19852 switch_to_section (get_section (".note.gnu.property",
19853 SECTION_NOTYPE
, NULL
));
19855 /* PT_NOTE header: namesz, descsz, type.
19856 namesz = 4 ("GNU\0")
19857 descsz = 16 (Size of the program property array)
19858 [(12 + padding) * Number of array elements]
19859 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
19860 assemble_align (POINTER_SIZE
);
19861 assemble_integer (GEN_INT (4), 4, 32, 1);
19862 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES
)), 4, 32, 1);
19863 assemble_integer (GEN_INT (5), 4, 32, 1);
19865 /* PT_NOTE name. */
19866 assemble_string ("GNU", 4);
19868 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
19869 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
19871 data = feature_1_and. */
19872 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND
), 4, 32, 1);
19873 assemble_integer (GEN_INT (4), 4, 32, 1);
19874 assemble_integer (GEN_INT (feature_1_and
), 4, 32, 1);
19876 /* Pad the size of the note to the required alignment. */
19877 assemble_align (POINTER_SIZE
);
19880 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
19881 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
19882 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
/* Target-specific selftests.  */

#if CHECKING_P

namespace selftest {
19890 /* Selftest for the RTL loader.
19891 Verify that the RTL loader copes with a dump from
19892 print_rtx_function. This is essentially just a test that class
19893 function_reader can handle a real dump, but it also verifies
19894 that lookup_reg_by_dump_name correctly handles hard regs.
19895 The presence of hard reg names in the dump means that the test is
19896 target-specific, hence it is in this file. */
19899 aarch64_test_loading_full_dump ()
19901 rtl_dump_test
t (SELFTEST_LOCATION
, locate_file ("aarch64/times-two.rtl"));
19903 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun
->decl
)));
19905 rtx_insn
*insn_1
= get_insn_by_uid (1);
19906 ASSERT_EQ (NOTE
, GET_CODE (insn_1
));
19908 rtx_insn
*insn_15
= get_insn_by_uid (15);
19909 ASSERT_EQ (INSN
, GET_CODE (insn_15
));
19910 ASSERT_EQ (USE
, GET_CODE (PATTERN (insn_15
)));
19912 /* Verify crtl->return_rtx. */
19913 ASSERT_EQ (REG
, GET_CODE (crtl
->return_rtx
));
19914 ASSERT_EQ (0, REGNO (crtl
->return_rtx
));
19915 ASSERT_EQ (SImode
, GET_MODE (crtl
->return_rtx
));
19918 /* Run all target-specific selftests. */
19921 aarch64_run_selftests (void)
19923 aarch64_test_loading_full_dump ();
19926 } // namespace selftest
19928 #endif /* #if CHECKING_P */
19930 #undef TARGET_STACK_PROTECT_GUARD
19931 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
19933 #undef TARGET_ADDRESS_COST
19934 #define TARGET_ADDRESS_COST aarch64_address_cost
/* This hook determines whether unnamed bitfields affect the alignment
   of the containing structure.  The hook returns true if the structure
   should inherit the alignment requirements of an unnamed bitfield's
   field.  */
19940 #undef TARGET_ALIGN_ANON_BITFIELD
19941 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
19943 #undef TARGET_ASM_ALIGNED_DI_OP
19944 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
19946 #undef TARGET_ASM_ALIGNED_HI_OP
19947 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
19949 #undef TARGET_ASM_ALIGNED_SI_OP
19950 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
19952 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
19953 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
19954 hook_bool_const_tree_hwi_hwi_const_tree_true
19956 #undef TARGET_ASM_FILE_START
19957 #define TARGET_ASM_FILE_START aarch64_start_file
19959 #undef TARGET_ASM_OUTPUT_MI_THUNK
19960 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
19962 #undef TARGET_ASM_SELECT_RTX_SECTION
19963 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
19965 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
19966 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
19968 #undef TARGET_BUILD_BUILTIN_VA_LIST
19969 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
19971 #undef TARGET_CALLEE_COPIES
19972 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
19974 #undef TARGET_CAN_ELIMINATE
19975 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
19977 #undef TARGET_CAN_INLINE_P
19978 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
19980 #undef TARGET_CANNOT_FORCE_CONST_MEM
19981 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
19983 #undef TARGET_CASE_VALUES_THRESHOLD
19984 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
19986 #undef TARGET_CONDITIONAL_REGISTER_USAGE
19987 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
19989 /* Only the least significant bit is used for initialization guard
19991 #undef TARGET_CXX_GUARD_MASK_BIT
19992 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
19994 #undef TARGET_C_MODE_FOR_SUFFIX
19995 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
19997 #ifdef TARGET_BIG_ENDIAN_DEFAULT
19998 #undef TARGET_DEFAULT_TARGET_FLAGS
19999 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
20002 #undef TARGET_CLASS_MAX_NREGS
20003 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
20005 #undef TARGET_BUILTIN_DECL
20006 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
20008 #undef TARGET_BUILTIN_RECIPROCAL
20009 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
20011 #undef TARGET_C_EXCESS_PRECISION
20012 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
20014 #undef TARGET_EXPAND_BUILTIN
20015 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
20017 #undef TARGET_EXPAND_BUILTIN_VA_START
20018 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
20020 #undef TARGET_FOLD_BUILTIN
20021 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
20023 #undef TARGET_FUNCTION_ARG
20024 #define TARGET_FUNCTION_ARG aarch64_function_arg
20026 #undef TARGET_FUNCTION_ARG_ADVANCE
20027 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
20029 #undef TARGET_FUNCTION_ARG_BOUNDARY
20030 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
20032 #undef TARGET_FUNCTION_ARG_PADDING
20033 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
20035 #undef TARGET_GET_RAW_RESULT_MODE
20036 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
20037 #undef TARGET_GET_RAW_ARG_MODE
20038 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
20040 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
20041 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
20043 #undef TARGET_FUNCTION_VALUE
20044 #define TARGET_FUNCTION_VALUE aarch64_function_value
20046 #undef TARGET_FUNCTION_VALUE_REGNO_P
20047 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
20049 #undef TARGET_GIMPLE_FOLD_BUILTIN
20050 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
20052 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
20053 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
20055 #undef TARGET_INIT_BUILTINS
20056 #define TARGET_INIT_BUILTINS aarch64_init_builtins
20058 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
20059 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
20060 aarch64_ira_change_pseudo_allocno_class
20062 #undef TARGET_LEGITIMATE_ADDRESS_P
20063 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
20065 #undef TARGET_LEGITIMATE_CONSTANT_P
20066 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
20068 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
20069 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
20070 aarch64_legitimize_address_displacement
20072 #undef TARGET_LIBGCC_CMP_RETURN_MODE
20073 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
20075 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
20076 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
20077 aarch64_libgcc_floating_mode_supported_p
20079 #undef TARGET_MANGLE_TYPE
20080 #define TARGET_MANGLE_TYPE aarch64_mangle_type
20082 #undef TARGET_MEMORY_MOVE_COST
20083 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
20085 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
20086 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
20088 #undef TARGET_MUST_PASS_IN_STACK
20089 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
20091 /* This target hook should return true if accesses to volatile bitfields
20092 should use the narrowest mode possible. It should return false if these
20093 accesses should use the bitfield container type. */
20094 #undef TARGET_NARROW_VOLATILE_BITFIELD
20095 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
20097 #undef TARGET_OPTION_OVERRIDE
20098 #define TARGET_OPTION_OVERRIDE aarch64_override_options
20100 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
20101 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
20102 aarch64_override_options_after_change
20104 #undef TARGET_OPTION_SAVE
20105 #define TARGET_OPTION_SAVE aarch64_option_save
20107 #undef TARGET_OPTION_RESTORE
20108 #define TARGET_OPTION_RESTORE aarch64_option_restore
20110 #undef TARGET_OPTION_PRINT
20111 #define TARGET_OPTION_PRINT aarch64_option_print
20113 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
20114 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
20116 #undef TARGET_SET_CURRENT_FUNCTION
20117 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
20119 #undef TARGET_PASS_BY_REFERENCE
20120 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
20122 #undef TARGET_PREFERRED_RELOAD_CLASS
20123 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
20125 #undef TARGET_SCHED_REASSOCIATION_WIDTH
20126 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
20128 #undef TARGET_PROMOTED_TYPE
20129 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
20131 #undef TARGET_SECONDARY_RELOAD
20132 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
20134 #undef TARGET_SHIFT_TRUNCATION_MASK
20135 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
20137 #undef TARGET_SETUP_INCOMING_VARARGS
20138 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
20140 #undef TARGET_STRUCT_VALUE_RTX
20141 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
20143 #undef TARGET_REGISTER_MOVE_COST
20144 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
20146 #undef TARGET_RETURN_IN_MEMORY
20147 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
20149 #undef TARGET_RETURN_IN_MSB
20150 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
20152 #undef TARGET_RTX_COSTS
20153 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
20155 #undef TARGET_SCALAR_MODE_SUPPORTED_P
20156 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
20158 #undef TARGET_SCHED_ISSUE_RATE
20159 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
20161 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
20162 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
20163 aarch64_sched_first_cycle_multipass_dfa_lookahead
20165 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
20166 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
20167 aarch64_first_cycle_multipass_dfa_lookahead_guard
20169 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
20170 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
20171 aarch64_get_separate_components
20173 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
20174 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
20175 aarch64_components_for_bb
20177 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
20178 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
20179 aarch64_disqualify_components
20181 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
20182 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
20183 aarch64_emit_prologue_components
20185 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
20186 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
20187 aarch64_emit_epilogue_components
20189 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
20190 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
20191 aarch64_set_handled_components
20193 #undef TARGET_TRAMPOLINE_INIT
20194 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
20196 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
20197 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
20199 #undef TARGET_VECTOR_MODE_SUPPORTED_P
20200 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
20202 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
20203 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
20204 aarch64_builtin_support_vector_misalignment
20206 #undef TARGET_ARRAY_MODE
20207 #define TARGET_ARRAY_MODE aarch64_array_mode
20209 #undef TARGET_ARRAY_MODE_SUPPORTED_P
20210 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
20212 #undef TARGET_VECTORIZE_ADD_STMT_COST
20213 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
20215 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
20216 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
20217 aarch64_builtin_vectorization_cost
20219 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
20220 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
20222 #undef TARGET_VECTORIZE_BUILTINS
20223 #define TARGET_VECTORIZE_BUILTINS
20225 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
20226 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
20227 aarch64_builtin_vectorized_function
20229 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
20230 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
20231 aarch64_autovectorize_vector_sizes
20233 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
20234 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
20235 aarch64_atomic_assign_expand_fenv
20237 /* Section anchor support. */
20239 #undef TARGET_MIN_ANCHOR_OFFSET
20240 #define TARGET_MIN_ANCHOR_OFFSET -256
20242 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
20243 byte offset; we can do much more for larger data types, but have no way
20244 to determine the size of the access. We assume accesses are aligned. */
20245 #undef TARGET_MAX_ANCHOR_OFFSET
20246 #define TARGET_MAX_ANCHOR_OFFSET 4095
20248 #undef TARGET_VECTOR_ALIGNMENT
20249 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
20251 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
20252 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
20253 aarch64_vectorize_preferred_vector_alignment
20254 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
20255 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
20256 aarch64_simd_vector_alignment_reachable
20258 /* vec_perm support. */
20260 #undef TARGET_VECTORIZE_VEC_PERM_CONST
20261 #define TARGET_VECTORIZE_VEC_PERM_CONST \
20262 aarch64_vectorize_vec_perm_const
20264 #undef TARGET_VECTORIZE_GET_MASK_MODE
20265 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
20266 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
20267 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
20268 aarch64_empty_mask_is_expensive
20269 #undef TARGET_PREFERRED_ELSE_VALUE
20270 #define TARGET_PREFERRED_ELSE_VALUE \
20271 aarch64_preferred_else_value
20273 #undef TARGET_INIT_LIBFUNCS
20274 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
20276 #undef TARGET_FIXED_CONDITION_CODE_REGS
20277 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
20279 #undef TARGET_FLAGS_REGNUM
20280 #define TARGET_FLAGS_REGNUM CC_REGNUM
20282 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
20283 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
20285 #undef TARGET_ASAN_SHADOW_OFFSET
20286 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
20288 #undef TARGET_LEGITIMIZE_ADDRESS
20289 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
20291 #undef TARGET_SCHED_CAN_SPECULATE_INSN
20292 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
20294 #undef TARGET_CAN_USE_DOLOOP_P
20295 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
20297 #undef TARGET_SCHED_ADJUST_PRIORITY
20298 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
20300 #undef TARGET_SCHED_MACRO_FUSION_P
20301 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
20303 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
20304 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
20306 #undef TARGET_SCHED_FUSION_PRIORITY
20307 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
20309 #undef TARGET_UNSPEC_MAY_TRAP_P
20310 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
20312 #undef TARGET_USE_PSEUDO_PIC_REG
20313 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
20315 #undef TARGET_PRINT_OPERAND
20316 #define TARGET_PRINT_OPERAND aarch64_print_operand
20318 #undef TARGET_PRINT_OPERAND_ADDRESS
20319 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
20321 #undef TARGET_OPTAB_SUPPORTED_P
20322 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
20324 #undef TARGET_OMIT_STRUCT_RETURN_REG
20325 #define TARGET_OMIT_STRUCT_RETURN_REG true
20327 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
20328 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
20329 aarch64_dwarf_poly_indeterminate_value
20331 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
20332 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
20333 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
20335 #undef TARGET_HARD_REGNO_NREGS
20336 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
20337 #undef TARGET_HARD_REGNO_MODE_OK
20338 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
20340 #undef TARGET_MODES_TIEABLE_P
20341 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
20343 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
20344 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
20345 aarch64_hard_regno_call_part_clobbered
20347 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
20348 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
20349 aarch64_remove_extra_call_preserved_regs
20351 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
20352 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
20353 aarch64_return_call_with_max_clobbers
20355 #undef TARGET_CONSTANT_ALIGNMENT
20356 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
20358 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
20359 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
20360 aarch64_stack_clash_protection_alloca_probe_range
20362 #undef TARGET_COMPUTE_PRESSURE_CLASSES
20363 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
20365 #undef TARGET_CAN_CHANGE_MODE_CLASS
20366 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
20368 #undef TARGET_SELECT_EARLY_REMAT_MODES
20369 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
20371 #undef TARGET_SPECULATION_SAFE_VALUE
20372 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
20374 #undef TARGET_ESTIMATED_POLY_VALUE
20375 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
20377 #undef TARGET_ATTRIBUTE_TABLE
20378 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
20380 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
20381 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
20382 aarch64_simd_clone_compute_vecsize_and_simdlen
20384 #undef TARGET_SIMD_CLONE_ADJUST
20385 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
20387 #undef TARGET_SIMD_CLONE_USABLE
20388 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
20390 #undef TARGET_COMP_TYPE_ATTRIBUTES
20391 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
20393 #undef TARGET_GET_MULTILIB_ABI_NAME
20394 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */

#undef TARGET_ASM_POST_CFI_STARTPROC
#define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-aarch64.h"