/* Machine description for AArch64 architecture.
   Copyright (C) 2009-2023 Free Software Foundation, Inc.
   Contributed by ARM Ltd.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#define INCLUDE_STRING
#define INCLUDE_ALGORITHM
#include "coretypes.h"
#include "stringpool.h"
#include "diagnostic.h"
#include "insn-attr.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "langhooks.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "aarch64-cost-tables.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "target-globals.h"
#include "common/common-target.h"
#include "selftest-rtl.h"
#include "rtx-vector-builder.h"
#include "function-abi.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-loop-niter.h"
#include "fractional-cost.h"
#include "aarch64-feature-deps.h"

/* This file should be included last.  */
#include "target-def.h"
/* Defined for convenience.  */
#define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
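/* For example, on the usual configuration with 64-bit pointers
   (POINTER_SIZE == 64) and 8-bit units, POINTER_BYTES evaluates to 8.  */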
/* Information about a legitimate vector immediate operand.  */
struct simd_immediate_info
{
  enum insn_type { MOV, MVN, INDEX, PTRUE };
  enum modifier_type { LSL, MSL };

  simd_immediate_info () {}
  simd_immediate_info (scalar_float_mode, rtx);
  simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
		       insn_type = MOV, modifier_type = LSL,
		       unsigned int = 0);
  simd_immediate_info (scalar_mode, rtx, rtx);
  simd_immediate_info (scalar_int_mode, aarch64_svpattern);

  /* The mode of the elements.  */
  scalar_mode elt_mode;

  /* The instruction to use to move the immediate into a vector.  */
  insn_type insn;

  union
  {
    /* For MOV and MVN.  */
    struct
    {
      /* The value of each element.  */
      rtx value;

      /* The kind of shift modifier to use, and the number of bits to shift.
	 This is (LSL, 0) if no shift is needed.  */
      modifier_type modifier;
      unsigned int shift;
    } mov;

    /* For INDEX.  */
    struct
    {
      /* The value of the first element and the step to be added for each
	 subsequent element.  */
      rtx base, step;
    } index;

    /* For PTRUE.  */
    aarch64_svpattern pattern;
  } u;
};
/* Construct a floating-point immediate in which each element has mode
   ELT_MODE_IN and value VALUE_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
  : elt_mode (elt_mode_in), insn (MOV)
{
  u.mov.value = value_in;
  u.mov.modifier = LSL;
}
/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and value VALUE_IN.  The other parameters are as for the structure
   fields.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
		       unsigned HOST_WIDE_INT value_in,
		       insn_type insn_in, modifier_type modifier_in,
		       unsigned int shift_in)
  : elt_mode (elt_mode_in), insn (insn_in)
{
  u.mov.value = gen_int_mode (value_in, elt_mode_in);
  u.mov.modifier = modifier_in;
  u.mov.shift = shift_in;
}
/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and where element I is equal to BASE_IN + I * STEP_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
  : elt_mode (elt_mode_in), insn (INDEX)
{
  u.index.base = base_in;
  u.index.step = step_in;
}
/* Construct a predicate that controls elements of mode ELT_MODE_IN
   and has PTRUE pattern PATTERN_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
		       aarch64_svpattern pattern_in)
  : elt_mode (elt_mode_in), insn (PTRUE)
{
  u.pattern = pattern_in;
}
/* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64.  */
class pure_scalable_type_info
{
public:
  /* Represents the result of analyzing a type.  All values are nonzero,
     in the possibly forlorn hope that accidental conversions to bool
     trigger a warning.  */
  enum analysis_result
  {
    /* The type does not have an ABI identity; i.e. it doesn't contain
       at least one object whose type is a Fundamental Data Type.  */
    NO_ABI_IDENTITY = 1,

    /* The type is definitely a Pure Scalable Type.  */
    IS_PST,

    /* The type is definitely not a Pure Scalable Type.  */
    ISNT_PST,

    /* It doesn't matter for PCS purposes whether the type is a Pure
       Scalable Type or not, since the type will be handled the same
       way regardless.

       Specifically, this means that if the type is a Pure Scalable Type,
       there aren't enough argument registers to hold it, and so it will
       need to be passed or returned in memory.  If the type isn't a
       Pure Scalable Type, it's too big to be passed or returned in core
       or SIMD&FP registers, and so again will need to go in memory.  */
    DOESNT_MATTER
  };

  /* Aggregates of 17 bytes or more are normally passed and returned
     in memory, so aggregates of that size can safely be analyzed as
     DOESNT_MATTER.  We need to be able to collect enough pieces to
     represent a PST that is smaller than that.  Since predicates are
     2 bytes in size for -msve-vector-bits=128, that means we need to be
     able to store at least 8 pieces.

     We also need to be able to store enough pieces to represent
     a single vector in each vector argument register and a single
     predicate in each predicate argument register.  This means that
     we need at least 12 pieces.  */
  static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
  static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
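  /* Working through the arithmetic above, assuming the standard AAPCS64
     register counts (8 SIMD&FP argument registers v0-v7 and 4 predicate
     argument registers p0-p3): MAX_PIECES is 8 + 4 = 12, which also covers
     the 8 pieces needed for a 16-byte aggregate built from 2-byte
     predicates.  */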
  /* Describes one piece of a PST.  Each piece is one of:

     - a single Scalable Vector Type (SVT)
     - a single Scalable Predicate Type (SPT)
     - a PST containing 2, 3 or 4 SVTs, with no padding

     It either represents a single built-in type or a PST formed from
     multiple homogeneous built-in types.  */
  struct piece
  {
    rtx get_rtx (unsigned int, unsigned int) const;

    /* The number of vector and predicate registers that the piece
       occupies.  One of the two is always zero.  */
    unsigned int num_zr;
    unsigned int num_pr;

    /* The mode of the registers described above.  */
    machine_mode mode;

    /* If this piece is formed from multiple homogeneous built-in types,
       this is the mode of the built-in types, otherwise it is MODE.  */
    machine_mode orig_mode;

    /* The offset in bytes of the piece from the start of the type.  */
    poly_uint64_pod offset;
  };

  /* Divides types analyzed as IS_PST into individual pieces.  The pieces
     are in memory order.  */
  auto_vec<piece, MAX_PIECES> pieces;

  unsigned int num_zr () const;
  unsigned int num_pr () const;

  rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;

  analysis_result analyze (const_tree);
  bool analyze_registers (const_tree);

private:
  analysis_result analyze_array (const_tree);
  analysis_result analyze_record (const_tree);
  void add_piece (const piece &);
};
/* The current code model.  */
enum aarch64_code_model aarch64_cmodel;

/* The number of 64-bit elements in an SVE vector.  */
poly_uint16 aarch64_sve_vg;

#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS 1
static bool aarch64_composite_type_p (const_tree, machine_mode);
static bool aarch64_return_in_memory_1 (const_tree);
static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
						     const_tree,
						     machine_mode *, int *,
						     bool *);
static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_override_options_after_change (void);
static bool aarch64_vector_mode_supported_p (machine_mode);
static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
							 const_tree type,
							 int misalignment,
							 bool is_packed);
static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
					    aarch64_addr_query_type);
/* The processor for which instructions should be scheduled.  */
enum aarch64_processor aarch64_tune = cortexa53;

/* Mask to specify which instruction scheduling options should be used.  */
uint64_t aarch64_tune_flags = 0;

/* Global flag for PC relative loads.  */
bool aarch64_pcrelative_literal_loads;

/* Global flag for whether frame pointer is enabled.  */
bool aarch64_use_frame_pointer;

#define BRANCH_PROTECT_STR_MAX 255
char *accepted_branch_protection_string = NULL;

static enum aarch64_parse_opt_result
aarch64_parse_branch_protection (const char*, char**);
/* Support for command line parsing of boolean flags in the tuning
   structures.  */
struct aarch64_flag_desc
{
  const char* name;
  uint64_t flag;
};

#define AARCH64_FUSION_PAIR(name, internal_name) \
  { name, AARCH64_FUSE_##internal_name },
static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
{
  { "none", AARCH64_FUSE_NOTHING },
#include "aarch64-fusion-pairs.def"
  { "all", AARCH64_FUSE_ALL },
  { NULL, AARCH64_FUSE_NOTHING }
};
#define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
  { name, AARCH64_EXTRA_TUNE_##internal_name },
static const struct aarch64_flag_desc aarch64_tuning_flags[] =
{
  { "none", AARCH64_EXTRA_TUNE_NONE },
#include "aarch64-tuning-flags.def"
  { "all", AARCH64_EXTRA_TUNE_ALL },
  { NULL, AARCH64_EXTRA_TUNE_NONE }
};
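/* A minimal illustrative sketch (an assumed usage pattern, not code from
   this file): given the { name, flag } layout of aarch64_flag_desc, one
   token from a tuning string can be resolved by walking a table until its
   { NULL, ... } sentinel entry.  The real option parsers in this file are
   more involved; this only shows the lookup the tables are designed for.  */
static uint64_t ATTRIBUTE_UNUSED
aarch64_flag_lookup_sketch (const struct aarch64_flag_desc *table,
			    const char *token)
{
  for (; table->name != NULL; table++)
    if (strcmp (token, table->name) == 0)
      return table->flag;
  /* Unknown token: the real parsers report an error; the sketch just
     returns no flags.  */
  return 0;
}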
/* Tuning parameters.  */

static const struct cpu_addrcost_table generic_addrcost_table =
{
  0, /* post_modify_ld3_st3  */
  0, /* post_modify_ld4_st4  */
  0, /* register_offset  */
  0, /* register_sextend  */
  0, /* register_zextend  */
};

static const struct cpu_addrcost_table exynosm1_addrcost_table =
{
  0, /* post_modify_ld3_st3  */
  0, /* post_modify_ld4_st4  */
  1, /* register_offset  */
  1, /* register_sextend  */
  2, /* register_zextend  */
};

static const struct cpu_addrcost_table xgene1_addrcost_table =
{
  1, /* post_modify_ld3_st3  */
  1, /* post_modify_ld4_st4  */
  0, /* register_offset  */
  1, /* register_sextend  */
  1, /* register_zextend  */
};

static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
{
  0, /* post_modify_ld3_st3  */
  0, /* post_modify_ld4_st4  */
  2, /* register_offset  */
  3, /* register_sextend  */
  3, /* register_zextend  */
};

static const struct cpu_addrcost_table thunderx3t110_addrcost_table =
{
  0, /* post_modify_ld3_st3  */
  0, /* post_modify_ld4_st4  */
  2, /* register_offset  */
  3, /* register_sextend  */
  3, /* register_zextend  */
};

static const struct cpu_addrcost_table tsv110_addrcost_table =
{
  0, /* post_modify_ld3_st3  */
  0, /* post_modify_ld4_st4  */
  0, /* register_offset  */
  1, /* register_sextend  */
  1, /* register_zextend  */
};

static const struct cpu_addrcost_table qdf24xx_addrcost_table =
{
  1, /* post_modify_ld3_st3  */
  1, /* post_modify_ld4_st4  */
  3, /* register_offset  */
  3, /* register_sextend  */
  3, /* register_zextend  */
};

static const struct cpu_addrcost_table a64fx_addrcost_table =
{
  0, /* post_modify_ld3_st3  */
  0, /* post_modify_ld4_st4  */
  2, /* register_offset  */
  3, /* register_sextend  */
  3, /* register_zextend  */
};

static const struct cpu_addrcost_table neoversev1_addrcost_table =
{
  3, /* post_modify_ld3_st3  */
  3, /* post_modify_ld4_st4  */
  0, /* register_offset  */
  0, /* register_sextend  */
  0, /* register_zextend  */
};

static const struct cpu_addrcost_table neoversen2_addrcost_table =
{
  2, /* post_modify_ld3_st3  */
  2, /* post_modify_ld4_st4  */
  0, /* register_offset  */
  0, /* register_sextend  */
  0, /* register_zextend  */
};

static const struct cpu_addrcost_table neoversev2_addrcost_table =
{
  2, /* post_modify_ld3_st3  */
  2, /* post_modify_ld4_st4  */
  0, /* register_offset  */
  0, /* register_sextend  */
  0, /* register_zextend  */
};

static const struct cpu_regmove_cost generic_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost cortexa57_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost cortexa53_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost exynosm1_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost (actual, 4 and 9).  */
};

static const struct cpu_regmove_cost thunderx_regmove_cost =
{
};

static const struct cpu_regmove_cost xgene1_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost qdf24xx_regmove_cost =
{
  /* Avoid the use of int<->fp moves for spilling.  */
};

static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
{
  /* Avoid the use of int<->fp moves for spilling.  */
};

static const struct cpu_regmove_cost thunderx3t110_regmove_cost =
{
  /* Avoid the use of int<->fp moves for spilling.  */
};

static const struct cpu_regmove_cost tsv110_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost a64fx_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost neoversen2_regmove_cost =
{
  /* Spilling to int<->fp instead of memory is recommended so set
     realistic costs compared to memmov_cost.  */
};

static const struct cpu_regmove_cost neoversev1_regmove_cost =
{
  /* Spilling to int<->fp instead of memory is recommended so set
     realistic costs compared to memmov_cost.  */
};

static const struct cpu_regmove_cost neoversev2_regmove_cost =
{
  /* Spilling to int<->fp instead of memory is recommended so set
     realistic costs compared to memmov_cost.  */
};
/* Generic costs for Advanced SIMD vector operations.  */
static const advsimd_vec_cost generic_advsimd_vector_cost =
{
  1, /* int_stmt_cost  */
  1, /* fp_stmt_cost  */
  0, /* ld2_st2_permute_cost  */
  0, /* ld3_st3_permute_cost  */
  0, /* ld4_st4_permute_cost  */
  2, /* permute_cost  */
  2, /* reduc_i8_cost  */
  2, /* reduc_i16_cost  */
  2, /* reduc_i32_cost  */
  2, /* reduc_i64_cost  */
  2, /* reduc_f16_cost  */
  2, /* reduc_f32_cost  */
  2, /* reduc_f64_cost  */
  2, /* store_elt_extra_cost  */
  2, /* vec_to_scalar_cost  */
  1, /* scalar_to_vec_cost  */
  1, /* align_load_cost  */
  1, /* unalign_load_cost  */
  1, /* unalign_store_cost  */
};

/* Generic costs for SVE vector operations.  */
static const sve_vec_cost generic_sve_vector_cost =
{
  1, /* int_stmt_cost  */
  1, /* fp_stmt_cost  */
  0, /* ld2_st2_permute_cost  */
  0, /* ld3_st3_permute_cost  */
  0, /* ld4_st4_permute_cost  */
  2, /* permute_cost  */
  2, /* reduc_i8_cost  */
  2, /* reduc_i16_cost  */
  2, /* reduc_i32_cost  */
  2, /* reduc_i64_cost  */
  2, /* reduc_f16_cost  */
  2, /* reduc_f32_cost  */
  2, /* reduc_f64_cost  */
  2, /* store_elt_extra_cost  */
  2, /* vec_to_scalar_cost  */
  1, /* scalar_to_vec_cost  */
  1, /* align_load_cost  */
  1, /* unalign_load_cost  */
  1, /* unalign_store_cost  */
  2, /* fadda_f16_cost  */
  2, /* fadda_f32_cost  */
  2, /* fadda_f64_cost  */
  4, /* gather_load_x32_cost  */
  2, /* gather_load_x64_cost  */
  1 /* scatter_store_elt_cost  */
};

/* Generic costs for vector insn classes.  */
static const struct cpu_vector_cost generic_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  1, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  3, /* cond_taken_branch_cost  */
  1, /* cond_not_taken_branch_cost  */
  &generic_advsimd_vector_cost, /* advsimd  */
  &generic_sve_vector_cost, /* sve  */
  nullptr /* issue_info  */
};

static const advsimd_vec_cost a64fx_advsimd_vector_cost =
{
  2, /* int_stmt_cost  */
  5, /* fp_stmt_cost  */
  0, /* ld2_st2_permute_cost  */
  0, /* ld3_st3_permute_cost  */
  0, /* ld4_st4_permute_cost  */
  3, /* permute_cost  */
  13, /* reduc_i8_cost  */
  13, /* reduc_i16_cost  */
  13, /* reduc_i32_cost  */
  13, /* reduc_i64_cost  */
  13, /* reduc_f16_cost  */
  13, /* reduc_f32_cost  */
  13, /* reduc_f64_cost  */
  13, /* store_elt_extra_cost  */
  13, /* vec_to_scalar_cost  */
  4, /* scalar_to_vec_cost  */
  6, /* align_load_cost  */
  6, /* unalign_load_cost  */
  1, /* unalign_store_cost  */
};

static const sve_vec_cost a64fx_sve_vector_cost =
{
  2, /* int_stmt_cost  */
  5, /* fp_stmt_cost  */
  0, /* ld2_st2_permute_cost  */
  0, /* ld3_st3_permute_cost  */
  0, /* ld4_st4_permute_cost  */
  3, /* permute_cost  */
  13, /* reduc_i8_cost  */
  13, /* reduc_i16_cost  */
  13, /* reduc_i32_cost  */
  13, /* reduc_i64_cost  */
  13, /* reduc_f16_cost  */
  13, /* reduc_f32_cost  */
  13, /* reduc_f64_cost  */
  13, /* store_elt_extra_cost  */
  13, /* vec_to_scalar_cost  */
  4, /* scalar_to_vec_cost  */
  6, /* align_load_cost  */
  6, /* unalign_load_cost  */
  1, /* unalign_store_cost  */
  13, /* fadda_f16_cost  */
  13, /* fadda_f32_cost  */
  13, /* fadda_f64_cost  */
  64, /* gather_load_x32_cost  */
  32, /* gather_load_x64_cost  */
  1 /* scatter_store_elt_cost  */
};

static const struct cpu_vector_cost a64fx_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  5, /* scalar_fp_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  3, /* cond_taken_branch_cost  */
  1, /* cond_not_taken_branch_cost  */
  &a64fx_advsimd_vector_cost, /* advsimd  */
  &a64fx_sve_vector_cost, /* sve  */
  nullptr /* issue_info  */
};
static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
{
  1, /* int_stmt_cost  */
  3, /* fp_stmt_cost  */
  0, /* ld2_st2_permute_cost  */
  0, /* ld3_st3_permute_cost  */
  0, /* ld4_st4_permute_cost  */
  2, /* permute_cost  */
  1, /* reduc_i8_cost  */
  1, /* reduc_i16_cost  */
  1, /* reduc_i32_cost  */
  1, /* reduc_i64_cost  */
  1, /* reduc_f16_cost  */
  1, /* reduc_f32_cost  */
  1, /* reduc_f64_cost  */
  1, /* store_elt_extra_cost  */
  1, /* vec_to_scalar_cost  */
  1, /* scalar_to_vec_cost  */
  1, /* align_load_cost  */
  1, /* unalign_load_cost  */
  1, /* unalign_store_cost  */
};

/* QDF24XX costs for vector insn classes.  */
static const struct cpu_vector_cost qdf24xx_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  1, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  3, /* cond_taken_branch_cost  */
  1, /* cond_not_taken_branch_cost  */
  &qdf24xx_advsimd_vector_cost, /* advsimd  */
  nullptr, /* sve  */
  nullptr /* issue_info  */
};

static const advsimd_vec_cost thunderx_advsimd_vector_cost =
{
  4, /* int_stmt_cost  */
  1, /* fp_stmt_cost  */
  0, /* ld2_st2_permute_cost  */
  0, /* ld3_st3_permute_cost  */
  0, /* ld4_st4_permute_cost  */
  4, /* permute_cost  */
  2, /* reduc_i8_cost  */
  2, /* reduc_i16_cost  */
  2, /* reduc_i32_cost  */
  2, /* reduc_i64_cost  */
  2, /* reduc_f16_cost  */
  2, /* reduc_f32_cost  */
  2, /* reduc_f64_cost  */
  2, /* store_elt_extra_cost  */
  2, /* vec_to_scalar_cost  */
  2, /* scalar_to_vec_cost  */
  3, /* align_load_cost  */
  5, /* unalign_load_cost  */
  5, /* unalign_store_cost  */
};

/* ThunderX costs for vector insn classes.  */
static const struct cpu_vector_cost thunderx_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  3, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  3, /* cond_taken_branch_cost  */
  3, /* cond_not_taken_branch_cost  */
  &thunderx_advsimd_vector_cost, /* advsimd  */
  nullptr, /* sve  */
  nullptr /* issue_info  */
};
static const advsimd_vec_cost tsv110_advsimd_vector_cost =
{
  2, /* int_stmt_cost  */
  2, /* fp_stmt_cost  */
  0, /* ld2_st2_permute_cost  */
  0, /* ld3_st3_permute_cost  */
  0, /* ld4_st4_permute_cost  */
  2, /* permute_cost  */
  3, /* reduc_i8_cost  */
  3, /* reduc_i16_cost  */
  3, /* reduc_i32_cost  */
  3, /* reduc_i64_cost  */
  3, /* reduc_f16_cost  */
  3, /* reduc_f32_cost  */
  3, /* reduc_f64_cost  */
  3, /* store_elt_extra_cost  */
  3, /* vec_to_scalar_cost  */
  2, /* scalar_to_vec_cost  */
  5, /* align_load_cost  */
  5, /* unalign_load_cost  */
  1, /* unalign_store_cost  */
};

static const struct cpu_vector_cost tsv110_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  5, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  1, /* cond_taken_branch_cost  */
  1, /* cond_not_taken_branch_cost  */
  &tsv110_advsimd_vector_cost, /* advsimd  */
  nullptr, /* sve  */
  nullptr /* issue_info  */
};

static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
{
  2, /* int_stmt_cost  */
  2, /* fp_stmt_cost  */
  0, /* ld2_st2_permute_cost  */
  0, /* ld3_st3_permute_cost  */
  0, /* ld4_st4_permute_cost  */
  3, /* permute_cost  */
  8, /* reduc_i8_cost  */
  8, /* reduc_i16_cost  */
  8, /* reduc_i32_cost  */
  8, /* reduc_i64_cost  */
  8, /* reduc_f16_cost  */
  8, /* reduc_f32_cost  */
  8, /* reduc_f64_cost  */
  8, /* store_elt_extra_cost  */
  8, /* vec_to_scalar_cost  */
  8, /* scalar_to_vec_cost  */
  4, /* align_load_cost  */
  4, /* unalign_load_cost  */
  1, /* unalign_store_cost  */
};

/* Cortex-A57 costs for vector insn classes.  */
static const struct cpu_vector_cost cortexa57_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  1, /* cond_taken_branch_cost  */
  1, /* cond_not_taken_branch_cost  */
  &cortexa57_advsimd_vector_cost, /* advsimd  */
  nullptr, /* sve  */
  nullptr /* issue_info  */
};
static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
{
  3, /* int_stmt_cost  */
  3, /* fp_stmt_cost  */
  0, /* ld2_st2_permute_cost  */
  0, /* ld3_st3_permute_cost  */
  0, /* ld4_st4_permute_cost  */
  3, /* permute_cost  */
  3, /* reduc_i8_cost  */
  3, /* reduc_i16_cost  */
  3, /* reduc_i32_cost  */
  3, /* reduc_i64_cost  */
  3, /* reduc_f16_cost  */
  3, /* reduc_f32_cost  */
  3, /* reduc_f64_cost  */
  3, /* store_elt_extra_cost  */
  3, /* vec_to_scalar_cost  */
  3, /* scalar_to_vec_cost  */
  5, /* align_load_cost  */
  5, /* unalign_load_cost  */
  1, /* unalign_store_cost  */
};

static const struct cpu_vector_cost exynosm1_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  5, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  1, /* cond_taken_branch_cost  */
  1, /* cond_not_taken_branch_cost  */
  &exynosm1_advsimd_vector_cost, /* advsimd  */
  nullptr, /* sve  */
  nullptr /* issue_info  */
};

static const advsimd_vec_cost xgene1_advsimd_vector_cost =
{
  2, /* int_stmt_cost  */
  2, /* fp_stmt_cost  */
  0, /* ld2_st2_permute_cost  */
  0, /* ld3_st3_permute_cost  */
  0, /* ld4_st4_permute_cost  */
  2, /* permute_cost  */
  4, /* reduc_i8_cost  */
  4, /* reduc_i16_cost  */
  4, /* reduc_i32_cost  */
  4, /* reduc_i64_cost  */
  4, /* reduc_f16_cost  */
  4, /* reduc_f32_cost  */
  4, /* reduc_f64_cost  */
  4, /* store_elt_extra_cost  */
  4, /* vec_to_scalar_cost  */
  4, /* scalar_to_vec_cost  */
  10, /* align_load_cost  */
  10, /* unalign_load_cost  */
  2, /* unalign_store_cost  */
};

/* X-Gene 1 costs for vector insn classes.  */
static const struct cpu_vector_cost xgene1_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  5, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  2, /* cond_taken_branch_cost  */
  1, /* cond_not_taken_branch_cost  */
  &xgene1_advsimd_vector_cost, /* advsimd  */
  nullptr, /* sve  */
  nullptr /* issue_info  */
};
static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
{
  4, /* int_stmt_cost  */
  5, /* fp_stmt_cost  */
  0, /* ld2_st2_permute_cost  */
  0, /* ld3_st3_permute_cost  */
  0, /* ld4_st4_permute_cost  */
  10, /* permute_cost  */
  6, /* reduc_i8_cost  */
  6, /* reduc_i16_cost  */
  6, /* reduc_i32_cost  */
  6, /* reduc_i64_cost  */
  6, /* reduc_f16_cost  */
  6, /* reduc_f32_cost  */
  6, /* reduc_f64_cost  */
  6, /* store_elt_extra_cost  */
  6, /* vec_to_scalar_cost  */
  5, /* scalar_to_vec_cost  */
  4, /* align_load_cost  */
  4, /* unalign_load_cost  */
  1, /* unalign_store_cost  */
};

/* Costs for vector insn classes for Vulcan.  */
static const struct cpu_vector_cost thunderx2t99_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  6, /* scalar_fp_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  2, /* cond_taken_branch_cost  */
  1, /* cond_not_taken_branch_cost  */
  &thunderx2t99_advsimd_vector_cost, /* advsimd  */
  nullptr, /* sve  */
  nullptr /* issue_info  */
};

static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
{
  5, /* int_stmt_cost  */
  5, /* fp_stmt_cost  */
  0, /* ld2_st2_permute_cost  */
  0, /* ld3_st3_permute_cost  */
  0, /* ld4_st4_permute_cost  */
  10, /* permute_cost  */
  5, /* reduc_i8_cost  */
  5, /* reduc_i16_cost  */
  5, /* reduc_i32_cost  */
  5, /* reduc_i64_cost  */
  5, /* reduc_f16_cost  */
  5, /* reduc_f32_cost  */
  5, /* reduc_f64_cost  */
  5, /* store_elt_extra_cost  */
  5, /* vec_to_scalar_cost  */
  5, /* scalar_to_vec_cost  */
  4, /* align_load_cost  */
  4, /* unalign_load_cost  */
  4, /* unalign_store_cost  */
};

static const struct cpu_vector_cost thunderx3t110_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  5, /* scalar_fp_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  2, /* cond_taken_branch_cost  */
  1, /* cond_not_taken_branch_cost  */
  &thunderx3t110_advsimd_vector_cost, /* advsimd  */
  nullptr, /* sve  */
  nullptr /* issue_info  */
};
static const advsimd_vec_cost ampere1_advsimd_vector_cost =
{
  3, /* int_stmt_cost  */
  3, /* fp_stmt_cost  */
  0, /* ld2_st2_permute_cost  */
  0, /* ld3_st3_permute_cost  */
  0, /* ld4_st4_permute_cost  */
  2, /* permute_cost  */
  12, /* reduc_i8_cost  */
  9, /* reduc_i16_cost  */
  6, /* reduc_i32_cost  */
  5, /* reduc_i64_cost  */
  9, /* reduc_f16_cost  */
  6, /* reduc_f32_cost  */
  5, /* reduc_f64_cost  */
  8, /* store_elt_extra_cost  */
  6, /* vec_to_scalar_cost  */
  7, /* scalar_to_vec_cost  */
  5, /* align_load_cost  */
  5, /* unalign_load_cost  */
  2, /* unalign_store_cost  */
};

/* Ampere-1 costs for vector insn classes.  */
static const struct cpu_vector_cost ampere1_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  1, /* cond_taken_branch_cost  */
  1, /* cond_not_taken_branch_cost  */
  &ampere1_advsimd_vector_cost, /* advsimd  */
  nullptr, /* sve  */
  nullptr /* issue_info  */
};
/* Generic costs for branch instructions.  */
static const struct cpu_branch_cost generic_branch_cost =
{
  1,  /* Predictable.  */
  3   /* Unpredictable.  */
};

/* Generic approximation modes.  */
static const cpu_approx_modes generic_approx_modes =
{
  AARCH64_APPROX_NONE, /* division  */
  AARCH64_APPROX_NONE, /* sqrt  */
  AARCH64_APPROX_NONE  /* recip_sqrt  */
};

/* Approximation modes for Exynos M1.  */
static const cpu_approx_modes exynosm1_approx_modes =
{
  AARCH64_APPROX_NONE, /* division  */
  AARCH64_APPROX_ALL,  /* sqrt  */
  AARCH64_APPROX_ALL   /* recip_sqrt  */
};

/* Approximation modes for X-Gene 1.  */
static const cpu_approx_modes xgene1_approx_modes =
{
  AARCH64_APPROX_NONE, /* division  */
  AARCH64_APPROX_NONE, /* sqrt  */
  AARCH64_APPROX_ALL   /* recip_sqrt  */
};

/* Generic prefetch settings (which disable prefetch).  */
static const cpu_prefetch_tune generic_prefetch_tune =
{
  -1,   /* l1_cache_size  */
  -1,   /* l1_cache_line_size  */
  -1,   /* l2_cache_size  */
  true, /* prefetch_dynamic_strides  */
  -1,   /* minimum_stride  */
  -1    /* default_opt_level  */
};

static const cpu_prefetch_tune exynosm1_prefetch_tune =
{
  -1,   /* l1_cache_size  */
  64,   /* l1_cache_line_size  */
  -1,   /* l2_cache_size  */
  true, /* prefetch_dynamic_strides  */
  -1,   /* minimum_stride  */
  -1    /* default_opt_level  */
};

static const cpu_prefetch_tune qdf24xx_prefetch_tune =
{
  32,    /* l1_cache_size  */
  64,    /* l1_cache_line_size  */
  512,   /* l2_cache_size  */
  false, /* prefetch_dynamic_strides  */
  2048,  /* minimum_stride  */
  3      /* default_opt_level  */
};

static const cpu_prefetch_tune thunderxt88_prefetch_tune =
{
  32,      /* l1_cache_size  */
  128,     /* l1_cache_line_size  */
  16*1024, /* l2_cache_size  */
  true,    /* prefetch_dynamic_strides  */
  -1,      /* minimum_stride  */
  3        /* default_opt_level  */
};

static const cpu_prefetch_tune thunderx_prefetch_tune =
{
  32,   /* l1_cache_size  */
  128,  /* l1_cache_line_size  */
  -1,   /* l2_cache_size  */
  true, /* prefetch_dynamic_strides  */
  -1,   /* minimum_stride  */
  -1    /* default_opt_level  */
};

static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
{
  32,   /* l1_cache_size  */
  64,   /* l1_cache_line_size  */
  256,  /* l2_cache_size  */
  true, /* prefetch_dynamic_strides  */
  -1,   /* minimum_stride  */
  -1    /* default_opt_level  */
};

static const cpu_prefetch_tune thunderx3t110_prefetch_tune =
{
  32,   /* l1_cache_size  */
  64,   /* l1_cache_line_size  */
  256,  /* l2_cache_size  */
  true, /* prefetch_dynamic_strides  */
  -1,   /* minimum_stride  */
  -1    /* default_opt_level  */
};

static const cpu_prefetch_tune tsv110_prefetch_tune =
{
  64,   /* l1_cache_size  */
  64,   /* l1_cache_line_size  */
  512,  /* l2_cache_size  */
  true, /* prefetch_dynamic_strides  */
  -1,   /* minimum_stride  */
  -1    /* default_opt_level  */
};

static const cpu_prefetch_tune xgene1_prefetch_tune =
{
  32,   /* l1_cache_size  */
  64,   /* l1_cache_line_size  */
  256,  /* l2_cache_size  */
  true, /* prefetch_dynamic_strides  */
  -1,   /* minimum_stride  */
  -1    /* default_opt_level  */
};

static const cpu_prefetch_tune a64fx_prefetch_tune =
{
  64,    /* l1_cache_size  */
  256,   /* l1_cache_line_size  */
  32768, /* l2_cache_size  */
  true,  /* prefetch_dynamic_strides  */
  -1,    /* minimum_stride  */
  -1     /* default_opt_level  */
};

static const cpu_prefetch_tune ampere1_prefetch_tune =
{
  64,   /* l1_cache_size  */
  64,   /* l1_cache_line_size  */
  2048, /* l2_cache_size  */
  true, /* prefetch_dynamic_strides  */
  -1,   /* minimum_stride  */
  -1    /* default_opt_level  */
};
static const struct tune_params generic_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  { 4, /* load_int.  */
  }, /* memmov_cost.  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
  "16:12", /* function_align.  */
  "4", /* jump_align.  */
  "8", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* fma_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  /* Enabling AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS significantly benefits
     Neoverse V1.  It does not have a noticeable effect on A64FX and should
     have at most a very minor effect on SVE2 cores.  */
  (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS), /* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa35_tunings =
{
  &cortexa53_extra_costs,
  &generic_addrcost_table,
  &cortexa53_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  { 4, /* load_int.  */
  }, /* memmov_cost.  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
  "16", /* function_align.  */
  "4", /* jump_align.  */
  "8", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* fma_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa53_tunings =
{
  &cortexa53_extra_costs,
  &generic_addrcost_table,
  &cortexa53_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  { 4, /* load_int.  */
  }, /* memmov_cost.  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
  "16", /* function_align.  */
  "4", /* jump_align.  */
  "8", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* fma_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &generic_prefetch_tune
};
static const struct tune_params cortexa57_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  { 4, /* load_int.  */
  }, /* memmov_cost.  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  "16", /* function_align.  */
  "4", /* jump_align.  */
  "8", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* fma_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa72_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  { 4, /* load_int.  */
  }, /* memmov_cost.  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  "16", /* function_align.  */
  "4", /* jump_align.  */
  "8", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* fma_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa73_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  { 4, /* load_int.  */
  }, /* memmov_cost.  */
  2, /* issue_rate.  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
  "16", /* function_align.  */
  "4", /* jump_align.  */
  "8", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* fma_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &generic_prefetch_tune
};
static const struct tune_params exynosm1_tunings =
{
  &exynosm1_extra_costs,
  &exynosm1_addrcost_table,
  &exynosm1_regmove_cost,
  &exynosm1_vector_cost,
  &generic_branch_cost,
  &exynosm1_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  { 4, /* load_int.  */
  }, /* memmov_cost.  */
  (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
  "4", /* function_align.  */
  "4", /* jump_align.  */
  "4", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* fma_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  48, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &exynosm1_prefetch_tune
};

static const struct tune_params thunderxt88_tunings =
{
  &thunderx_extra_costs,
  &generic_addrcost_table,
  &thunderx_regmove_cost,
  &thunderx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  { 6, /* load_int.  */
  }, /* memmov_cost.  */
  AARCH64_FUSE_ALU_BRANCH, /* fusible_ops  */
  "8", /* function_align.  */
  "8", /* jump_align.  */
  "8", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* fma_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags.  */
  &thunderxt88_prefetch_tune
};

static const struct tune_params thunderx_tunings =
{
  &thunderx_extra_costs,
  &generic_addrcost_table,
  &thunderx_regmove_cost,
  &thunderx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  { 6, /* load_int.  */
  }, /* memmov_cost.  */
  AARCH64_FUSE_ALU_BRANCH, /* fusible_ops  */
  "8", /* function_align.  */
  "8", /* jump_align.  */
  "8", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* fma_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
   | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags.  */
  &thunderx_prefetch_tune
};
static const struct tune_params tsv110_tunings =
{
  &tsv110_extra_costs,
  &tsv110_addrcost_table,
  &tsv110_regmove_cost,
  &tsv110_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  { 4, /* load_int.  */
  }, /* memmov_cost.  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
  "16", /* function_align.  */
  "4", /* jump_align.  */
  "8", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* fma_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &tsv110_prefetch_tune
};

static const struct tune_params xgene1_tunings =
{
  &xgene1_extra_costs,
  &xgene1_addrcost_table,
  &xgene1_regmove_cost,
  &xgene1_vector_cost,
  &generic_branch_cost,
  &xgene1_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  { 6, /* load_int.  */
  }, /* memmov_cost.  */
  AARCH64_FUSE_NOTHING, /* fusible_ops  */
  "16", /* function_align.  */
  "16", /* jump_align.  */
  "16", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* fma_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  17, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags.  */
  &xgene1_prefetch_tune
};

static const struct tune_params emag_tunings =
{
  &xgene1_extra_costs,
  &xgene1_addrcost_table,
  &xgene1_regmove_cost,
  &xgene1_vector_cost,
  &generic_branch_cost,
  &xgene1_approx_modes,
  SVE_NOT_IMPLEMENTED,
  { 6, /* load_int.  */
  }, /* memmov_cost.  */
  AARCH64_FUSE_NOTHING, /* fusible_ops  */
  "16", /* function_align.  */
  "16", /* jump_align.  */
  "16", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* fma_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  17, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags.  */
  &xgene1_prefetch_tune
};
static const struct tune_params qdf24xx_tunings =
{
  &qdf24xx_extra_costs,
  &qdf24xx_addrcost_table,
  &qdf24xx_regmove_cost,
  &qdf24xx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  { 4, /* load_int.  */
  }, /* memmov_cost.  */
  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
  "16", /* function_align.  */
  "8", /* jump_align.  */
  "16", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* fma_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags.  */
  &qdf24xx_prefetch_tune
};

/* Tuning structure for the Qualcomm Saphira core.  Default to falkor values
   for now.  */
static const struct tune_params saphira_tunings =
{
  &generic_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  { 4, /* load_int.  */
  }, /* memmov_cost.  */
  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
  "16", /* function_align.  */
  "8", /* jump_align.  */
  "16", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* fma_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &generic_prefetch_tune
};
static const struct tune_params thunderx2t99_tunings =
{
  &thunderx2t99_extra_costs,
  &thunderx2t99_addrcost_table,
  &thunderx2t99_regmove_cost,
  &thunderx2t99_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  { 4, /* load_int.  */
  }, /* memmov_cost.  */
  4, /* issue_rate.  */
  (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
  "16", /* function_align.  */
  "8", /* jump_align.  */
  "16", /* loop_align.  */
  3, /* int_reassoc_width.  */
  2, /* fp_reassoc_width.  */
  1, /* fma_reassoc_width.  */
  2, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &thunderx2t99_prefetch_tune
};

static const struct tune_params thunderx3t110_tunings =
{
  &thunderx3t110_extra_costs,
  &thunderx3t110_addrcost_table,
  &thunderx3t110_regmove_cost,
  &thunderx3t110_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  { 4, /* load_int.  */
  }, /* memmov_cost.  */
  6, /* issue_rate.  */
  (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
  "16", /* function_align.  */
  "8", /* jump_align.  */
  "16", /* loop_align.  */
  3, /* int_reassoc_width.  */
  2, /* fp_reassoc_width.  */
  1, /* fma_reassoc_width.  */
  2, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &thunderx3t110_prefetch_tune
};
static const struct tune_params neoversen1_tunings =
{
  &cortexa76_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  { 4, /* load_int.  */
  }, /* memmov_cost.  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
  "32:16", /* function_align.  */
  "4", /* jump_align.  */
  "32:16", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* fma_reassoc_width.  */
  2, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags.  */
  &generic_prefetch_tune
};
static const struct tune_params ampere1_tunings =
{
  &ampere1_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &ampere1_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  { 4, /* load_int.  */
  }, /* memmov_cost.  */
  (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
   AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
   AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
   AARCH64_FUSE_CMP_BRANCH),
  "32", /* function_align.  */
  "4", /* jump_align.  */
  "32:16", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* fma_reassoc_width.  */
  2, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &ampere1_prefetch_tune
};

static const struct tune_params ampere1a_tunings =
{
  &ampere1a_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &ampere1_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  { 4, /* load_int.  */
  }, /* memmov_cost.  */
  (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
   AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
   AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
   AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ |
   AARCH64_FUSE_ADDSUB_2REG_CONST1),
  "32", /* function_align.  */
  "4", /* jump_align.  */
  "32:16", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* fma_reassoc_width.  */
  2, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &ampere1_prefetch_tune
};
static const advsimd_vec_cost neoversev1_advsimd_vector_cost =
{
  2, /* int_stmt_cost  */
  2, /* fp_stmt_cost  */
  4, /* ld2_st2_permute_cost  */
  4, /* ld3_st3_permute_cost  */
  5, /* ld4_st4_permute_cost  */
  3, /* permute_cost  */
  4, /* reduc_i8_cost  */
  4, /* reduc_i16_cost  */
  2, /* reduc_i32_cost  */
  2, /* reduc_i64_cost  */
  6, /* reduc_f16_cost  */
  3, /* reduc_f32_cost  */
  2, /* reduc_f64_cost  */
  2, /* store_elt_extra_cost  */
  /* This value is just inherited from the Cortex-A57 table.  */
  8, /* vec_to_scalar_cost  */
  /* This depends very much on what the scalar value is and
     where it comes from.  E.g. some constants take two dependent
     instructions or a load, while others might be moved from a GPR.
     4 seems to be a reasonable compromise in practice.  */
  4, /* scalar_to_vec_cost  */
  4, /* align_load_cost  */
  4, /* unalign_load_cost  */
  /* Although stores have a latency of 2 and compete for the
     vector pipes, in practice it's better not to model that.  */
  1, /* unalign_store_cost  */
};
static const sve_vec_cost neoversev1_sve_vector_cost =
{
  2, /* int_stmt_cost  */
  2, /* fp_stmt_cost  */
  4, /* ld2_st2_permute_cost  */
  7, /* ld3_st3_permute_cost  */
  8, /* ld4_st4_permute_cost  */
  3, /* permute_cost  */
  /* Theoretically, a reduction involving 31 scalar ADDs could
     complete in ~9 cycles and would have a cost of 31.  [SU]ADDV
     completes in 14 cycles, so give it a cost of 31 + 5.  */
  36, /* reduc_i8_cost  */
  /* Likewise for 15 scalar ADDs (~5 cycles) vs. 12: 15 + 7.  */
  22, /* reduc_i16_cost  */
  /* Likewise for 7 scalar ADDs (~3 cycles) vs. 10: 7 + 7.  */
  14, /* reduc_i32_cost  */
  /* Likewise for 3 scalar ADDs (~2 cycles) vs. 10: 3 + 8.  */
  11, /* reduc_i64_cost  */
  /* Theoretically, a reduction involving 15 scalar FADDs could
     complete in ~9 cycles and would have a cost of 30.  FADDV
     completes in 13 cycles, so give it a cost of 30 + 4.  */
  34, /* reduc_f16_cost  */
  /* Likewise for 7 scalar FADDs (~6 cycles) vs. 11: 14 + 5.  */
  19, /* reduc_f32_cost  */
  /* Likewise for 3 scalar FADDs (~4 cycles) vs. 9: 6 + 5.  */
  11, /* reduc_f64_cost  */
  2, /* store_elt_extra_cost  */
  /* This value is just inherited from the Cortex-A57 table.  */
  8, /* vec_to_scalar_cost  */
  /* See the comment above the Advanced SIMD versions.  */
  4, /* scalar_to_vec_cost  */
  4, /* align_load_cost  */
  4, /* unalign_load_cost  */
  /* Although stores have a latency of 2 and compete for the
     vector pipes, in practice it's better not to model that.  */
  1, /* unalign_store_cost  */
  19, /* fadda_f16_cost  */
  11, /* fadda_f32_cost  */
  8, /* fadda_f64_cost  */
  32, /* gather_load_x32_cost  */
  16, /* gather_load_x64_cost  */
  3 /* scatter_store_elt_cost  */
};
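/* Note on the reduction costs above: each entry is derived as the cost of
   the equivalent chain of scalar additions plus the number of extra cycles
   the [SU]ADDV/FADDV takes over that chain, e.g. 31 + (14 - 9) = 36 for
   reduc_i8_cost.  */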
static const aarch64_scalar_vec_issue_info neoversev1_scalar_issue_info =
{
  3, /* loads_stores_per_cycle  */
  2, /* stores_per_cycle  */
  4, /* general_ops_per_cycle  */
  0, /* fp_simd_load_general_ops  */
  1 /* fp_simd_store_general_ops  */
};

static const aarch64_advsimd_vec_issue_info neoversev1_advsimd_issue_info =
{
  {
    3, /* loads_stores_per_cycle  */
    2, /* stores_per_cycle  */
    4, /* general_ops_per_cycle  */
    0, /* fp_simd_load_general_ops  */
    1 /* fp_simd_store_general_ops  */
  },
  2, /* ld2_st2_general_ops  */
  2, /* ld3_st3_general_ops  */
  3 /* ld4_st4_general_ops  */
};

static const aarch64_sve_vec_issue_info neoversev1_sve_issue_info =
{
  {
    {
      2, /* loads_per_cycle  */
      2, /* stores_per_cycle  */
      2, /* general_ops_per_cycle  */
      0, /* fp_simd_load_general_ops  */
      1 /* fp_simd_store_general_ops  */
    },
    2, /* ld2_st2_general_ops  */
    2, /* ld3_st3_general_ops  */
    3 /* ld4_st4_general_ops  */
  },
  1, /* pred_ops_per_cycle  */
  2, /* while_pred_ops  */
  2, /* int_cmp_pred_ops  */
  1, /* fp_cmp_pred_ops  */
  1, /* gather_scatter_pair_general_ops  */
  1 /* gather_scatter_pair_pred_ops  */
};

static const aarch64_vec_issue_info neoversev1_vec_issue_info =
{
  &neoversev1_scalar_issue_info,
  &neoversev1_advsimd_issue_info,
  &neoversev1_sve_issue_info
};

/* Neoverse V1 costs for vector insn classes.  */
static const struct cpu_vector_cost neoversev1_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  2, /* scalar_fp_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  1, /* cond_taken_branch_cost  */
  1, /* cond_not_taken_branch_cost  */
  &neoversev1_advsimd_vector_cost, /* advsimd  */
  &neoversev1_sve_vector_cost, /* sve  */
  &neoversev1_vec_issue_info /* issue_info  */
};
static const struct tune_params neoversev1_tunings =
{
  &cortexa76_extra_costs,
  &neoversev1_addrcost_table,
  &neoversev1_regmove_cost,
  &neoversev1_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_256, /* sve_width  */
  { 4, /* load_int.  */
  }, /* memmov_cost.  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
  "32:16", /* function_align.  */
  "4", /* jump_align.  */
  "32:16", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  4, /* fma_reassoc_width.  */
  2, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
   | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags.  */
  &generic_prefetch_tune
};
static const sve_vec_cost neoverse512tvb_sve_vector_cost =
{
  2, /* int_stmt_cost  */
  2, /* fp_stmt_cost  */
  4, /* ld2_st2_permute_cost  */
  5, /* ld3_st3_permute_cost  */
  5, /* ld4_st4_permute_cost  */
  3, /* permute_cost  */
  /* Theoretically, a reduction involving 15 scalar ADDs could
     complete in ~5 cycles and would have a cost of 15.  Assume that
     [SU]ADDV completes in 11 cycles and so give it a cost of 15 + 6.  */
  21, /* reduc_i8_cost  */
  /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6.  */
  13, /* reduc_i16_cost  */
  /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6.  */
  9, /* reduc_i32_cost  */
  /* Likewise for 1 scalar ADD (1 cycle) vs. 8: 1 + 7.  */
  8, /* reduc_i64_cost  */
  /* Theoretically, a reduction involving 7 scalar FADDs could
     complete in ~6 cycles and would have a cost of 14.  Assume that
     FADDV completes in 8 cycles and so give it a cost of 14 + 2.  */
  16, /* reduc_f16_cost  */
  /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2.  */
  8, /* reduc_f32_cost  */
  /* Likewise for 1 scalar FADD (2 cycles) vs. 4: 2 + 2.  */
  4, /* reduc_f64_cost  */
  2, /* store_elt_extra_cost  */
  /* This value is just inherited from the Cortex-A57 table.  */
  8, /* vec_to_scalar_cost  */
  /* This depends very much on what the scalar value is and
     where it comes from.  E.g. some constants take two dependent
     instructions or a load, while others might be moved from a GPR.
     4 seems to be a reasonable compromise in practice.  */
  4, /* scalar_to_vec_cost  */
  4, /* align_load_cost  */
  4, /* unalign_load_cost  */
  /* Although stores generally have a latency of 2 and compete for the
     vector pipes, in practice it's better not to model that.  */
  1, /* unalign_store_cost  */
  10, /* fadda_f16_cost  */
  6, /* fadda_f32_cost  */
  4, /* fadda_f64_cost  */
  /* A strided Advanced SIMD x64 load would take two parallel FP loads
     (6 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
     is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
     (cost 8) and a vec_construct (cost 2).  Add a full vector operation
     (cost 2) to that, to avoid the difference being lost in rounding.

     There is no easy comparison between a strided Advanced SIMD x32 load
     and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
     operation more than a 64-bit gather.  */
  14, /* gather_load_x32_cost  */
  12, /* gather_load_x64_cost  */
  3 /* scatter_store_elt_cost  */
};
2221 static const aarch64_sve_vec_issue_info neoverse512tvb_sve_issue_info
=
2225 3, /* loads_per_cycle */
2226 2, /* stores_per_cycle */
2227 4, /* general_ops_per_cycle */
2228 0, /* fp_simd_load_general_ops */
2229 1 /* fp_simd_store_general_ops */
2231 2, /* ld2_st2_general_ops */
2232 2, /* ld3_st3_general_ops */
2233 3 /* ld4_st4_general_ops */
2235 2, /* pred_ops_per_cycle */
2236 2, /* while_pred_ops */
2237 2, /* int_cmp_pred_ops */
2238 1, /* fp_cmp_pred_ops */
2239 1, /* gather_scatter_pair_general_ops */
2240 1 /* gather_scatter_pair_pred_ops */
static const aarch64_vec_issue_info neoverse512tvb_vec_issue_info =
{
  &neoversev1_scalar_issue_info,
  &neoversev1_advsimd_issue_info,
  &neoverse512tvb_sve_issue_info
};

static const struct cpu_vector_cost neoverse512tvb_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  2, /* scalar_fp_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  1, /* cond_taken_branch_cost  */
  1, /* cond_not_taken_branch_cost  */
  &neoversev1_advsimd_vector_cost, /* advsimd  */
  &neoverse512tvb_sve_vector_cost, /* sve  */
  &neoverse512tvb_vec_issue_info /* issue_info  */
};

static const struct tune_params neoverse512tvb_tunings =
{
  &cortexa76_extra_costs,
  &neoversev1_addrcost_table,
  &neoversev1_regmove_cost,
  &neoverse512tvb_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_128 | SVE_256, /* sve_width  */
  { 4, /* load_int.  */
    /* ...  */
  }, /* memmov_cost.  */
  /* ...  */
  (AARCH64_FUSE_AES_AESMC
   | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
  "32:16", /* function_align.  */
  "4",     /* jump_align.  */
  "32:16", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  4, /* fma_reassoc_width.  */
  2, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags.  */
  &generic_prefetch_tune
};
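/* Note: the sve_width of SVE_128 | SVE_256 above reflects the intent of the
   neoverse-512tvb model: tune for implementations with either a 128-bit or a
   256-bit SVE vector length rather than assuming one fixed width.  */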
static const advsimd_vec_cost neoversen2_advsimd_vector_cost =
{
  2, /* int_stmt_cost  */
  2, /* fp_stmt_cost  */
  2, /* ld2_st2_permute_cost  */
  2, /* ld3_st3_permute_cost  */
  3, /* ld4_st4_permute_cost  */
  3, /* permute_cost  */
  4, /* reduc_i8_cost  */
  4, /* reduc_i16_cost  */
  2, /* reduc_i32_cost  */
  2, /* reduc_i64_cost  */
  6, /* reduc_f16_cost  */
  4, /* reduc_f32_cost  */
  2, /* reduc_f64_cost  */
  2, /* store_elt_extra_cost  */
  /* This value is just inherited from the Cortex-A57 table.  */
  8, /* vec_to_scalar_cost  */
  /* This depends very much on what the scalar value is and
     where it comes from.  E.g. some constants take two dependent
     instructions or a load, while others might be moved from a GPR.
     4 seems to be a reasonable compromise in practice.  */
  4, /* scalar_to_vec_cost  */
  4, /* align_load_cost  */
  4, /* unalign_load_cost  */
  /* Although stores have a latency of 2 and compete for the
     vector pipes, in practice it's better not to model that.  */
  1, /* unalign_store_cost  */
  /* ...  */
};

static const sve_vec_cost neoversen2_sve_vector_cost =
{
  {
    2, /* int_stmt_cost  */
    2, /* fp_stmt_cost  */
    3, /* ld2_st2_permute_cost  */
    4, /* ld3_st3_permute_cost  */
    4, /* ld4_st4_permute_cost  */
    3, /* permute_cost  */
    /* Theoretically, a reduction involving 15 scalar ADDs could
       complete in ~5 cycles and would have a cost of 15.  [SU]ADDV
       completes in 11 cycles, so give it a cost of 15 + 6.  */
    21, /* reduc_i8_cost  */
    /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6.  */
    13, /* reduc_i16_cost  */
    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6.  */
    9, /* reduc_i32_cost  */
    /* Likewise for 1 scalar ADD (~1 cycles) vs. 2: 1 + 1.  */
    2, /* reduc_i64_cost  */
    /* Theoretically, a reduction involving 7 scalar FADDs could
       complete in ~8 cycles and would have a cost of 14.  FADDV
       completes in 6 cycles, so give it a cost of 14 - 2.  */
    12, /* reduc_f16_cost  */
    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 6 - 0.  */
    6, /* reduc_f32_cost  */
    /* Likewise for 1 scalar FADD (~2 cycles) vs. 2: 2 - 0.  */
    2, /* reduc_f64_cost  */
    2, /* store_elt_extra_cost  */
    /* This value is just inherited from the Cortex-A57 table.  */
    8, /* vec_to_scalar_cost  */
    /* See the comment above the Advanced SIMD versions.  */
    4, /* scalar_to_vec_cost  */
    4, /* align_load_cost  */
    4, /* unalign_load_cost  */
    /* Although stores have a latency of 2 and compete for the
       vector pipes, in practice it's better not to model that.  */
    1, /* unalign_store_cost  */
    /* ...  */
  },
  /* ...  */
  10, /* fadda_f16_cost  */
  6, /* fadda_f32_cost  */
  4, /* fadda_f64_cost  */
  /* A strided Advanced SIMD x64 load would take two parallel FP loads
     (8 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
     is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
     (cost 8) and a vec_construct (cost 2).  Add a full vector operation
     (cost 2) to that, to avoid the difference being lost in rounding.

     There is no easy comparison between a strided Advanced SIMD x32 load
     and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
     operation more than a 64-bit gather.  */
  14, /* gather_load_x32_cost  */
  12, /* gather_load_x64_cost  */
  3 /* scatter_store_elt_cost  */
};
static const aarch64_scalar_vec_issue_info neoversen2_scalar_issue_info =
{
  3, /* loads_stores_per_cycle  */
  2, /* stores_per_cycle  */
  4, /* general_ops_per_cycle  */
  0, /* fp_simd_load_general_ops  */
  1 /* fp_simd_store_general_ops  */
};

static const aarch64_advsimd_vec_issue_info neoversen2_advsimd_issue_info =
{
  {
    3, /* loads_stores_per_cycle  */
    2, /* stores_per_cycle  */
    2, /* general_ops_per_cycle  */
    0, /* fp_simd_load_general_ops  */
    1 /* fp_simd_store_general_ops  */
  },
  2, /* ld2_st2_general_ops  */
  2, /* ld3_st3_general_ops  */
  3 /* ld4_st4_general_ops  */
};

static const aarch64_sve_vec_issue_info neoversen2_sve_issue_info =
{
  {
    {
      3, /* loads_per_cycle  */
      2, /* stores_per_cycle  */
      2, /* general_ops_per_cycle  */
      0, /* fp_simd_load_general_ops  */
      1 /* fp_simd_store_general_ops  */
    },
    2, /* ld2_st2_general_ops  */
    3, /* ld3_st3_general_ops  */
    3 /* ld4_st4_general_ops  */
  },
  2, /* pred_ops_per_cycle  */
  2, /* while_pred_ops  */
  2, /* int_cmp_pred_ops  */
  1, /* fp_cmp_pred_ops  */
  1, /* gather_scatter_pair_general_ops  */
  1 /* gather_scatter_pair_pred_ops  */
};

static const aarch64_vec_issue_info neoversen2_vec_issue_info =
{
  &neoversen2_scalar_issue_info,
  &neoversen2_advsimd_issue_info,
  &neoversen2_sve_issue_info
};
/* Neoverse N2 costs for vector insn classes.  */
static const struct cpu_vector_cost neoversen2_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  2, /* scalar_fp_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  1, /* cond_taken_branch_cost  */
  1, /* cond_not_taken_branch_cost  */
  &neoversen2_advsimd_vector_cost, /* advsimd  */
  &neoversen2_sve_vector_cost, /* sve  */
  &neoversen2_vec_issue_info /* issue_info  */
};

static const struct tune_params neoversen2_tunings =
{
  &cortexa76_extra_costs,
  &neoversen2_addrcost_table,
  &neoversen2_regmove_cost,
  &neoversen2_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_128, /* sve_width  */
  { 4, /* load_int.  */
    /* ...  */
  }, /* memmov_cost.  */
  /* ...  */
  (AARCH64_FUSE_AES_AESMC
   | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
  "32:16", /* function_align.  */
  "4",     /* jump_align.  */
  "32:16", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* fma_reassoc_width.  */
  2, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
   | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags.  */
  &generic_prefetch_tune
};
static const advsimd_vec_cost neoversev2_advsimd_vector_cost =
{
  2, /* int_stmt_cost  */
  2, /* fp_stmt_cost  */
  2, /* ld2_st2_permute_cost  */
  2, /* ld3_st3_permute_cost  */
  3, /* ld4_st4_permute_cost  */
  3, /* permute_cost  */
  4, /* reduc_i8_cost  */
  4, /* reduc_i16_cost  */
  2, /* reduc_i32_cost  */
  2, /* reduc_i64_cost  */
  6, /* reduc_f16_cost  */
  3, /* reduc_f32_cost  */
  2, /* reduc_f64_cost  */
  2, /* store_elt_extra_cost  */
  /* This value is just inherited from the Cortex-A57 table.  */
  8, /* vec_to_scalar_cost  */
  /* This depends very much on what the scalar value is and
     where it comes from.  E.g. some constants take two dependent
     instructions or a load, while others might be moved from a GPR.
     4 seems to be a reasonable compromise in practice.  */
  4, /* scalar_to_vec_cost  */
  4, /* align_load_cost  */
  4, /* unalign_load_cost  */
  /* Although stores have a latency of 2 and compete for the
     vector pipes, in practice it's better not to model that.  */
  1, /* unalign_store_cost  */
  /* ...  */
};

static const sve_vec_cost neoversev2_sve_vector_cost =
{
  {
    2, /* int_stmt_cost  */
    2, /* fp_stmt_cost  */
    3, /* ld2_st2_permute_cost  */
    3, /* ld3_st3_permute_cost  */
    4, /* ld4_st4_permute_cost  */
    3, /* permute_cost  */
    /* Theoretically, a reduction involving 15 scalar ADDs could
       complete in ~3 cycles and would have a cost of 15.  [SU]ADDV
       completes in 11 cycles, so give it a cost of 15 + 8.  */
    21, /* reduc_i8_cost  */
    /* Likewise for 7 scalar ADDs (~2 cycles) vs. 9: 7 + 7.  */
    14, /* reduc_i16_cost  */
    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 4.  */
    7, /* reduc_i32_cost  */
    /* Likewise for 1 scalar ADD (~1 cycles) vs. 2: 1 + 1.  */
    2, /* reduc_i64_cost  */
    /* Theoretically, a reduction involving 7 scalar FADDs could
       complete in ~6 cycles and would have a cost of 14.  FADDV
       completes in 8 cycles, so give it a cost of 14 + 2.  */
    16, /* reduc_f16_cost  */
    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2.  */
    8, /* reduc_f32_cost  */
    /* Likewise for 1 scalar FADD (~2 cycles) vs. 4: 2 + 2.  */
    4, /* reduc_f64_cost  */
    2, /* store_elt_extra_cost  */
    /* This value is just inherited from the Cortex-A57 table.  */
    8, /* vec_to_scalar_cost  */
    /* See the comment above the Advanced SIMD versions.  */
    4, /* scalar_to_vec_cost  */
    4, /* align_load_cost  */
    4, /* unalign_load_cost  */
    /* Although stores have a latency of 2 and compete for the
       vector pipes, in practice it's better not to model that.  */
    1, /* unalign_store_cost  */
    /* ...  */
  },
  /* ...  */
  10, /* fadda_f16_cost  */
  6, /* fadda_f32_cost  */
  4, /* fadda_f64_cost  */
  /* A strided Advanced SIMD x64 load would take two parallel FP loads
     (8 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
     is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
     (cost 8) and a vec_construct (cost 2).  Add a full vector operation
     (cost 2) to that, to avoid the difference being lost in rounding.

     There is no easy comparison between a strided Advanced SIMD x32 load
     and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
     operation more than a 64-bit gather.  */
  14, /* gather_load_x32_cost  */
  12, /* gather_load_x64_cost  */
  3 /* scatter_store_elt_cost  */
};
static const aarch64_scalar_vec_issue_info neoversev2_scalar_issue_info =
{
  3, /* loads_stores_per_cycle  */
  2, /* stores_per_cycle  */
  6, /* general_ops_per_cycle  */
  0, /* fp_simd_load_general_ops  */
  1 /* fp_simd_store_general_ops  */
};

static const aarch64_advsimd_vec_issue_info neoversev2_advsimd_issue_info =
{
  {
    3, /* loads_stores_per_cycle  */
    2, /* stores_per_cycle  */
    4, /* general_ops_per_cycle  */
    0, /* fp_simd_load_general_ops  */
    1 /* fp_simd_store_general_ops  */
  },
  2, /* ld2_st2_general_ops  */
  2, /* ld3_st3_general_ops  */
  3 /* ld4_st4_general_ops  */
};

static const aarch64_sve_vec_issue_info neoversev2_sve_issue_info =
{
  {
    {
      3, /* loads_per_cycle  */
      2, /* stores_per_cycle  */
      4, /* general_ops_per_cycle  */
      0, /* fp_simd_load_general_ops  */
      1 /* fp_simd_store_general_ops  */
    },
    2, /* ld2_st2_general_ops  */
    3, /* ld3_st3_general_ops  */
    3 /* ld4_st4_general_ops  */
  },
  2, /* pred_ops_per_cycle  */
  2, /* while_pred_ops  */
  2, /* int_cmp_pred_ops  */
  1, /* fp_cmp_pred_ops  */
  1, /* gather_scatter_pair_general_ops  */
  1 /* gather_scatter_pair_pred_ops  */
};

static const aarch64_vec_issue_info neoversev2_vec_issue_info =
{
  &neoversev2_scalar_issue_info,
  &neoversev2_advsimd_issue_info,
  &neoversev2_sve_issue_info
};
/* Neoverse V2 (Demeter) costs for vector insn classes.  */
static const struct cpu_vector_cost neoversev2_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  2, /* scalar_fp_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  1, /* cond_taken_branch_cost  */
  1, /* cond_not_taken_branch_cost  */
  &neoversev2_advsimd_vector_cost, /* advsimd  */
  &neoversev2_sve_vector_cost, /* sve  */
  &neoversev2_vec_issue_info /* issue_info  */
};

static const struct tune_params neoversev2_tunings =
{
  &cortexa76_extra_costs,
  &neoversev2_addrcost_table,
  &neoversev2_regmove_cost,
  &neoversev2_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_128, /* sve_width  */
  { 4, /* load_int.  */
    /* ...  */
  }, /* memmov_cost.  */
  /* ...  */
  (AARCH64_FUSE_AES_AESMC
   | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
  "32:16", /* function_align.  */
  "4",     /* jump_align.  */
  "32:16", /* loop_align.  */
  3, /* int_reassoc_width.  */
  6, /* fp_reassoc_width.  */
  4, /* fma_reassoc_width.  */
  3, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
   | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags.  */
  &generic_prefetch_tune
};
static const struct tune_params a64fx_tunings =
{
  &a64fx_extra_costs,
  &a64fx_addrcost_table,
  &a64fx_regmove_cost,
  &a64fx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_512, /* sve_width  */
  { 4, /* load_int.  */
    /* ...  */
  }, /* memmov_cost.  */
  /* ...  */
  (AARCH64_FUSE_AES_AESMC
   | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
  "32", /* function_align.  */
  "16", /* jump_align.  */
  "32", /* loop_align.  */
  4, /* int_reassoc_width.  */
  2, /* fp_reassoc_width.  */
  1, /* fma_reassoc_width.  */
  2, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &a64fx_prefetch_tune
};
/* Support for fine-grained override of the tuning structures.  */
struct aarch64_tuning_override_function
{
  const char* name;
  void (*parse_override)(const char*, struct tune_params*);
};

static void aarch64_parse_fuse_string (const char*, struct tune_params*);
static void aarch64_parse_tune_string (const char*, struct tune_params*);
static void aarch64_parse_sve_width_string (const char*, struct tune_params*);

static const struct aarch64_tuning_override_function
  aarch64_tuning_override_functions[] =
{
  { "fuse", aarch64_parse_fuse_string },
  { "tune", aarch64_parse_tune_string },
  { "sve_width", aarch64_parse_sve_width_string },
};
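/* These entries back the -moverride option: the override string is split
   into name=value pairs and each name is looked up in the table above.
   For example, -moverride=sve_width=256 routes "256" to
   aarch64_parse_sve_width_string.  */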
/* A processor implementing AArch64.  */
struct processor
{
  const char *const name;
  aarch64_processor ident;
  aarch64_processor sched_core;
  enum aarch64_arch arch;
  aarch64_feature_flags flags;
  const tune_params *tune;
};

/* Architectures implementing AArch64.  */
static CONSTEXPR const processor all_architectures[] =
{
#define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, D, E) \
  {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, \
   feature_deps::ARCH_IDENT ().enable, NULL},
#include "aarch64-arches.def"
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
};

/* Processor cores implementing AArch64.  */
static const struct processor all_cores[] =
{
#define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, E, COSTS, G, H, I) \
  {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
   feature_deps::cpu_##IDENT, &COSTS##_tunings},
#include "aarch64-cores.def"
  {"generic", generic, cortexa53, AARCH64_ARCH_V8A,
   feature_deps::V8A ().enable, &generic_tunings},
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
};

/* The current tuning set.  */
struct tune_params aarch64_tune_params = generic_tunings;
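/* This default is replaced during option processing when -mcpu or -mtune
   selects one of the entries in all_cores above.  */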
/* Check whether an 'aarch64_vector_pcs' attribute is valid.  */

static tree
handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
				     int, bool *no_add_attrs)
{
  /* Since we set fn_type_req to true, the caller should have checked
     this for us.  */
  gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
  switch ((arm_pcs) fntype_abi (*node).id ())
    {
    case ARM_PCS_AAPCS64:
    case ARM_PCS_SIMD:
      return NULL_TREE;

    case ARM_PCS_SVE:
      error ("the %qE attribute cannot be applied to an SVE function type",
	     name);
      *no_add_attrs = true;
      return NULL_TREE;

    case ARM_PCS_TLSDESC:
    case ARM_PCS_UNKNOWN:
      break;
    }
  gcc_unreachable ();
}
/* Table of machine attributes.  */
static const struct attribute_spec aarch64_attribute_table[] =
{
  /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
       affects_type_identity, handler, exclude } */
  { "aarch64_vector_pcs", 0, 0, false, true, true, true,
    handle_aarch64_vector_pcs_attribute, NULL },
  { "arm_sve_vector_bits", 1, 1, false, true, false, true,
    aarch64_sve::handle_arm_sve_vector_bits_attribute,
    NULL },
  { "Advanced SIMD type", 1, 1, false, true, false, true, NULL, NULL },
  { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
  { "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL },
  { NULL, 0, 0, false, false, false, false, NULL, NULL }
};
/* An ISA extension in the co-processor and main instruction set space.  */
struct aarch64_option_extension
{
  const char *const name;
  const unsigned long flags_on;
  const unsigned long flags_off;
};

typedef enum aarch64_cond_code
{
  AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
  AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
  AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
}
aarch64_cc;

#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
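/* The XOR with 1 works because the enumeration above lists each condition
   next to its logical inverse: EQ/NE, CS/CC, MI/PL, VS/VC, HI/LS, GE/LT,
   GT/LE and AL/NV differ only in the low bit, mirroring the AArch64
   condition-code encoding.  */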
struct aarch64_branch_protect_type
{
  /* The type's name that the user passes to the branch-protection option
     string.  */
  const char* name;
  /* Function to handle the protection type and set global variables.
     First argument is the string token corresponding with this type and the
     second argument is the next token in the option string.
     Return values:
     * AARCH64_PARSE_OK: Handling was successful.
     * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
       should print an error.
     * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
       own error.  */
  enum aarch64_parse_opt_result (*handler)(char*, char*);
  /* A list of types that can follow this type in the option string.  */
  const aarch64_branch_protect_type* subtypes;
  unsigned int num_subtypes;
};
static enum aarch64_parse_opt_result
aarch64_handle_no_branch_protection (char* str, char* rest)
{
  aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
  aarch64_enable_bti = 0;
  if (rest)
    {
      error ("unexpected %<%s%> after %<%s%>", rest, str);
      return AARCH64_PARSE_INVALID_FEATURE;
    }
  return AARCH64_PARSE_OK;
}

static enum aarch64_parse_opt_result
aarch64_handle_standard_branch_protection (char* str, char* rest)
{
  aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
  aarch64_ra_sign_key = AARCH64_KEY_A;
  aarch64_enable_bti = 1;
  if (rest)
    {
      error ("unexpected %<%s%> after %<%s%>", rest, str);
      return AARCH64_PARSE_INVALID_FEATURE;
    }
  return AARCH64_PARSE_OK;
}

static enum aarch64_parse_opt_result
aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
				   char* rest ATTRIBUTE_UNUSED)
{
  aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
  aarch64_ra_sign_key = AARCH64_KEY_A;
  return AARCH64_PARSE_OK;
}

static enum aarch64_parse_opt_result
aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
			     char* rest ATTRIBUTE_UNUSED)
{
  aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
  return AARCH64_PARSE_OK;
}

static enum aarch64_parse_opt_result
aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
			      char* rest ATTRIBUTE_UNUSED)
{
  aarch64_ra_sign_key = AARCH64_KEY_B;
  return AARCH64_PARSE_OK;
}

static enum aarch64_parse_opt_result
aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
			       char* rest ATTRIBUTE_UNUSED)
{
  aarch64_enable_bti = 1;
  return AARCH64_PARSE_OK;
}
static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
  { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
  { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
  { NULL, NULL, NULL, 0 }
};

static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
  { "none", aarch64_handle_no_branch_protection, NULL, 0 },
  { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
  { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
    ARRAY_SIZE (aarch64_pac_ret_subtypes) },
  { "bti", aarch64_handle_bti_protection, NULL, 0 },
  { NULL, NULL, NULL, 0 }
};
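/* For example, -mbranch-protection=pac-ret+leaf+b-key walks these tables:
   "pac-ret" selects aarch64_handle_pac_ret_protection, and the following
   "leaf" and "b-key" tokens are then looked up among its
   aarch64_pac_ret_subtypes.  */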
/* The condition codes of the processor, and the inverse function.  */
static const char * const aarch64_condition_codes[] =
{
  "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
  "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
};

/* The preferred condition codes for SVE conditions.  */
static const char *const aarch64_sve_condition_codes[] =
{
  "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
  "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
};

/* Return the assembly token for svpattern value VALUE.  */

static const char *
svpattern_token (enum aarch64_svpattern pattern)
{
  switch (pattern)
    {
#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
    AARCH64_FOR_SVPATTERN (CASE)
#undef CASE
    case AARCH64_NUM_SVPATTERNS:
      break;
    }
  gcc_unreachable ();
}
2954 /* Return the location of a piece that is known to be passed or returned
2955 in registers. FIRST_ZR is the first unused vector argument register
2956 and FIRST_PR is the first unused predicate argument register. */
2959 pure_scalable_type_info::piece::get_rtx (unsigned int first_zr
,
2960 unsigned int first_pr
) const
2962 gcc_assert (VECTOR_MODE_P (mode
)
2963 && first_zr
+ num_zr
<= V0_REGNUM
+ NUM_FP_ARG_REGS
2964 && first_pr
+ num_pr
<= P0_REGNUM
+ NUM_PR_ARG_REGS
);
2966 if (num_zr
> 0 && num_pr
== 0)
2967 return gen_rtx_REG (mode
, first_zr
);
2969 if (num_zr
== 0 && num_pr
== 1)
2970 return gen_rtx_REG (mode
, first_pr
);
2975 /* Return the total number of vector registers required by the PST. */
2978 pure_scalable_type_info::num_zr () const
2980 unsigned int res
= 0;
2981 for (unsigned int i
= 0; i
< pieces
.length (); ++i
)
2982 res
+= pieces
[i
].num_zr
;
2986 /* Return the total number of predicate registers required by the PST. */
2989 pure_scalable_type_info::num_pr () const
2991 unsigned int res
= 0;
2992 for (unsigned int i
= 0; i
< pieces
.length (); ++i
)
2993 res
+= pieces
[i
].num_pr
;
2997 /* Return the location of a PST that is known to be passed or returned
2998 in registers. FIRST_ZR is the first unused vector argument register
2999 and FIRST_PR is the first unused predicate argument register. */
3002 pure_scalable_type_info::get_rtx (machine_mode mode
,
3003 unsigned int first_zr
,
3004 unsigned int first_pr
) const
3006 /* Try to return a single REG if possible. This leads to better
3007 code generation; it isn't required for correctness. */
3008 if (mode
== pieces
[0].mode
)
3010 gcc_assert (pieces
.length () == 1);
3011 return pieces
[0].get_rtx (first_zr
, first_pr
);
3014 /* Build up a PARALLEL that contains the individual pieces. */
3015 rtvec rtxes
= rtvec_alloc (pieces
.length ());
3016 for (unsigned int i
= 0; i
< pieces
.length (); ++i
)
3018 rtx reg
= pieces
[i
].get_rtx (first_zr
, first_pr
);
3019 rtx offset
= gen_int_mode (pieces
[i
].offset
, Pmode
);
3020 RTVEC_ELT (rtxes
, i
) = gen_rtx_EXPR_LIST (VOIDmode
, reg
, offset
);
3021 first_zr
+= pieces
[i
].num_zr
;
3022 first_pr
+= pieces
[i
].num_pr
;
3024 return gen_rtx_PARALLEL (mode
, rtxes
);
3027 /* Analyze whether TYPE is a Pure Scalable Type according to the rules
3030 pure_scalable_type_info::analysis_result
3031 pure_scalable_type_info::analyze (const_tree type
)
3033 /* Prevent accidental reuse. */
3034 gcc_assert (pieces
.is_empty ());
3036 /* No code will be generated for erroneous types, so we won't establish
3038 if (type
== error_mark_node
)
3039 return NO_ABI_IDENTITY
;
3041 /* Zero-sized types disappear in the language->ABI mapping. */
3042 if (TYPE_SIZE (type
) && integer_zerop (TYPE_SIZE (type
)))
3043 return NO_ABI_IDENTITY
;
3045 /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs. */
3047 if (aarch64_sve::builtin_type_p (type
, &p
.num_zr
, &p
.num_pr
))
3049 machine_mode mode
= TYPE_MODE_RAW (type
);
3050 gcc_assert (VECTOR_MODE_P (mode
)
3051 && (!TARGET_SVE
|| aarch64_sve_mode_p (mode
)));
3053 p
.mode
= p
.orig_mode
= mode
;
3058 /* Check for user-defined PSTs. */
3059 if (TREE_CODE (type
) == ARRAY_TYPE
)
3060 return analyze_array (type
);
3061 if (TREE_CODE (type
) == RECORD_TYPE
)
3062 return analyze_record (type
);
3067 /* Analyze a type that is known not to be passed or returned in memory.
3068 Return true if it has an ABI identity and is a Pure Scalable Type. */
3071 pure_scalable_type_info::analyze_registers (const_tree type
)
3073 analysis_result result
= analyze (type
);
3074 gcc_assert (result
!= DOESNT_MATTER
);
3075 return result
== IS_PST
;
3078 /* Subroutine of analyze for handling ARRAY_TYPEs. */
3080 pure_scalable_type_info::analysis_result
3081 pure_scalable_type_info::analyze_array (const_tree type
)
3083 /* Analyze the element type. */
3084 pure_scalable_type_info element_info
;
3085 analysis_result result
= element_info
.analyze (TREE_TYPE (type
));
3086 if (result
!= IS_PST
)
3089 /* An array of unknown, flexible or variable length will be passed and
3090 returned by reference whatever we do. */
3091 tree nelts_minus_one
= array_type_nelts (type
);
3092 if (!tree_fits_uhwi_p (nelts_minus_one
))
3093 return DOESNT_MATTER
;
3095 /* Likewise if the array is constant-sized but too big to be interesting.
3096 The double checks against MAX_PIECES are to protect against overflow. */
3097 unsigned HOST_WIDE_INT count
= tree_to_uhwi (nelts_minus_one
);
3098 if (count
> MAX_PIECES
)
3099 return DOESNT_MATTER
;
3101 if (count
* element_info
.pieces
.length () > MAX_PIECES
)
3102 return DOESNT_MATTER
;
3104 /* The above checks should have weeded out elements of unknown size. */
3105 poly_uint64 element_bytes
;
3106 if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type
)), &element_bytes
))
3109 /* Build up the list of individual vectors and predicates. */
3110 gcc_assert (!element_info
.pieces
.is_empty ());
3111 for (unsigned int i
= 0; i
< count
; ++i
)
3112 for (unsigned int j
= 0; j
< element_info
.pieces
.length (); ++j
)
3114 piece p
= element_info
.pieces
[j
];
3115 p
.offset
+= i
* element_bytes
;
3121 /* Subroutine of analyze for handling RECORD_TYPEs. */
3123 pure_scalable_type_info::analysis_result
3124 pure_scalable_type_info::analyze_record (const_tree type
)
3126 for (tree field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
3128 if (TREE_CODE (field
) != FIELD_DECL
)
3131 /* Zero-sized fields disappear in the language->ABI mapping. */
3132 if (DECL_SIZE (field
) && integer_zerop (DECL_SIZE (field
)))
3135 /* All fields with an ABI identity must be PSTs for the record as
3136 a whole to be a PST. If any individual field is too big to be
3137 interesting then the record is too. */
3138 pure_scalable_type_info field_info
;
3139 analysis_result subresult
= field_info
.analyze (TREE_TYPE (field
));
3140 if (subresult
== NO_ABI_IDENTITY
)
3142 if (subresult
!= IS_PST
)
3145 /* Since all previous fields are PSTs, we ought to be able to track
3146 the field offset using poly_ints. */
3147 tree bitpos
= bit_position (field
);
3148 gcc_assert (poly_int_tree_p (bitpos
));
3150 /* For the same reason, it shouldn't be possible to create a PST field
3151 whose offset isn't byte-aligned. */
3152 poly_widest_int wide_bytepos
= exact_div (wi::to_poly_widest (bitpos
),
3155 /* Punt if the record is too big to be interesting. */
3156 poly_uint64 bytepos
;
3157 if (!wide_bytepos
.to_uhwi (&bytepos
)
3158 || pieces
.length () + field_info
.pieces
.length () > MAX_PIECES
)
3159 return DOESNT_MATTER
;
3161 /* Add the individual vectors and predicates in the field to the
3163 gcc_assert (!field_info
.pieces
.is_empty ());
3164 for (unsigned int i
= 0; i
< field_info
.pieces
.length (); ++i
)
3166 piece p
= field_info
.pieces
[i
];
3167 p
.offset
+= bytepos
;
3171 /* Empty structures disappear in the language->ABI mapping. */
3172 return pieces
.is_empty () ? NO_ABI_IDENTITY
: IS_PST
;
3175 /* Add P to the list of pieces in the type. */
3178 pure_scalable_type_info::add_piece (const piece
&p
)
3180 /* Try to fold the new piece into the previous one to form a
3181 single-mode PST. For example, if we see three consecutive vectors
3182 of the same mode, we can represent them using the corresponding
3185 This is purely an optimization. */
3186 if (!pieces
.is_empty ())
3188 piece
&prev
= pieces
.last ();
3189 gcc_assert (VECTOR_MODE_P (p
.mode
) && VECTOR_MODE_P (prev
.mode
));
3190 unsigned int nelems1
, nelems2
;
3191 if (prev
.orig_mode
== p
.orig_mode
3192 && known_eq (prev
.offset
+ GET_MODE_SIZE (prev
.mode
), p
.offset
)
3193 && constant_multiple_p (GET_MODE_NUNITS (prev
.mode
),
3194 GET_MODE_NUNITS (p
.orig_mode
), &nelems1
)
3195 && constant_multiple_p (GET_MODE_NUNITS (p
.mode
),
3196 GET_MODE_NUNITS (p
.orig_mode
), &nelems2
)
3197 && targetm
.array_mode (p
.orig_mode
,
3198 nelems1
+ nelems2
).exists (&prev
.mode
))
3200 prev
.num_zr
+= p
.num_zr
;
3201 prev
.num_pr
+= p
.num_pr
;
3205 pieces
.quick_push (p
);
3208 /* Return true if at least one possible value of type TYPE includes at
3209 least one object of Pure Scalable Type, in the sense of the AAPCS64.
3211 This is a relatively expensive test for some types, so it should
3212 generally be made as late as possible. */
3215 aarch64_some_values_include_pst_objects_p (const_tree type
)
3217 if (TYPE_SIZE (type
) && integer_zerop (TYPE_SIZE (type
)))
3220 if (aarch64_sve::builtin_type_p (type
))
3223 if (TREE_CODE (type
) == ARRAY_TYPE
|| TREE_CODE (type
) == COMPLEX_TYPE
)
3224 return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type
));
3226 if (RECORD_OR_UNION_TYPE_P (type
))
3227 for (tree field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
3228 if (TREE_CODE (field
) == FIELD_DECL
3229 && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field
)))
/* Return the descriptor of the SIMD ABI.  */

static const predefined_function_abi &
aarch64_simd_abi (void)
{
  predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
  if (!simd_abi.initialized_p ())
    {
      HARD_REG_SET full_reg_clobbers
	= default_function_abi.full_reg_clobbers ();
      for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
	if (FP_SIMD_SAVED_REGNUM_P (regno))
	  CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
      simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
    }
  return simd_abi;
}

/* Return the descriptor of the SVE PCS.  */

static const predefined_function_abi &
aarch64_sve_abi (void)
{
  predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
  if (!sve_abi.initialized_p ())
    {
      HARD_REG_SET full_reg_clobbers
	= default_function_abi.full_reg_clobbers ();
      for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
	CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
      for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
	CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
      sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
    }
  return sve_abi;
}
/* If X is an UNSPEC_SALT_ADDR expression, return the address that it
   wraps, otherwise return X itself.  */

static rtx
strip_salt (rtx x)
{
  rtx search = x;
  if (GET_CODE (search) == CONST)
    search = XEXP (search, 0);
  if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR)
    x = XVECEXP (search, 0, 0);
  return x;
}

/* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the
   expression.  */

static rtx
strip_offset_and_salt (rtx addr, poly_int64 *offset)
{
  return strip_salt (strip_offset (addr, offset));
}
/* Generate code to enable conditional branches in functions over 1 MiB.  */
const char *
aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
			const char * branch_format)
{
  rtx_code_label * tmp_label = gen_label_rtx ();
  char label_buf[256];
  char buffer[128];
  ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
			       CODE_LABEL_NUMBER (tmp_label));
  const char *label_ptr = targetm.strip_name_encoding (label_buf);
  rtx dest_label = operands[pos_label];
  operands[pos_label] = tmp_label;

  snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
  output_asm_insn (buffer, operands);

  snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
  operands[pos_label] = dest_label;
  output_asm_insn (buffer, operands);
  return "";
}
void
aarch64_err_no_fpadvsimd (machine_mode mode)
{
  if (TARGET_GENERAL_REGS_ONLY)
    if (FLOAT_MODE_P (mode))
      error ("%qs is incompatible with the use of floating-point types",
	     "-mgeneral-regs-only");
    else
      error ("%qs is incompatible with the use of vector types",
	     "-mgeneral-regs-only");
  else
    if (FLOAT_MODE_P (mode))
      error ("%qs feature modifier is incompatible with the use of"
	     " floating-point types", "+nofp");
    else
      error ("%qs feature modifier is incompatible with the use of"
	     " vector types", "+nofp");
}

/* Report when we try to do something that requires SVE when SVE is disabled.
   This is an error of last resort and isn't very high-quality.  It usually
   involves attempts to measure the vector length in some way.  */

static void
aarch64_report_sve_required (void)
{
  static bool reported_p = false;

  /* Avoid reporting a slew of messages for a single oversight.  */
  if (reported_p)
    return;

  error ("this operation requires the SVE ISA extension");
  inform (input_location, "you can enable SVE using the command-line"
	  " option %<-march%>, or by using the %<target%>"
	  " attribute or pragma");
  reported_p = true;
}

/* Return true if REGNO is P0-P15 or one of the special FFR-related
   registers.  */
inline bool
pr_or_ffr_regnum_p (unsigned int regno)
{
  return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
}
/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
   The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
   GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
   higher cost.  POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
   and GENERAL_REGS is lower than the memory cost (in this case the best class
   is the lowest cost one).  Using POINTER_AND_FP_REGS irrespectively of its
   cost results in bad allocations with many redundant int<->FP moves which
   are expensive on various cores.
   To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
   force a decision between FP_REGS and GENERAL_REGS.  We use the allocno
   class if it isn't POINTER_AND_FP_REGS.  Similarly, use the best class if
   it isn't POINTER_AND_FP_REGS.  Otherwise set the allocno class depending
   on the mode.  The result of this is that it is no longer inefficient to
   have a higher memory move cost than the register move cost.  */

static reg_class_t
aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
					 reg_class_t best_class)
{
  machine_mode mode;

  if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
      || !reg_class_subset_p (FP_REGS, allocno_class))
    return allocno_class;

  if (!reg_class_subset_p (GENERAL_REGS, best_class)
      || !reg_class_subset_p (FP_REGS, best_class))
    return best_class;

  mode = PSEUDO_REGNO_MODE (regno);
  return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
}
static unsigned int
aarch64_min_divisions_for_recip_mul (machine_mode mode)
{
  if (GET_MODE_UNIT_SIZE (mode) == 4)
    return aarch64_tune_params.min_div_recip_mul_sf;
  return aarch64_tune_params.min_div_recip_mul_df;
}

/* Return the reassociation width of treeop OPC with mode MODE.  */
static int
aarch64_reassociation_width (unsigned opc, machine_mode mode)
{
  if (VECTOR_MODE_P (mode))
    return aarch64_tune_params.vec_reassoc_width;
  if (INTEGRAL_MODE_P (mode))
    return aarch64_tune_params.int_reassoc_width;
  /* Reassociation reduces the number of FMAs which may result in worse
     performance.  Use a per-CPU setting for FMA reassociation which allows
     narrow CPUs with few FP pipes to switch it off (value of 1), and wider
     CPUs with many FP pipes to enable reassociation.
     Since the reassociation pass doesn't understand FMA at all, assume
     that any FP addition might turn into FMA.  */
  if (FLOAT_MODE_P (mode))
    return opc == PLUS_EXPR ? aarch64_tune_params.fma_reassoc_width
			    : aarch64_tune_params.fp_reassoc_width;
  return 1;
}
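/* For example, with the neoversev2 tunings above, FP additions (which might
   become FMAs) reassociate with width 4 while other FP operations use
   width 6, and integer reassociation uses width 3.  */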
/* Provide a mapping from gcc register numbers to dwarf register numbers.  */
unsigned
aarch64_debugger_regno (unsigned regno)
{
   if (GP_REGNUM_P (regno))
     return AARCH64_DWARF_R0 + regno - R0_REGNUM;
   else if (regno == SP_REGNUM)
     return AARCH64_DWARF_SP;
   else if (FP_REGNUM_P (regno))
     return AARCH64_DWARF_V0 + regno - V0_REGNUM;
   else if (PR_REGNUM_P (regno))
     return AARCH64_DWARF_P0 + regno - P0_REGNUM;
   else if (regno == VG_REGNUM)
     return AARCH64_DWARF_VG;

   /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
      equivalent DWARF register.  */
   return DWARF_FRAME_REGISTERS;
}
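/* Per the AArch64 DWARF register numbering, this maps x0-x30 to 0-30, sp to
   31, VG to 46, p0-p15 to 48-63 and v0-v31 to 64-95.  */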
/* Implement TARGET_DWARF_FRAME_REG_MODE.  */
static machine_mode
aarch64_dwarf_frame_reg_mode (int regno)
{
  /* Predicate registers are call-clobbered in the EH ABI (which is
     ARM_PCS_AAPCS64), so they should not be described by CFI.
     Their size changes as VL changes, so any values computed by
     __builtin_init_dwarf_reg_size_table might not be valid for
     all frames.  */
  if (PR_REGNUM_P (regno))
    return VOIDmode;
  return default_dwarf_frame_reg_mode (regno);
}
/* If X is a CONST_DOUBLE, return its bit representation as a constant
   integer, otherwise return X unmodified.  */
static rtx
aarch64_bit_representation (rtx x)
{
  if (CONST_DOUBLE_P (x))
    x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
  return x;
}
/* Return an estimate for the number of quadwords in an SVE vector.  This is
   equivalent to the number of Advanced SIMD vectors in an SVE vector.  */

static unsigned int
aarch64_estimated_sve_vq ()
{
  return estimated_poly_value (BITS_PER_SVE_VECTOR) / 128;
}

/* Return true if MODE is an SVE predicate mode.  */
static bool
aarch64_sve_pred_mode_p (machine_mode mode)
{
  return (TARGET_SVE
	  && (mode == VNx16BImode
	      || mode == VNx8BImode
	      || mode == VNx4BImode
	      || mode == VNx2BImode));
}

/* Three mutually-exclusive flags describing a vector or predicate type.  */
const unsigned int VEC_ADVSIMD  = 1;
const unsigned int VEC_SVE_DATA = 2;
const unsigned int VEC_SVE_PRED = 4;

/* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
   a structure of 2, 3 or 4 vectors.  */
const unsigned int VEC_STRUCT   = 8;

/* Can be used in combination with VEC_SVE_DATA to indicate that the
   vector has fewer significant bytes than a full SVE vector.  */
const unsigned int VEC_PARTIAL  = 16;

/* Useful combinations of the above.  */
const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
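/* For example, an Advanced SIMD LD2 tuple of Q registers classifies as
   VEC_ADVSIMD | VEC_STRUCT, while a partial SVE vector such as VNx2SImode
   classifies as VEC_SVE_DATA | VEC_PARTIAL.  */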
3503 /* Return a set of flags describing the vector properties of mode MODE.
3504 Ignore modes that are not supported by the current target. */
3506 aarch64_classify_vector_mode (machine_mode mode
)
3508 if (aarch64_sve_pred_mode_p (mode
))
3509 return VEC_SVE_PRED
;
3511 /* Make the decision based on the mode's enum value rather than its
3512 properties, so that we keep the correct classification regardless
3513 of -msve-vector-bits. */
3516 /* Partial SVE QI vectors. */
3520 /* Partial SVE HI vectors. */
3523 /* Partial SVE SI vector. */
3525 /* Partial SVE HF vectors. */
3528 /* Partial SVE BF vectors. */
3531 /* Partial SVE SF vector. */
3533 return TARGET_SVE
? VEC_SVE_DATA
| VEC_PARTIAL
: 0;
3543 return TARGET_SVE
? VEC_SVE_DATA
: 0;
3545 /* x2 SVE vectors. */
3554 /* x3 SVE vectors. */
3563 /* x4 SVE vectors. */
3572 return TARGET_SVE
? VEC_SVE_DATA
| VEC_STRUCT
: 0;
3577 return TARGET_FLOAT
? VEC_ADVSIMD
| VEC_STRUCT
: 0;
3579 /* Structures of 64-bit Advanced SIMD vectors. */
3604 return TARGET_FLOAT
? VEC_ADVSIMD
| VEC_STRUCT
| VEC_PARTIAL
: 0;
3606 /* Structures of 128-bit Advanced SIMD vectors. */
3631 return TARGET_FLOAT
? VEC_ADVSIMD
| VEC_STRUCT
: 0;
3633 /* 64-bit Advanced SIMD vectors. */
3642 /* 128-bit Advanced SIMD vectors. */
3651 return TARGET_FLOAT
? VEC_ADVSIMD
: 0;
3658 /* Return true if MODE is any of the Advanced SIMD structure modes. */
3660 aarch64_advsimd_struct_mode_p (machine_mode mode
)
3662 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
3663 return (vec_flags
& VEC_ADVSIMD
) && (vec_flags
& VEC_STRUCT
);
3666 /* Return true if MODE is an Advanced SIMD D-register structure mode. */
3668 aarch64_advsimd_partial_struct_mode_p (machine_mode mode
)
3670 return (aarch64_classify_vector_mode (mode
)
3671 == (VEC_ADVSIMD
| VEC_STRUCT
| VEC_PARTIAL
));
3674 /* Return true if MODE is an Advanced SIMD Q-register structure mode. */
3676 aarch64_advsimd_full_struct_mode_p (machine_mode mode
)
3678 return (aarch64_classify_vector_mode (mode
) == (VEC_ADVSIMD
| VEC_STRUCT
));
3681 /* Return true if MODE is any of the data vector modes, including
3684 aarch64_vector_data_mode_p (machine_mode mode
)
3686 return aarch64_classify_vector_mode (mode
) & VEC_ANY_DATA
;
3689 /* Return true if MODE is any form of SVE mode, including predicates,
3690 vectors and structures. */
3692 aarch64_sve_mode_p (machine_mode mode
)
3694 return aarch64_classify_vector_mode (mode
) & VEC_ANY_SVE
;
3697 /* Return true if MODE is an SVE data vector mode; either a single vector
3698 or a structure of vectors. */
3700 aarch64_sve_data_mode_p (machine_mode mode
)
3702 return aarch64_classify_vector_mode (mode
) & VEC_SVE_DATA
;
3705 /* Return the number of defined bytes in one constituent vector of
3706 SVE mode MODE, which has vector flags VEC_FLAGS. */
3708 aarch64_vl_bytes (machine_mode mode
, unsigned int vec_flags
)
3710 if (vec_flags
& VEC_PARTIAL
)
3711 /* A single partial vector. */
3712 return GET_MODE_SIZE (mode
);
3714 if (vec_flags
& VEC_SVE_DATA
)
3715 /* A single vector or a tuple. */
3716 return BYTES_PER_SVE_VECTOR
;
3718 /* A single predicate. */
3719 gcc_assert (vec_flags
& VEC_SVE_PRED
);
3720 return BYTES_PER_SVE_PRED
;
3723 /* If MODE holds an array of vectors, return the number of vectors
3724 in the array, otherwise return 1. */
3727 aarch64_ldn_stn_vectors (machine_mode mode
)
3729 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
3730 if (vec_flags
== (VEC_ADVSIMD
| VEC_PARTIAL
| VEC_STRUCT
))
3731 return exact_div (GET_MODE_SIZE (mode
), 8).to_constant ();
3732 if (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
3733 return exact_div (GET_MODE_SIZE (mode
), 16).to_constant ();
3734 if (vec_flags
== (VEC_SVE_DATA
| VEC_STRUCT
))
3735 return exact_div (GET_MODE_SIZE (mode
),
3736 BYTES_PER_SVE_VECTOR
).to_constant ();
3740 /* Given an Advanced SIMD vector mode MODE and a tuple size NELEMS, return the
3741 corresponding vector structure mode. */
3742 static opt_machine_mode
3743 aarch64_advsimd_vector_array_mode (machine_mode mode
,
3744 unsigned HOST_WIDE_INT nelems
)
3746 unsigned int flags
= VEC_ADVSIMD
| VEC_STRUCT
;
3747 if (known_eq (GET_MODE_SIZE (mode
), 8))
3748 flags
|= VEC_PARTIAL
;
3750 machine_mode struct_mode
;
3751 FOR_EACH_MODE_IN_CLASS (struct_mode
, GET_MODE_CLASS (mode
))
3752 if (aarch64_classify_vector_mode (struct_mode
) == flags
3753 && GET_MODE_INNER (struct_mode
) == GET_MODE_INNER (mode
)
3754 && known_eq (GET_MODE_NUNITS (struct_mode
),
3755 GET_MODE_NUNITS (mode
) * nelems
))
3757 return opt_machine_mode ();
3760 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
3763 aarch64_sve_data_mode (scalar_mode inner_mode
, poly_uint64 nunits
)
3765 enum mode_class mclass
= (is_a
<scalar_float_mode
> (inner_mode
)
3766 ? MODE_VECTOR_FLOAT
: MODE_VECTOR_INT
);
3768 FOR_EACH_MODE_IN_CLASS (mode
, mclass
)
3769 if (inner_mode
== GET_MODE_INNER (mode
)
3770 && known_eq (nunits
, GET_MODE_NUNITS (mode
))
3771 && aarch64_sve_data_mode_p (mode
))
3773 return opt_machine_mode ();
3776 /* Implement target hook TARGET_ARRAY_MODE. */
3777 static opt_machine_mode
3778 aarch64_array_mode (machine_mode mode
, unsigned HOST_WIDE_INT nelems
)
3780 if (aarch64_classify_vector_mode (mode
) == VEC_SVE_DATA
3781 && IN_RANGE (nelems
, 2, 4))
3782 return aarch64_sve_data_mode (GET_MODE_INNER (mode
),
3783 GET_MODE_NUNITS (mode
) * nelems
);
3784 if (aarch64_classify_vector_mode (mode
) == VEC_ADVSIMD
3785 && IN_RANGE (nelems
, 2, 4))
3786 return aarch64_advsimd_vector_array_mode (mode
, nelems
);
3788 return opt_machine_mode ();
3791 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
3793 aarch64_array_mode_supported_p (machine_mode mode
,
3794 unsigned HOST_WIDE_INT nelems
)
3797 && (AARCH64_VALID_SIMD_QREG_MODE (mode
)
3798 || AARCH64_VALID_SIMD_DREG_MODE (mode
))
3799 && (nelems
>= 2 && nelems
<= 4))
3805 /* MODE is some form of SVE vector mode. For data modes, return the number
3806 of vector register bits that each element of MODE occupies, such as 64
3807 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
3808 in a 64-bit container). For predicate modes, return the number of
3809 data bits controlled by each significant predicate bit. */
3812 aarch64_sve_container_bits (machine_mode mode
)
3814 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
3815 poly_uint64 vector_bits
= (vec_flags
& (VEC_PARTIAL
| VEC_SVE_PRED
)
3816 ? BITS_PER_SVE_VECTOR
3817 : GET_MODE_BITSIZE (mode
));
3818 return vector_element_size (vector_bits
, GET_MODE_NUNITS (mode
));
/* Return the SVE predicate mode to use for elements that have
   ELEM_NBYTES bytes, if such a mode exists.  */

opt_machine_mode
aarch64_sve_pred_mode (unsigned int elem_nbytes)
{
  if (TARGET_SVE)
    {
      if (elem_nbytes == 1)
	return VNx16BImode;
      if (elem_nbytes == 2)
	return VNx8BImode;
      if (elem_nbytes == 4)
	return VNx4BImode;
      if (elem_nbytes == 8)
	return VNx2BImode;
    }
  return opt_machine_mode ();
}

/* Return the SVE predicate mode that should be used to control
   MODE.  */

machine_mode
aarch64_sve_pred_mode (machine_mode mode)
{
  unsigned int bits = aarch64_sve_container_bits (mode);
  return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
}
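/* For example, a full VNx4SImode vector is controlled by a VNx4BImode
   predicate, whereas a partial VNx2SImode vector (32-bit elements stored in
   64-bit containers) uses VNx2BImode, since the container size rather than
   the element size determines the predicate mode.  */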
3851 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
3853 static opt_machine_mode
3854 aarch64_get_mask_mode (machine_mode mode
)
3856 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
3857 if (vec_flags
& VEC_SVE_DATA
)
3858 return aarch64_sve_pred_mode (mode
);
3860 return default_get_mask_mode (mode
);
3863 /* Return the integer element mode associated with SVE mode MODE. */
3865 static scalar_int_mode
3866 aarch64_sve_element_int_mode (machine_mode mode
)
3868 poly_uint64 vector_bits
= (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
3869 ? BITS_PER_SVE_VECTOR
3870 : GET_MODE_BITSIZE (mode
));
3871 unsigned int elt_bits
= vector_element_size (vector_bits
,
3872 GET_MODE_NUNITS (mode
));
3873 return int_mode_for_size (elt_bits
, 0).require ();
3876 /* Return an integer element mode that contains exactly
3877 aarch64_sve_container_bits (MODE) bits. This is wider than
3878 aarch64_sve_element_int_mode if MODE is a partial vector,
3879 otherwise it's the same. */
3881 static scalar_int_mode
3882 aarch64_sve_container_int_mode (machine_mode mode
)
3884 return int_mode_for_size (aarch64_sve_container_bits (mode
), 0).require ();
3887 /* Return the integer vector mode associated with SVE mode MODE.
3888 Unlike related_int_vector_mode, this can handle the case in which
3889 MODE is a predicate (and thus has a different total size). */
3892 aarch64_sve_int_mode (machine_mode mode
)
3894 scalar_int_mode int_mode
= aarch64_sve_element_int_mode (mode
);
3895 return aarch64_sve_data_mode (int_mode
, GET_MODE_NUNITS (mode
)).require ();
3898 /* Implement TARGET_VECTORIZE_RELATED_MODE. */
3900 static opt_machine_mode
3901 aarch64_vectorize_related_mode (machine_mode vector_mode
,
3902 scalar_mode element_mode
,
3905 unsigned int vec_flags
= aarch64_classify_vector_mode (vector_mode
);
3907 /* If we're operating on SVE vectors, try to return an SVE mode. */
3908 poly_uint64 sve_nunits
;
3909 if ((vec_flags
& VEC_SVE_DATA
)
3910 && multiple_p (BYTES_PER_SVE_VECTOR
,
3911 GET_MODE_SIZE (element_mode
), &sve_nunits
))
3913 machine_mode sve_mode
;
3914 if (maybe_ne (nunits
, 0U))
3916 /* Try to find a full or partial SVE mode with exactly
3918 if (multiple_p (sve_nunits
, nunits
)
3919 && aarch64_sve_data_mode (element_mode
,
3920 nunits
).exists (&sve_mode
))
3925 /* Take the preferred number of units from the number of bytes
3926 that fit in VECTOR_MODE. We always start by "autodetecting"
3927 a full vector mode with preferred_simd_mode, so vectors
3928 chosen here will also be full vector modes. Then
3929 autovectorize_vector_modes tries smaller starting modes
3930 and thus smaller preferred numbers of units. */
3931 sve_nunits
= ordered_min (sve_nunits
, GET_MODE_SIZE (vector_mode
));
3932 if (aarch64_sve_data_mode (element_mode
,
3933 sve_nunits
).exists (&sve_mode
))
3938 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
3940 && (vec_flags
& VEC_ADVSIMD
)
3941 && known_eq (nunits
, 0U)
3942 && known_eq (GET_MODE_BITSIZE (vector_mode
), 64U)
3943 && maybe_ge (GET_MODE_BITSIZE (element_mode
)
3944 * GET_MODE_NUNITS (vector_mode
), 128U))
3946 machine_mode res
= aarch64_simd_container_mode (element_mode
, 128);
3947 if (VECTOR_MODE_P (res
))
3951 return default_vectorize_related_mode (vector_mode
, element_mode
, nunits
);
3954 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
3955 prefer to use the first arithmetic operand as the else value if
3956 the else value doesn't matter, since that exactly matches the SVE
3957 destructive merging form. For ternary operations we could either
3958 pick the first operand and use FMAD-like instructions or the last
3959 operand and use FMLA-like instructions; the latter seems more
3963 aarch64_preferred_else_value (unsigned, tree
, unsigned int nops
, tree
*ops
)
3965 return nops
== 3 ? ops
[2] : ops
[0];
3968 /* Implement TARGET_HARD_REGNO_NREGS. */
3971 aarch64_hard_regno_nregs (unsigned regno
, machine_mode mode
)
3973 /* ??? Logically we should only need to provide a value when
3974 HARD_REGNO_MODE_OK says that the combination is valid,
3975 but at the moment we need to handle all modes. Just ignore
3976 any runtime parts for registers that can't store them. */
3977 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
3978 switch (aarch64_regno_regclass (regno
))
3984 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
3985 if (vec_flags
& VEC_SVE_DATA
)
3986 return exact_div (GET_MODE_SIZE (mode
),
3987 aarch64_vl_bytes (mode
, vec_flags
)).to_constant ();
3988 if (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
| VEC_PARTIAL
))
3989 return GET_MODE_SIZE (mode
).to_constant () / 8;
3990 return CEIL (lowest_size
, UNITS_PER_VREG
);
3996 case PR_AND_FFR_REGS
:
3999 return CEIL (lowest_size
, UNITS_PER_WORD
);
4004 /* Implement TARGET_HARD_REGNO_MODE_OK. */
4007 aarch64_hard_regno_mode_ok (unsigned regno
, machine_mode mode
)
4009 if (mode
== V8DImode
)
4010 return IN_RANGE (regno
, R0_REGNUM
, R23_REGNUM
)
4011 && multiple_p (regno
- R0_REGNUM
, 2);
4013 if (GET_MODE_CLASS (mode
) == MODE_CC
)
4014 return regno
== CC_REGNUM
;
4016 if (regno
== VG_REGNUM
)
4017 /* This must have the same size as _Unwind_Word. */
4018 return mode
== DImode
;
4020 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
4021 if (vec_flags
& VEC_SVE_PRED
)
4022 return pr_or_ffr_regnum_p (regno
);
4024 if (pr_or_ffr_regnum_p (regno
))
4027 if (regno
== SP_REGNUM
)
4028 /* The purpose of comparing with ptr_mode is to support the
4029 global register variable associated with the stack pointer
4030 register via the syntax of asm ("wsp") in ILP32. */
4031 return mode
== Pmode
|| mode
== ptr_mode
;
4033 if (regno
== FRAME_POINTER_REGNUM
|| regno
== ARG_POINTER_REGNUM
)
4034 return mode
== Pmode
;
4036 if (GP_REGNUM_P (regno
))
4038 if (vec_flags
& (VEC_ANY_SVE
| VEC_STRUCT
))
4040 if (known_le (GET_MODE_SIZE (mode
), 8))
4042 if (known_le (GET_MODE_SIZE (mode
), 16))
4043 return (regno
& 1) == 0;
4045 else if (FP_REGNUM_P (regno
))
4047 if (vec_flags
& VEC_STRUCT
)
4048 return end_hard_regno (mode
, regno
) - 1 <= V31_REGNUM
;
4050 return !VECTOR_MODE_P (mode
) || vec_flags
!= 0;
4056 /* Return true if a function with type FNTYPE returns its value in
4057 SVE vector or predicate registers. */
4060 aarch64_returns_value_in_sve_regs_p (const_tree fntype
)
4062 tree return_type
= TREE_TYPE (fntype
);
4064 pure_scalable_type_info pst_info
;
4065 switch (pst_info
.analyze (return_type
))
4067 case pure_scalable_type_info::IS_PST
:
4068 return (pst_info
.num_zr () <= NUM_FP_ARG_REGS
4069 && pst_info
.num_pr () <= NUM_PR_ARG_REGS
);
4071 case pure_scalable_type_info::DOESNT_MATTER
:
4072 gcc_assert (aarch64_return_in_memory_1 (return_type
));
4075 case pure_scalable_type_info::NO_ABI_IDENTITY
:
4076 case pure_scalable_type_info::ISNT_PST
:
4082 /* Return true if a function with type FNTYPE takes arguments in
4083 SVE vector or predicate registers. */
4086 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype
)
4088 CUMULATIVE_ARGS args_so_far_v
;
4089 aarch64_init_cumulative_args (&args_so_far_v
, NULL_TREE
, NULL_RTX
,
4090 NULL_TREE
, 0, true);
4091 cumulative_args_t args_so_far
= pack_cumulative_args (&args_so_far_v
);
4093 for (tree chain
= TYPE_ARG_TYPES (fntype
);
4094 chain
&& chain
!= void_list_node
;
4095 chain
= TREE_CHAIN (chain
))
4097 tree arg_type
= TREE_VALUE (chain
);
4098 if (arg_type
== error_mark_node
)
4101 function_arg_info
arg (arg_type
, /*named=*/true);
4102 apply_pass_by_reference_rules (&args_so_far_v
, arg
);
4103 pure_scalable_type_info pst_info
;
4104 if (pst_info
.analyze_registers (arg
.type
))
4106 unsigned int end_zr
= args_so_far_v
.aapcs_nvrn
+ pst_info
.num_zr ();
4107 unsigned int end_pr
= args_so_far_v
.aapcs_nprn
+ pst_info
.num_pr ();
4108 gcc_assert (end_zr
<= NUM_FP_ARG_REGS
&& end_pr
<= NUM_PR_ARG_REGS
);
4112 targetm
.calls
.function_arg_advance (args_so_far
, arg
);
/* Implement TARGET_FNTYPE_ABI.  */

static const predefined_function_abi &
aarch64_fntype_abi (const_tree fntype)
{
  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
    return aarch64_simd_abi ();

  if (aarch64_returns_value_in_sve_regs_p (fntype)
      || aarch64_takes_arguments_in_sve_regs_p (fntype))
    return aarch64_sve_abi ();

  return default_function_abi;
}

/* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P.  */

static bool
aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
{
  return (aarch64_sve::builtin_type_p (type1)
          == aarch64_sve::builtin_type_p (type2));
}
/* Return true if we should emit CFI for register REGNO.  */

static bool
aarch64_emit_cfi_for_reg_p (unsigned int regno)
{
  return (GP_REGNUM_P (regno)
          || !default_function_abi.clobbers_full_reg_p (regno));
}

/* Return the mode we should use to save and restore register REGNO.  */

static machine_mode
aarch64_reg_save_mode (unsigned int regno)
{
  if (GP_REGNUM_P (regno))
    return DImode;

  if (FP_REGNUM_P (regno))
    switch (crtl->abi->id ())
      {
      case ARM_PCS_AAPCS64:
        /* Only the low 64 bits are saved by the base PCS.  */
        return DFmode;

      case ARM_PCS_SIMD:
        /* The vector PCS saves the low 128 bits (which is the full
           register on non-SVE targets).  */
        return TFmode;

      case ARM_PCS_SVE:
        /* Use vectors of DImode for registers that need frame
           information, so that the first 64 bits of the save slot
           are always the equivalent of what storing D<n> would give.  */
        if (aarch64_emit_cfi_for_reg_p (regno))
          return VNx2DImode;

        /* Use vectors of bytes otherwise, so that the layout is
           endian-agnostic, and so that we can use LDR and STR for
           big-endian targets.  */
        return VNx16QImode;

      case ARM_PCS_TLSDESC:
      case ARM_PCS_UNKNOWN:
        break;
      }

  if (PR_REGNUM_P (regno))
    /* Save the full predicate register.  */
    return VNx16BImode;

  gcc_unreachable ();
}
/* Implement TARGET_INSN_CALLEE_ABI.  */

const predefined_function_abi &
aarch64_insn_callee_abi (const rtx_insn *insn)
{
  rtx pat = PATTERN (insn);
  gcc_assert (GET_CODE (pat) == PARALLEL);
  rtx unspec = XVECEXP (pat, 0, 1);
  gcc_assert (GET_CODE (unspec) == UNSPEC
              && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
  return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
}

/* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
   the lower 64 bits of a 128-bit register.  Tell the compiler the callee
   clobbers the top 64 bits when restoring the bottom 64 bits.  */

static bool
aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
                                        unsigned int regno,
                                        machine_mode mode)
{
  if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
    {
      poly_int64 per_register_size = GET_MODE_SIZE (mode);
      unsigned int nregs = hard_regno_nregs (regno, mode);
      if (nregs > 1)
        per_register_size = exact_div (per_register_size, nregs);
      if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
        return maybe_gt (per_register_size, 16);
      return maybe_gt (per_register_size, 8);
    }
  return false;
}
/* Implement REGMODE_NATURAL_SIZE.  */
poly_uint64
aarch64_regmode_natural_size (machine_mode mode)
{
  /* The natural size for SVE data modes is one SVE data vector,
     and similarly for predicates.  We can't independently modify
     anything smaller than that.  */
  /* ??? For now, only do this for variable-width SVE registers.
     Doing it for constant-sized registers breaks lower-subreg.cc.  */
  /* ??? And once that's fixed, we should probably have similar
     code for Advanced SIMD.  */
  if (!aarch64_sve_vg.is_constant ())
    {
      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
      if (vec_flags & VEC_SVE_PRED)
        return BYTES_PER_SVE_PRED;
      if (vec_flags & VEC_SVE_DATA)
        return BYTES_PER_SVE_VECTOR;
    }
  return UNITS_PER_WORD;
}

/* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
machine_mode
aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
                                     machine_mode mode)
{
  /* The predicate mode determines which bits are significant and
     which are "don't care".  Decreasing the number of lanes would
     lose data while increasing the number of lanes would make bits
     unnecessarily significant.  */
  if (PR_REGNUM_P (regno))
    return mode;
  if (known_ge (GET_MODE_SIZE (mode), 4))
    return mode;
  else
    return SImode;
}

/* Return true if I's bits are consecutive ones from the MSB.  */
bool
aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
{
  return exact_log2 (-i) != HOST_WIDE_INT_M1;
}
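
/* For instance (illustrative values, not from the original source):
   i = 0xffffffffffff0000 gives -i = 0x10000, a power of two, so the
   function returns true; i = 0xffffffffffff0001 gives a non-power-of-two
   -i and the function returns false.  */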
/* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
   that strcpy from constants will be faster.  */

static HOST_WIDE_INT
aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
{
  if (TREE_CODE (exp) == STRING_CST && !optimize_size)
    return MAX (align, BITS_PER_WORD);
  return align;
}

/* Return true if calls to DECL should be treated as
   long-calls (ie called via a register).  */
static bool
aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
{
  return false;
}

/* Return true if calls to symbol-ref SYM should be treated as
   long-calls (ie called via a register).  */
bool
aarch64_is_long_call_p (rtx sym)
{
  return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
}
/* Return true if calls to symbol-ref SYM should not go through
   plt stubs.  */

bool
aarch64_is_noplt_call_p (rtx sym)
{
  const_tree decl = SYMBOL_REF_DECL (sym);

  if (flag_pic
      && decl
      && (!flag_plt
          || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
      && !targetm.binds_local_p (decl))
    return true;

  return false;
}
/* Emit an insn that's a simple single-set.  Both the operands must be
   known to be valid.  */
inline static rtx_insn *
emit_set_insn (rtx x, rtx y)
{
  return emit_insn (gen_rtx_SET (x, y));
}

/* X and Y are two things to compare using CODE.  Emit the compare insn and
   return the rtx for register 0 in the proper mode.  */
rtx
aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
{
  machine_mode cmp_mode = GET_MODE (x);
  machine_mode cc_mode;
  rtx cc_reg;

  if (cmp_mode == TImode)
    {
      gcc_assert (code == NE);

      cc_mode = CCmode;
      cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);

      rtx x_lo = operand_subword (x, 0, 0, TImode);
      rtx y_lo = operand_subword (y, 0, 0, TImode);
      emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));

      rtx x_hi = operand_subword (x, 1, 0, TImode);
      rtx y_hi = operand_subword (y, 1, 0, TImode);
      emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
                               gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
                               GEN_INT (AARCH64_EQ)));
    }
  else
    {
      cc_mode = SELECT_CC_MODE (code, x, y);
      cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
      emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
    }
  return cc_reg;
}
/* Similarly, but maybe zero-extend Y if Y_MODE < SImode.  */
rtx
aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
                                  machine_mode y_mode)
{
  if (y_mode == E_QImode || y_mode == E_HImode)
    {
      if (CONST_INT_P (y))
        {
          y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
          y_mode = SImode;
        }
      else
        {
          rtx t, cc_reg;
          machine_mode cc_mode;

          t = gen_rtx_ZERO_EXTEND (SImode, y);
          t = gen_rtx_COMPARE (CC_SWPmode, t, x);
          cc_mode = CC_SWPmode;
          cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
          emit_set_insn (cc_reg, t);
          return cc_reg;
        }
    }

  if (!aarch64_plus_operand (y, y_mode))
    y = force_reg (y_mode, y);

  return aarch64_gen_compare_reg (code, x, y);
}
/* Consider the operation:

   OPERANDS[0] = CODE (OPERANDS[1], OPERANDS[2]) + OPERANDS[3]

   where:

   - CODE is [SU]MAX or [SU]MIN
   - OPERANDS[2] and OPERANDS[3] are constant integers
   - OPERANDS[3] is a positive or negative shifted 12-bit immediate
   - all operands have mode MODE

   Decide whether it is possible to implement the operation using:

   SUBS <tmp>, OPERANDS[1], -OPERANDS[3]
   or
   ADDS <tmp>, OPERANDS[1], OPERANDS[3]

   followed by:

   <insn> OPERANDS[0], <tmp>, [wx]zr, <cond>

   where <insn> is one of CSEL, CSINV or CSINC.  Return true if so.
   If GENERATE_P is true, also update OPERANDS as follows:

   OPERANDS[4] = -OPERANDS[3]
   OPERANDS[5] = the rtl condition representing <cond>
   OPERANDS[6] = <tmp>
   OPERANDS[7] = 0 for CSEL, -1 for CSINV or 1 for CSINC.  */
bool
aarch64_maxmin_plus_const (rtx_code code, rtx *operands, bool generate_p)
{
  signop sgn = (code == UMAX || code == UMIN ? UNSIGNED : SIGNED);
  rtx dst = operands[0];
  rtx maxmin_op = operands[2];
  rtx add_op = operands[3];
  machine_mode mode = GET_MODE (dst);

  /* max (x, y) - z == (x >= y + 1 ? x : y) - z
                    == (x >= y ? x : y) - z
                    == (x > y ? x : y) - z
                    == (x > y - 1 ? x : y) - z

     min (x, y) - z == (x <= y - 1 ? x : y) - z
                    == (x <= y ? x : y) - z
                    == (x < y ? x : y) - z
                    == (x < y + 1 ? x : y) - z

     Check whether z is in { y - 1, y, y + 1 } and pick the form(s) for
     which x is compared with z.  Set DIFF to y - z.  Thus the supported
     combinations are as follows, with DIFF being the value after the ":":

     max (x, y) - z == x >= y + 1 ? x - (y + 1) : -1   [z == y + 1]
                    == x >= y ? x - y : 0              [z == y]
                    == x > y ? x - y : 0               [z == y]
                    == x > y - 1 ? x - (y - 1) : 1     [z == y - 1]

     min (x, y) - z == x <= y - 1 ? x - (y - 1) : 1    [z == y - 1]
                    == x <= y ? x - y : 0              [z == y]
                    == x < y ? x - y : 0               [z == y]
                    == x < y + 1 ? x - (y + 1) : -1    [z == y + 1].  */
  auto maxmin_val = rtx_mode_t (maxmin_op, mode);
  auto add_val = rtx_mode_t (add_op, mode);
  auto sub_val = wi::neg (add_val);
  auto diff = wi::sub (maxmin_val, sub_val);
  if (!(diff == 0
        || (diff == 1 && wi::gt_p (maxmin_val, sub_val, sgn))
        || (diff == -1 && wi::lt_p (maxmin_val, sub_val, sgn))))
    return false;

  if (!generate_p)
    return true;

  rtx_code cmp;
  switch (code)
    {
    case SMAX:
      cmp = diff == 1 ? GT : GE;
      break;
    case UMAX:
      cmp = diff == 1 ? GTU : GEU;
      break;
    case SMIN:
      cmp = diff == -1 ? LT : LE;
      break;
    case UMIN:
      cmp = diff == -1 ? LTU : LEU;
      break;
    default:
      gcc_unreachable ();
    }
  rtx cc = gen_rtx_REG (CCmode, CC_REGNUM);

  operands[4] = immed_wide_int_const (sub_val, mode);
  operands[5] = gen_rtx_fmt_ee (cmp, VOIDmode, cc, const0_rtx);
  if (can_create_pseudo_p ())
    operands[6] = gen_reg_rtx (mode);
  else
    operands[6] = dst;
  operands[7] = immed_wide_int_const (diff, mode);

  return true;
}
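
/* A worked example of the transformation above (illustrative only):
   for SMAX (x, 7) + (-8) we have maxmin_val = 7, sub_val = 8 and
   diff = -1, so the sequence becomes

     subs  <tmp>, x, #8
     csinv <dst>, <tmp>, xzr, ge

   i.e. x - 8 when x >= 8 and -1 (= 7 - 8) otherwise.  */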
/* Build the SYMBOL_REF for __tls_get_addr.  */

static GTY(()) rtx tls_get_addr_libfunc;

rtx
aarch64_tls_get_addr (void)
{
  if (!tls_get_addr_libfunc)
    tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
  return tls_get_addr_libfunc;
}

/* Return the TLS model to use for ADDR.  */

static enum tls_model
tls_symbolic_operand_type (rtx addr)
{
  enum tls_model tls_kind = TLS_MODEL_NONE;
  poly_int64 offset;
  addr = strip_offset_and_salt (addr, &offset);
  if (SYMBOL_REF_P (addr))
    tls_kind = SYMBOL_REF_TLS_MODEL (addr);

  return tls_kind;
}
/* We'll allow lo_sum's in addresses in our legitimate addresses
   so that combine would take care of combining addresses where
   necessary, but for generation purposes, we'll generate the address
   as :
   RTL                               Absolute
   tmp = hi (symbol_ref);            adrp  x1, foo
   dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo

   PIC                               TLS
   adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
   ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
                                     bl   __tls_get_addr
                                     nop

   Load TLS symbol, depending on TLS mechanism and TLS access model.

   Global Dynamic - Traditional TLS:
   adrp tmp, :tlsgd:imm
   add  dest, tmp, #:tlsgd_lo12:imm
   bl   __tls_get_addr

   Global Dynamic - TLS Descriptors:
   adrp dest, :tlsdesc:imm
   ldr  tmp, [dest, #:tlsdesc_lo12:imm]
   add  dest, dest, #:tlsdesc_lo12:imm
   blr  tmp
   mrs  tp, tpidr_el0
   add  dest, dest, tp

   Initial Exec:
   mrs  tp, tpidr_el0
   adrp tmp, :gottprel:imm
   ldr  dest, [tmp, #:gottprel_lo12:imm]
   add  dest, dest, tp

   Local Exec:
   mrs  tp, tpidr_el0
   add  t0, tp, #:tprel_hi12:imm, lsl #12
   add  t0, t0, #:tprel_lo12_nc:imm
*/
4569 aarch64_load_symref_appropriately (rtx dest
, rtx imm
,
4570 enum aarch64_symbol_type type
)
4574 case SYMBOL_SMALL_ABSOLUTE
:
4576 /* In ILP32, the mode of dest can be either SImode or DImode. */
4578 machine_mode mode
= GET_MODE (dest
);
4580 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
4582 if (can_create_pseudo_p ())
4583 tmp_reg
= gen_reg_rtx (mode
);
4585 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, copy_rtx (imm
)));
4586 emit_insn (gen_add_losym (dest
, tmp_reg
, imm
));
4590 case SYMBOL_TINY_ABSOLUTE
:
4591 emit_insn (gen_rtx_SET (dest
, imm
));
4594 case SYMBOL_SMALL_GOT_28K
:
4596 machine_mode mode
= GET_MODE (dest
);
4597 rtx gp_rtx
= pic_offset_table_rtx
;
4601 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
4602 here before rtl expand. Tree IVOPT will generate rtl pattern to
4603 decide rtx costs, in which case pic_offset_table_rtx is not
4604 initialized. For that case no need to generate the first adrp
4605 instruction as the final cost for global variable access is
4609 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
4610 using the page base as GOT base, the first page may be wasted,
4611 in the worst scenario, there is only 28K space for GOT).
4613 The generate instruction sequence for accessing global variable
4616 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
4618 Only one instruction needed. But we must initialize
4619 pic_offset_table_rtx properly. We generate initialize insn for
4620 every global access, and allow CSE to remove all redundant.
4622 The final instruction sequences will look like the following
4623 for multiply global variables access.
4625 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
4627 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
4628 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
4629 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
4632 rtx s
= gen_rtx_SYMBOL_REF (Pmode
, "_GLOBAL_OFFSET_TABLE_");
4633 crtl
->uses_pic_offset_table
= 1;
4634 emit_move_insn (gp_rtx
, gen_rtx_HIGH (Pmode
, s
));
4636 if (mode
!= GET_MODE (gp_rtx
))
4637 gp_rtx
= gen_lowpart (mode
, gp_rtx
);
4641 if (mode
== ptr_mode
)
4644 insn
= gen_ldr_got_small_28k_di (dest
, gp_rtx
, imm
);
4646 insn
= gen_ldr_got_small_28k_si (dest
, gp_rtx
, imm
);
4648 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
4652 gcc_assert (mode
== Pmode
);
4654 insn
= gen_ldr_got_small_28k_sidi (dest
, gp_rtx
, imm
);
4655 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
4658 /* The operand is expected to be MEM. Whenever the related insn
4659 pattern changed, above code which calculate mem should be
4661 gcc_assert (MEM_P (mem
));
4662 MEM_READONLY_P (mem
) = 1;
4663 MEM_NOTRAP_P (mem
) = 1;
4668 case SYMBOL_SMALL_GOT_4G
:
4669 emit_insn (gen_rtx_SET (dest
, imm
));
4672 case SYMBOL_SMALL_TLSGD
:
4675 /* The return type of __tls_get_addr is the C pointer type
4677 rtx result
= gen_rtx_REG (ptr_mode
, R0_REGNUM
);
4680 if (GET_MODE (dest
) != ptr_mode
)
4681 tmp_reg
= can_create_pseudo_p () ? gen_reg_rtx (ptr_mode
) : result
;
4684 if (ptr_mode
== SImode
)
4685 aarch64_emit_call_insn (gen_tlsgd_small_si (result
, imm
));
4687 aarch64_emit_call_insn (gen_tlsgd_small_di (result
, imm
));
4688 insns
= get_insns ();
4691 RTL_CONST_CALL_P (insns
) = 1;
4692 emit_libcall_block (insns
, tmp_reg
, result
, imm
);
4693 /* Convert back to the mode of the dest adding a zero_extend
4694 from SImode (ptr_mode) to DImode (Pmode). */
4695 if (dest
!= tmp_reg
)
4696 convert_move (dest
, tmp_reg
, true);
4700 case SYMBOL_SMALL_TLSDESC
:
4702 machine_mode mode
= GET_MODE (dest
);
4703 rtx x0
= gen_rtx_REG (mode
, R0_REGNUM
);
4706 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
4708 /* In ILP32, the got entry is always of SImode size. Unlike
4709 small GOT, the dest is fixed at reg 0. */
4711 emit_insn (gen_tlsdesc_small_si (imm
));
4713 emit_insn (gen_tlsdesc_small_di (imm
));
4714 tp
= aarch64_load_tp (NULL
);
4717 tp
= gen_lowpart (mode
, tp
);
4719 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, x0
)));
4721 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
4725 case SYMBOL_SMALL_TLSIE
:
4727 /* In ILP32, the mode of dest can be either SImode or DImode,
4728 while the got entry is always of SImode size. The mode of
4729 dest depends on how dest is used: if dest is assigned to a
4730 pointer (e.g. in the memory), it has SImode; it may have
4731 DImode if dest is dereferenced to access the memeory.
4732 This is why we have to handle three different tlsie_small
4733 patterns here (two patterns for ILP32). */
4734 machine_mode mode
= GET_MODE (dest
);
4735 rtx tmp_reg
= gen_reg_rtx (mode
);
4736 rtx tp
= aarch64_load_tp (NULL
);
4738 if (mode
== ptr_mode
)
4741 emit_insn (gen_tlsie_small_di (tmp_reg
, imm
));
4744 emit_insn (gen_tlsie_small_si (tmp_reg
, imm
));
4745 tp
= gen_lowpart (mode
, tp
);
4750 gcc_assert (mode
== Pmode
);
4751 emit_insn (gen_tlsie_small_sidi (tmp_reg
, imm
));
4754 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, tmp_reg
)));
4756 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
4760 case SYMBOL_TLSLE12
:
4761 case SYMBOL_TLSLE24
:
4762 case SYMBOL_TLSLE32
:
4763 case SYMBOL_TLSLE48
:
4765 machine_mode mode
= GET_MODE (dest
);
4766 rtx tp
= aarch64_load_tp (NULL
);
4769 tp
= gen_lowpart (mode
, tp
);
4773 case SYMBOL_TLSLE12
:
4774 emit_insn ((mode
== DImode
? gen_tlsle12_di
: gen_tlsle12_si
)
4777 case SYMBOL_TLSLE24
:
4778 emit_insn ((mode
== DImode
? gen_tlsle24_di
: gen_tlsle24_si
)
4781 case SYMBOL_TLSLE32
:
4782 emit_insn ((mode
== DImode
? gen_tlsle32_di
: gen_tlsle32_si
)
4784 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
4787 case SYMBOL_TLSLE48
:
4788 emit_insn ((mode
== DImode
? gen_tlsle48_di
: gen_tlsle48_si
)
4790 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
4798 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
4802 case SYMBOL_TINY_GOT
:
4805 machine_mode mode
= GET_MODE (dest
);
4807 if (mode
== ptr_mode
)
4808 insn
= gen_ldr_got_tiny (mode
, dest
, imm
);
4811 gcc_assert (mode
== Pmode
);
4812 insn
= gen_ldr_got_tiny_sidi (dest
, imm
);
4819 case SYMBOL_TINY_TLSIE
:
4821 machine_mode mode
= GET_MODE (dest
);
4822 rtx tp
= aarch64_load_tp (NULL
);
4824 if (mode
== ptr_mode
)
4827 emit_insn (gen_tlsie_tiny_di (dest
, imm
, tp
));
4830 tp
= gen_lowpart (mode
, tp
);
4831 emit_insn (gen_tlsie_tiny_si (dest
, imm
, tp
));
4836 gcc_assert (mode
== Pmode
);
4837 emit_insn (gen_tlsie_tiny_sidi (dest
, imm
, tp
));
4841 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
/* Emit a move from SRC to DEST.  Assume that the move expanders can
   handle all moves if !can_create_pseudo_p ().  The distinction is
   important because, unlike emit_move_insn, the move expanders know
   how to force Pmode objects into the constant pool even when the
   constant pool address is not itself legitimate.  */
static rtx_insn *
aarch64_emit_move (rtx dest, rtx src)
{
  return (can_create_pseudo_p ()
          ? emit_move_insn (dest, src)
          : emit_move_insn_1 (dest, src));
}

/* Apply UNOPTAB to OP and store the result in DEST.  */

static void
aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
{
  rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
  if (dest != tmp)
    emit_move_insn (dest, tmp);
}

/* Apply BINOPTAB to OP0 and OP1 and store the result in DEST.  */

static void
aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
{
  rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
                          OPTAB_WIDEN);
  if (dest != tmp)
    emit_move_insn (dest, tmp);
}
4884 /* Split a 128-bit move operation into two 64-bit move operations,
4885 taking care to handle partial overlap of register to register
4886 copies. Special cases are needed when moving between GP regs and
4887 FP regs. SRC can be a register, constant or memory; DST a register
4888 or memory. If either operand is memory it must not have any side
4891 aarch64_split_128bit_move (rtx dst
, rtx src
)
4896 machine_mode mode
= GET_MODE (dst
);
4898 gcc_assert (mode
== TImode
|| mode
== TFmode
|| mode
== TDmode
);
4899 gcc_assert (!(side_effects_p (src
) || side_effects_p (dst
)));
4900 gcc_assert (mode
== GET_MODE (src
) || GET_MODE (src
) == VOIDmode
);
4902 if (REG_P (dst
) && REG_P (src
))
4904 int src_regno
= REGNO (src
);
4905 int dst_regno
= REGNO (dst
);
4907 /* Handle FP <-> GP regs. */
4908 if (FP_REGNUM_P (dst_regno
) && GP_REGNUM_P (src_regno
))
4910 src_lo
= gen_lowpart (word_mode
, src
);
4911 src_hi
= gen_highpart (word_mode
, src
);
4913 emit_insn (gen_aarch64_movlow_di (mode
, dst
, src_lo
));
4914 emit_insn (gen_aarch64_movhigh_di (mode
, dst
, src_hi
));
4917 else if (GP_REGNUM_P (dst_regno
) && FP_REGNUM_P (src_regno
))
4919 dst_lo
= gen_lowpart (word_mode
, dst
);
4920 dst_hi
= gen_highpart (word_mode
, dst
);
4922 emit_insn (gen_aarch64_movdi_low (mode
, dst_lo
, src
));
4923 emit_insn (gen_aarch64_movdi_high (mode
, dst_hi
, src
));
4928 dst_lo
= gen_lowpart (word_mode
, dst
);
4929 dst_hi
= gen_highpart (word_mode
, dst
);
4930 src_lo
= gen_lowpart (word_mode
, src
);
4931 src_hi
= gen_highpart_mode (word_mode
, mode
, src
);
4933 /* At most one pairing may overlap. */
4934 if (reg_overlap_mentioned_p (dst_lo
, src_hi
))
4936 aarch64_emit_move (dst_hi
, src_hi
);
4937 aarch64_emit_move (dst_lo
, src_lo
);
4941 aarch64_emit_move (dst_lo
, src_lo
);
4942 aarch64_emit_move (dst_hi
, src_hi
);
/* Return true if we should split a move from 128-bit value SRC
   to 128-bit register DEST.  */

static bool
aarch64_split_128bit_move_p (rtx dst, rtx src)
{
  if (FP_REGNUM_P (REGNO (dst)))
    return REG_P (src) && !FP_REGNUM_P (REGNO (src));
  /* All moves to GPRs need to be split.  */
  return true;
}
/* Split a complex SIMD move.  */

void
aarch64_split_simd_move (rtx dst, rtx src)
{
  machine_mode src_mode = GET_MODE (src);
  machine_mode dst_mode = GET_MODE (dst);

  gcc_assert (VECTOR_MODE_P (dst_mode));

  if (REG_P (dst) && REG_P (src))
    {
      gcc_assert (VECTOR_MODE_P (src_mode));
      emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
    }
}
4976 aarch64_zero_extend_const_eq (machine_mode xmode
, rtx x
,
4977 machine_mode ymode
, rtx y
)
4979 rtx r
= simplify_const_unary_operation (ZERO_EXTEND
, xmode
, y
, ymode
);
4980 gcc_assert (r
!= NULL
);
4981 return rtx_equal_p (x
, r
);
4984 /* Return TARGET if it is nonnull and a register of mode MODE.
4985 Otherwise, return a fresh register of mode MODE if we can,
4986 or TARGET reinterpreted as MODE if we can't. */
4989 aarch64_target_reg (rtx target
, machine_mode mode
)
4991 if (target
&& REG_P (target
) && GET_MODE (target
) == mode
)
4993 if (!can_create_pseudo_p ())
4995 gcc_assert (target
);
4996 return gen_lowpart (mode
, target
);
4998 return gen_reg_rtx (mode
);
5001 /* Return a register that contains the constant in BUILDER, given that
5002 the constant is a legitimate move operand. Use TARGET as the register
5003 if it is nonnull and convenient. */
5006 aarch64_emit_set_immediate (rtx target
, rtx_vector_builder
&builder
)
5008 rtx src
= builder
.build ();
5009 target
= aarch64_target_reg (target
, GET_MODE (src
));
5010 emit_insn (gen_rtx_SET (target
, src
));
5015 aarch64_force_temporary (machine_mode mode
, rtx x
, rtx value
)
5017 if (can_create_pseudo_p ())
5018 return force_reg (mode
, value
);
5022 aarch64_emit_move (x
, value
);
5027 /* Return true if predicate value X is a constant in which every element
5028 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
5029 value, i.e. as a predicate in which all bits are significant. */
5032 aarch64_get_sve_pred_bits (rtx_vector_builder
&builder
, rtx x
)
5034 if (!CONST_VECTOR_P (x
))
5037 unsigned int factor
= vector_element_size (GET_MODE_NUNITS (VNx16BImode
),
5038 GET_MODE_NUNITS (GET_MODE (x
)));
5039 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (x
) * factor
;
5040 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (x
);
5041 builder
.new_vector (VNx16BImode
, npatterns
, nelts_per_pattern
);
5043 unsigned int nelts
= const_vector_encoded_nelts (x
);
5044 for (unsigned int i
= 0; i
< nelts
; ++i
)
5046 rtx elt
= CONST_VECTOR_ENCODED_ELT (x
, i
);
5047 if (!CONST_INT_P (elt
))
5050 builder
.quick_push (elt
);
5051 for (unsigned int j
= 1; j
< factor
; ++j
)
5052 builder
.quick_push (const0_rtx
);
5054 builder
.finalize ();
5058 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
5059 widest predicate element size it can have (that is, the largest size
5060 for which each element would still be 0 or 1). */
5063 aarch64_widest_sve_pred_elt_size (rtx_vector_builder
&builder
)
5065 /* Start with the most optimistic assumption: that we only need
5066 one bit per pattern. This is what we will use if only the first
5067 bit in each pattern is ever set. */
5068 unsigned int mask
= GET_MODE_SIZE (DImode
);
5069 mask
|= builder
.npatterns ();
5071 /* Look for set bits. */
5072 unsigned int nelts
= builder
.encoded_nelts ();
5073 for (unsigned int i
= 1; i
< nelts
; ++i
)
5074 if (INTVAL (builder
.elt (i
)) != 0)
5080 return mask
& -mask
;
5083 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
5084 return that predicate mode, otherwise return opt_machine_mode (). */
5087 aarch64_ptrue_all_mode (rtx x
)
5089 gcc_assert (GET_MODE (x
) == VNx16BImode
);
5090 if (!CONST_VECTOR_P (x
)
5091 || !CONST_VECTOR_DUPLICATE_P (x
)
5092 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x
, 0))
5093 || INTVAL (CONST_VECTOR_ENCODED_ELT (x
, 0)) == 0)
5094 return opt_machine_mode ();
5096 unsigned int nelts
= const_vector_encoded_nelts (x
);
5097 for (unsigned int i
= 1; i
< nelts
; ++i
)
5098 if (CONST_VECTOR_ENCODED_ELT (x
, i
) != const0_rtx
)
5099 return opt_machine_mode ();
5101 return aarch64_sve_pred_mode (nelts
);
5104 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
5105 that the constant would have with predicate element size ELT_SIZE
5106 (ignoring the upper bits in each element) and return:
5108 * -1 if all bits are set
5109 * N if the predicate has N leading set bits followed by all clear bits
5110 * 0 if the predicate does not have any of these forms. */
5113 aarch64_partial_ptrue_length (rtx_vector_builder
&builder
,
5114 unsigned int elt_size
)
5116 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
5117 followed by set bits. */
5118 if (builder
.nelts_per_pattern () == 3)
5121 /* Skip over leading set bits. */
5122 unsigned int nelts
= builder
.encoded_nelts ();
5124 for (; i
< nelts
; i
+= elt_size
)
5125 if (INTVAL (builder
.elt (i
)) == 0)
5127 unsigned int vl
= i
/ elt_size
;
5129 /* Check for the all-true case. */
5133 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
5134 repeating pattern of set bits followed by clear bits. */
5135 if (builder
.nelts_per_pattern () != 2)
5138 /* We have a "foreground" value and a duplicated "background" value.
5139 If the background might repeat and the last set bit belongs to it,
5140 we might have set bits followed by clear bits followed by set bits. */
5141 if (i
> builder
.npatterns () && maybe_ne (nelts
, builder
.full_nelts ()))
5144 /* Make sure that the rest are all clear. */
5145 for (; i
< nelts
; i
+= elt_size
)
5146 if (INTVAL (builder
.elt (i
)) != 0)
5152 /* See if there is an svpattern that encodes an SVE predicate of mode
5153 PRED_MODE in which the first VL bits are set and the rest are clear.
5154 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
5155 A VL of -1 indicates an all-true vector. */
5158 aarch64_svpattern_for_vl (machine_mode pred_mode
, int vl
)
5161 return AARCH64_SV_ALL
;
5163 if (maybe_gt (vl
, GET_MODE_NUNITS (pred_mode
)))
5164 return AARCH64_NUM_SVPATTERNS
;
5166 if (vl
>= 1 && vl
<= 8)
5167 return aarch64_svpattern (AARCH64_SV_VL1
+ (vl
- 1));
5169 if (vl
>= 16 && vl
<= 256 && pow2p_hwi (vl
))
5170 return aarch64_svpattern (AARCH64_SV_VL16
+ (exact_log2 (vl
) - 4));
5173 if (GET_MODE_NUNITS (pred_mode
).is_constant (&max_vl
))
5175 if (vl
== (max_vl
/ 3) * 3)
5176 return AARCH64_SV_MUL3
;
5177 /* These would only trigger for non-power-of-2 lengths. */
5178 if (vl
== (max_vl
& -4))
5179 return AARCH64_SV_MUL4
;
5180 if (vl
== (1 << floor_log2 (max_vl
)))
5181 return AARCH64_SV_POW2
;
5183 return AARCH64_SV_ALL
;
5185 return AARCH64_NUM_SVPATTERNS
;
5188 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
5189 bits has the lowest bit set and the upper bits clear. This is the
5190 VNx16BImode equivalent of a PTRUE for controlling elements of
5191 ELT_SIZE bytes. However, because the constant is VNx16BImode,
5192 all bits are significant, even the upper zeros. */
5195 aarch64_ptrue_all (unsigned int elt_size
)
5197 rtx_vector_builder
builder (VNx16BImode
, elt_size
, 1);
5198 builder
.quick_push (const1_rtx
);
5199 for (unsigned int i
= 1; i
< elt_size
; ++i
)
5200 builder
.quick_push (const0_rtx
);
5201 return builder
.build ();
/* Return an all-true predicate register of mode MODE.  */

rtx
aarch64_ptrue_reg (machine_mode mode)
{
  gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
  rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
  return gen_lowpart (mode, reg);
}

/* Return an all-false predicate register of mode MODE.  */

rtx
aarch64_pfalse_reg (machine_mode mode)
{
  gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
  rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
  return gen_lowpart (mode, reg);
}
5224 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
5225 for it. PRED2[0] is the predicate for the instruction whose result
5226 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
5227 for it. Return true if we can prove that the two predicates are
5228 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
5229 with PRED1[0] without changing behavior. */
5232 aarch64_sve_same_pred_for_ptest_p (rtx
*pred1
, rtx
*pred2
)
5234 machine_mode mode
= GET_MODE (pred1
[0]);
5235 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
5236 && mode
== GET_MODE (pred2
[0])
5237 && aarch64_sve_ptrue_flag (pred1
[1], SImode
)
5238 && aarch64_sve_ptrue_flag (pred2
[1], SImode
));
5240 bool ptrue1_p
= (pred1
[0] == CONSTM1_RTX (mode
)
5241 || INTVAL (pred1
[1]) == SVE_KNOWN_PTRUE
);
5242 bool ptrue2_p
= (pred2
[0] == CONSTM1_RTX (mode
)
5243 || INTVAL (pred2
[1]) == SVE_KNOWN_PTRUE
);
5244 return (ptrue1_p
&& ptrue2_p
) || rtx_equal_p (pred1
[0], pred2
[0]);
5247 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
5248 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
5249 Use TARGET as the target register if nonnull and convenient. */
5252 aarch64_sve_emit_int_cmp (rtx target
, machine_mode pred_mode
, rtx_code cmp
,
5253 machine_mode data_mode
, rtx op1
, rtx op2
)
5255 insn_code icode
= code_for_aarch64_pred_cmp (cmp
, data_mode
);
5256 expand_operand ops
[5];
5257 create_output_operand (&ops
[0], target
, pred_mode
);
5258 create_input_operand (&ops
[1], CONSTM1_RTX (pred_mode
), pred_mode
);
5259 create_integer_operand (&ops
[2], SVE_KNOWN_PTRUE
);
5260 create_input_operand (&ops
[3], op1
, data_mode
);
5261 create_input_operand (&ops
[4], op2
, data_mode
);
5262 expand_insn (icode
, 5, ops
);
5263 return ops
[0].value
;
5266 /* Use a comparison to convert integer vector SRC into MODE, which is
5267 the corresponding SVE predicate mode. Use TARGET for the result
5268 if it's nonnull and convenient. */
5271 aarch64_convert_sve_data_to_pred (rtx target
, machine_mode mode
, rtx src
)
5273 machine_mode src_mode
= GET_MODE (src
);
5274 return aarch64_sve_emit_int_cmp (target
, mode
, NE
, src_mode
,
5275 src
, CONST0_RTX (src_mode
));
5278 /* Return the assembly token for svprfop value PRFOP. */
5281 svprfop_token (enum aarch64_svprfop prfop
)
5285 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
5286 AARCH64_FOR_SVPRFOP (CASE
)
5288 case AARCH64_NUM_SVPRFOPS
:
5294 /* Return the assembly string for an SVE prefetch operation with
5295 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
5296 and that SUFFIX is the format for the remaining operands. */
5299 aarch64_output_sve_prefetch (const char *mnemonic
, rtx prfop_rtx
,
5302 static char buffer
[128];
5303 aarch64_svprfop prfop
= (aarch64_svprfop
) INTVAL (prfop_rtx
);
5304 unsigned int written
= snprintf (buffer
, sizeof (buffer
), "%s\t%s, %s",
5305 mnemonic
, svprfop_token (prfop
), suffix
);
5306 gcc_assert (written
< sizeof (buffer
));
5310 /* Check whether we can calculate the number of elements in PATTERN
5311 at compile time, given that there are NELTS_PER_VQ elements per
5312 128-bit block. Return the value if so, otherwise return -1. */
5315 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern
, unsigned int nelts_per_vq
)
5317 unsigned int vl
, const_vg
;
5318 if (pattern
>= AARCH64_SV_VL1
&& pattern
<= AARCH64_SV_VL8
)
5319 vl
= 1 + (pattern
- AARCH64_SV_VL1
);
5320 else if (pattern
>= AARCH64_SV_VL16
&& pattern
<= AARCH64_SV_VL256
)
5321 vl
= 16 << (pattern
- AARCH64_SV_VL16
);
5322 else if (aarch64_sve_vg
.is_constant (&const_vg
))
5324 /* There are two vector granules per quadword. */
5325 unsigned int nelts
= (const_vg
/ 2) * nelts_per_vq
;
5328 case AARCH64_SV_POW2
: return 1 << floor_log2 (nelts
);
5329 case AARCH64_SV_MUL4
: return nelts
& -4;
5330 case AARCH64_SV_MUL3
: return (nelts
/ 3) * 3;
5331 case AARCH64_SV_ALL
: return nelts
;
5332 default: gcc_unreachable ();
5338 /* There are two vector granules per quadword. */
5339 poly_uint64 nelts_all
= exact_div (aarch64_sve_vg
, 2) * nelts_per_vq
;
5340 if (known_le (vl
, nelts_all
))
5343 /* Requesting more elements than are available results in a PFALSE. */
5344 if (known_gt (vl
, nelts_all
))
5350 /* Return true if we can move VALUE into a register using a single
5351 CNT[BHWD] instruction. */
5354 aarch64_sve_cnt_immediate_p (poly_int64 value
)
5356 HOST_WIDE_INT factor
= value
.coeffs
[0];
5357 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
5358 return (value
.coeffs
[1] == factor
5359 && IN_RANGE (factor
, 2, 16 * 16)
5360 && (factor
& 1) == 0
5361 && factor
<= 16 * (factor
& -factor
));
5364 /* Likewise for rtx X. */
5367 aarch64_sve_cnt_immediate_p (rtx x
)
5370 return poly_int_rtx_p (x
, &value
) && aarch64_sve_cnt_immediate_p (value
);
5373 /* Return the asm string for an instruction with a CNT-like vector size
5374 operand (a vector pattern followed by a multiplier in the range [1, 16]).
5375 PREFIX is the mnemonic without the size suffix and OPERANDS is the
5376 first part of the operands template (the part that comes before the
5377 vector size itself). PATTERN is the pattern to use. FACTOR is the
5378 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
5379 in each quadword. If it is zero, we can use any element size. */
5382 aarch64_output_sve_cnt_immediate (const char *prefix
, const char *operands
,
5383 aarch64_svpattern pattern
,
5384 unsigned int factor
,
5385 unsigned int nelts_per_vq
)
5387 static char buffer
[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
5389 if (nelts_per_vq
== 0)
5390 /* There is some overlap in the ranges of the four CNT instructions.
5391 Here we always use the smallest possible element size, so that the
5392 multiplier is 1 whereever possible. */
5393 nelts_per_vq
= factor
& -factor
;
5394 int shift
= std::min (exact_log2 (nelts_per_vq
), 4);
5395 gcc_assert (IN_RANGE (shift
, 1, 4));
5396 char suffix
= "dwhb"[shift
- 1];
5399 unsigned int written
;
5400 if (pattern
== AARCH64_SV_ALL
&& factor
== 1)
5401 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s",
5402 prefix
, suffix
, operands
);
5403 else if (factor
== 1)
5404 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s, %s",
5405 prefix
, suffix
, operands
, svpattern_token (pattern
));
5407 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s, %s, mul #%d",
5408 prefix
, suffix
, operands
, svpattern_token (pattern
),
5410 gcc_assert (written
< sizeof (buffer
));
5414 /* Return the asm string for an instruction with a CNT-like vector size
5415 operand (a vector pattern followed by a multiplier in the range [1, 16]).
5416 PREFIX is the mnemonic without the size suffix and OPERANDS is the
5417 first part of the operands template (the part that comes before the
5418 vector size itself). X is the value of the vector size operand,
5419 as a polynomial integer rtx; we need to convert this into an "all"
5420 pattern with a multiplier. */
5423 aarch64_output_sve_cnt_immediate (const char *prefix
, const char *operands
,
5426 poly_int64 value
= rtx_to_poly_int64 (x
);
5427 gcc_assert (aarch64_sve_cnt_immediate_p (value
));
5428 return aarch64_output_sve_cnt_immediate (prefix
, operands
, AARCH64_SV_ALL
,
5429 value
.coeffs
[1], 0);
5432 /* Return the asm string for an instruction with a CNT-like vector size
5433 operand (a vector pattern followed by a multiplier in the range [1, 16]).
5434 PREFIX is the mnemonic without the size suffix and OPERANDS is the
5435 first part of the operands template (the part that comes before the
5436 vector size itself). CNT_PAT[0..2] are the operands of the
5437 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
5440 aarch64_output_sve_cnt_pat_immediate (const char *prefix
,
5441 const char *operands
, rtx
*cnt_pat
)
5443 aarch64_svpattern pattern
= (aarch64_svpattern
) INTVAL (cnt_pat
[0]);
5444 unsigned int nelts_per_vq
= INTVAL (cnt_pat
[1]);
5445 unsigned int factor
= INTVAL (cnt_pat
[2]) * nelts_per_vq
;
5446 return aarch64_output_sve_cnt_immediate (prefix
, operands
, pattern
,
5447 factor
, nelts_per_vq
);
5450 /* Return true if we can add X using a single SVE INC or DEC instruction. */
5453 aarch64_sve_scalar_inc_dec_immediate_p (rtx x
)
5456 return (poly_int_rtx_p (x
, &value
)
5457 && (aarch64_sve_cnt_immediate_p (value
)
5458 || aarch64_sve_cnt_immediate_p (-value
)));
5461 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
5465 aarch64_output_sve_scalar_inc_dec (rtx offset
)
5467 poly_int64 offset_value
= rtx_to_poly_int64 (offset
);
5468 gcc_assert (offset_value
.coeffs
[0] == offset_value
.coeffs
[1]);
5469 if (offset_value
.coeffs
[1] > 0)
5470 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL
,
5471 offset_value
.coeffs
[1], 0);
5473 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL
,
5474 -offset_value
.coeffs
[1], 0);
5477 /* Return true if we can add VALUE to a register using a single ADDVL
5478 or ADDPL instruction. */
5481 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value
)
5483 HOST_WIDE_INT factor
= value
.coeffs
[0];
5484 if (factor
== 0 || value
.coeffs
[1] != factor
)
5486 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
5487 and a value of 16 is one vector width. */
5488 return (((factor
& 15) == 0 && IN_RANGE (factor
, -32 * 16, 31 * 16))
5489 || ((factor
& 1) == 0 && IN_RANGE (factor
, -32 * 2, 31 * 2)));
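
/* Illustrative examples: a factor of 48 (three vector lengths) can be added
   with "addvl x0, x1, #3", while a factor of 6 (three predicate lengths)
   needs "addpl x0, x1, #3".  */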
5492 /* Likewise for rtx X. */
5495 aarch64_sve_addvl_addpl_immediate_p (rtx x
)
5498 return (poly_int_rtx_p (x
, &value
)
5499 && aarch64_sve_addvl_addpl_immediate_p (value
));
5502 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
5503 to operand 1 and storing the result in operand 0. */
5506 aarch64_output_sve_addvl_addpl (rtx offset
)
5508 static char buffer
[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
5509 poly_int64 offset_value
= rtx_to_poly_int64 (offset
);
5510 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value
));
5512 int factor
= offset_value
.coeffs
[1];
5513 if ((factor
& 15) == 0)
5514 snprintf (buffer
, sizeof (buffer
), "addvl\t%%x0, %%x1, #%d", factor
/ 16);
5516 snprintf (buffer
, sizeof (buffer
), "addpl\t%%x0, %%x1, #%d", factor
/ 2);
5520 /* Return true if X is a valid immediate for an SVE vector INC or DEC
5521 instruction. If it is, store the number of elements in each vector
5522 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
5523 factor in *FACTOR_OUT (if nonnull). */
5526 aarch64_sve_vector_inc_dec_immediate_p (rtx x
, int *factor_out
,
5527 unsigned int *nelts_per_vq_out
)
5532 if (!const_vec_duplicate_p (x
, &elt
)
5533 || !poly_int_rtx_p (elt
, &value
))
5536 unsigned int nelts_per_vq
= 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x
));
5537 if (nelts_per_vq
!= 8 && nelts_per_vq
!= 4 && nelts_per_vq
!= 2)
5538 /* There's no vector INCB. */
5541 HOST_WIDE_INT factor
= value
.coeffs
[0];
5542 if (value
.coeffs
[1] != factor
)
5545 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
5546 if ((factor
% nelts_per_vq
) != 0
5547 || !IN_RANGE (abs (factor
), nelts_per_vq
, 16 * nelts_per_vq
))
5551 *factor_out
= factor
;
5552 if (nelts_per_vq_out
)
5553 *nelts_per_vq_out
= nelts_per_vq
;
5557 /* Return true if X is a valid immediate for an SVE vector INC or DEC
5561 aarch64_sve_vector_inc_dec_immediate_p (rtx x
)
5563 return aarch64_sve_vector_inc_dec_immediate_p (x
, NULL
, NULL
);
5566 /* Return the asm template for an SVE vector INC or DEC instruction.
5567 OPERANDS gives the operands before the vector count and X is the
5568 value of the vector count operand itself. */
5571 aarch64_output_sve_vector_inc_dec (const char *operands
, rtx x
)
5574 unsigned int nelts_per_vq
;
5575 if (!aarch64_sve_vector_inc_dec_immediate_p (x
, &factor
, &nelts_per_vq
))
5578 return aarch64_output_sve_cnt_immediate ("dec", operands
, AARCH64_SV_ALL
,
5579 -factor
, nelts_per_vq
);
5581 return aarch64_output_sve_cnt_immediate ("inc", operands
, AARCH64_SV_ALL
,
5582 factor
, nelts_per_vq
);
5585 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5587 static const unsigned HOST_WIDE_INT bitmask_imm_mul
[] =
5589 0x0000000100000001ull
,
5590 0x0001000100010001ull
,
5591 0x0101010101010101ull
,
5592 0x1111111111111111ull
,
5593 0x5555555555555555ull
,
5598 /* Return true if 64-bit VAL is a valid bitmask immediate. */
5600 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val
)
5602 unsigned HOST_WIDE_INT tmp
, mask
, first_one
, next_one
;
5605 /* Check for a single sequence of one bits and return quickly if so.
5606 The special cases of all ones and all zeroes returns false. */
5607 tmp
= val
+ (val
& -val
);
5609 if (tmp
== (tmp
& -tmp
))
5610 return (val
+ 1) > 1;
5612 /* Invert if the immediate doesn't start with a zero bit - this means we
5613 only need to search for sequences of one bits. */
5617 /* Find the first set bit and set tmp to val with the first sequence of one
5618 bits removed. Return success if there is a single sequence of ones. */
5619 first_one
= val
& -val
;
5620 tmp
= val
& (val
+ first_one
);
5625 /* Find the next set bit and compute the difference in bit position. */
5626 next_one
= tmp
& -tmp
;
5627 bits
= clz_hwi (first_one
) - clz_hwi (next_one
);
5630 /* Check the bit position difference is a power of 2, and that the first
5631 sequence of one bits fits within 'bits' bits. */
5632 if ((mask
>> bits
) != 0 || bits
!= (bits
& -bits
))
5635 /* Check the sequence of one bits is repeated 64/bits times. */
5636 return val
== mask
* bitmask_imm_mul
[__builtin_clz (bits
) - 26];
5640 /* Return true if VAL is a valid bitmask immediate for MODE. */
5642 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val
, machine_mode mode
)
5645 return aarch64_bitmask_imm (val
);
5648 return aarch64_bitmask_imm ((val
& 0xffffffff) | (val
<< 32));
5650 /* Replicate small immediates to fit 64 bits. */
5651 int size
= GET_MODE_UNIT_PRECISION (mode
);
5652 val
&= (HOST_WIDE_INT_1U
<< size
) - 1;
5653 val
*= bitmask_imm_mul
[__builtin_clz (size
) - 26];
5655 return aarch64_bitmask_imm (val
);
5659 /* Return true if the immediate VAL can be a bitfield immediate
5660 by changing the given MASK bits in VAL to zeroes, ones or bits
5661 from the other half of VAL. Return the new immediate in VAL2. */
5663 aarch64_check_bitmask (unsigned HOST_WIDE_INT val
,
5664 unsigned HOST_WIDE_INT
&val2
,
5665 unsigned HOST_WIDE_INT mask
)
5668 if (val2
!= val
&& aarch64_bitmask_imm (val2
))
5671 if (val2
!= val
&& aarch64_bitmask_imm (val2
))
5674 val2
= val
| (((val
>> 32) | (val
<< 32)) & mask
);
5675 if (val2
!= val
&& aarch64_bitmask_imm (val2
))
5677 val2
= val
| (((val
>> 16) | (val
<< 48)) & mask
);
5678 if (val2
!= val
&& aarch64_bitmask_imm (val2
))
/* Return true if VAL is a valid MOVZ immediate.  */
static bool
aarch64_is_movz (unsigned HOST_WIDE_INT val)
{
  return (val >> (ctz_hwi (val) & 48)) < 65536;
}
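
/* For example (illustrative): 0x12340000 is a valid MOVZ immediate
   (0x1234 << 16), whereas 0x123400000 straddles two 16-bit chunks
   and is not.  */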
/* Return true if immediate VAL can be created by a 64-bit MOVI/MOVN/MOVZ.  */
bool
aarch64_is_mov_xn_imm (unsigned HOST_WIDE_INT val)
{
  return aarch64_is_movz (val) || aarch64_is_movz (~val)
         || aarch64_bitmask_imm (val);
}
5701 /* Return true if VAL is an immediate that can be created by a single
5704 aarch64_move_imm (unsigned HOST_WIDE_INT val
, machine_mode mode
)
5706 gcc_assert (mode
== SImode
|| mode
== DImode
);
5711 unsigned HOST_WIDE_INT mask
=
5712 (val
>> 32) == 0 || mode
== SImode
? 0xffffffff : HOST_WIDE_INT_M1U
;
5714 if (aarch64_is_movz (val
& mask
) || aarch64_is_movz (~val
& mask
))
5717 val
= (val
& mask
) | ((val
<< 32) & ~mask
);
5718 return aarch64_bitmask_imm (val
);
5723 aarch64_internal_mov_immediate (rtx dest
, rtx imm
, bool generate
,
5727 unsigned HOST_WIDE_INT val
, val2
, mask
;
5728 int one_match
, zero_match
;
5731 gcc_assert (mode
== SImode
|| mode
== DImode
);
5735 if (aarch64_move_imm (val
, mode
))
5738 emit_insn (gen_rtx_SET (dest
, imm
));
5742 if ((val
>> 32) == 0 || mode
== SImode
)
5746 emit_insn (gen_rtx_SET (dest
, GEN_INT (val
& 0xffff)));
5748 emit_insn (gen_insv_immsi (dest
, GEN_INT (16),
5749 GEN_INT ((val
>> 16) & 0xffff)));
5751 emit_insn (gen_insv_immdi (dest
, GEN_INT (16),
5752 GEN_INT ((val
>> 16) & 0xffff)));
5757 /* Remaining cases are all for DImode. */
5760 zero_match
= ((val
& mask
) == 0) + ((val
& (mask
<< 16)) == 0) +
5761 ((val
& (mask
<< 32)) == 0) + ((val
& (mask
<< 48)) == 0);
5762 one_match
= ((~val
& mask
) == 0) + ((~val
& (mask
<< 16)) == 0) +
5763 ((~val
& (mask
<< 32)) == 0) + ((~val
& (mask
<< 48)) == 0);
5765 /* Try a bitmask immediate and a movk to generate the immediate
5766 in 2 instructions. */
5768 if (zero_match
< 2 && one_match
< 2)
5770 for (i
= 0; i
< 64; i
+= 16)
5772 if (aarch64_check_bitmask (val
, val2
, mask
<< i
))
5775 val2
= val
& ~(mask
<< i
);
5776 if ((val2
>> 32) == 0 && aarch64_move_imm (val2
, DImode
))
5784 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
5785 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
5786 GEN_INT ((val
>> i
) & 0xffff)));
5792 /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions. */
5793 if (zero_match
+ one_match
== 0)
5795 for (i
= 0; i
< 48; i
+= 16)
5796 for (int j
= i
+ 16; j
< 64; j
+= 16)
5797 if (aarch64_check_bitmask (val
, val2
, (mask
<< i
) | (mask
<< j
)))
5801 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
5802 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
5803 GEN_INT ((val
>> i
) & 0xffff)));
5804 emit_insn (gen_insv_immdi (dest
, GEN_INT (j
),
5805 GEN_INT ((val
>> j
) & 0xffff)));
5811 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
5812 are emitted by the initial mov. If one_match > zero_match, skip set bits,
5813 otherwise skip zero bits. */
5817 val2
= one_match
> zero_match
? ~val
: val
;
5818 i
= (val2
& mask
) != 0 ? 0 : (val2
& (mask
<< 16)) != 0 ? 16 : 32;
5821 emit_insn (gen_rtx_SET (dest
, GEN_INT (one_match
> zero_match
5822 ? (val
| ~(mask
<< i
))
5823 : (val
& (mask
<< i
)))));
5824 for (i
+= 16; i
< 64; i
+= 16)
5826 if ((val2
& (mask
<< i
)) == 0)
5829 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
5830 GEN_INT ((val
>> i
) & 0xffff)));
5837 /* Return whether imm is a 128-bit immediate which is simple enough to
5840 aarch64_mov128_immediate (rtx imm
)
5842 if (CONST_INT_P (imm
))
5845 gcc_assert (CONST_WIDE_INT_NUNITS (imm
) == 2);
5847 rtx lo
= GEN_INT (CONST_WIDE_INT_ELT (imm
, 0));
5848 rtx hi
= GEN_INT (CONST_WIDE_INT_ELT (imm
, 1));
5850 return aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, DImode
)
5851 + aarch64_internal_mov_immediate (NULL_RTX
, hi
, false, DImode
) <= 4;
/* Return true if val can be encoded as a 12-bit unsigned immediate with
   a left shift of 0 or 12 bits.  */
bool
aarch64_uimm12_shift (unsigned HOST_WIDE_INT val)
{
  return val < 4096 || (val & 0xfff000) == val;
}
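
/* Illustrative examples: 0xabc and 0xabc000 are accepted (a 12-bit value
   shifted left by 0 or 12), while 0xabc0 is not.  */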
5863 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
5864 that can be created with a left shift of 0 or 12. */
5865 static HOST_WIDE_INT
5866 aarch64_clamp_to_uimm12_shift (unsigned HOST_WIDE_INT val
)
5868 /* Check to see if the value fits in 24 bits, as that is the maximum we can
5869 handle correctly. */
5870 gcc_assert (val
< 0x1000000);
5875 return val
& 0xfff000;
5881 X = (X & AND_VAL) | IOR_VAL;
5883 can be implemented using:
5885 MOVK X, #(IOR_VAL >> shift), LSL #shift
5887 Return the shift if so, otherwise return -1. */
5889 aarch64_movk_shift (const wide_int_ref
&and_val
,
5890 const wide_int_ref
&ior_val
)
5892 unsigned int precision
= and_val
.get_precision ();
5893 unsigned HOST_WIDE_INT mask
= 0xffff;
5894 for (unsigned int shift
= 0; shift
< precision
; shift
+= 16)
5896 if (and_val
== ~mask
&& (ior_val
& mask
) == ior_val
)
5903 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
5904 Assumed precondition: VAL_IN Is not zero. */
5906 unsigned HOST_WIDE_INT
5907 aarch64_and_split_imm1 (HOST_WIDE_INT val_in
)
5909 int lowest_bit_set
= ctz_hwi (val_in
);
5910 int highest_bit_set
= floor_log2 (val_in
);
5911 gcc_assert (val_in
!= 0);
5913 return ((HOST_WIDE_INT_UC (2) << highest_bit_set
) -
5914 (HOST_WIDE_INT_1U
<< lowest_bit_set
));
5917 /* Create constant where bits outside of lowest bit set to highest bit set
5920 unsigned HOST_WIDE_INT
5921 aarch64_and_split_imm2 (HOST_WIDE_INT val_in
)
5923 return val_in
| ~aarch64_and_split_imm1 (val_in
);
5926 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
5929 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in
, machine_mode mode
)
5931 scalar_int_mode int_mode
;
5932 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
5935 if (aarch64_bitmask_imm (val_in
, int_mode
))
5938 if (aarch64_move_imm (val_in
, int_mode
))
5941 unsigned HOST_WIDE_INT imm2
= aarch64_and_split_imm2 (val_in
);
5943 return aarch64_bitmask_imm (imm2
, int_mode
);
5946 /* Return the number of temporary registers that aarch64_add_offset_1
5947 would need to add OFFSET to a register. */
5950 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset
)
5952 return absu_hwi (offset
) < 0x1000000 ? 0 : 1;
5955 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
5956 a non-polynomial OFFSET. MODE is the mode of the addition.
5957 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
5958 be set and CFA adjustments added to the generated instructions.
5960 TEMP1, if nonnull, is a register of mode MODE that can be used as a
5961 temporary if register allocation is already complete. This temporary
5962 register may overlap DEST but must not overlap SRC. If TEMP1 is known
5963 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
5964 the immediate again.
5966 Since this function may be used to adjust the stack pointer, we must
5967 ensure that it cannot cause transient stack deallocation (for example
5968 by first incrementing SP and then decrementing when adjusting by a
5969 large immediate). */
5972 aarch64_add_offset_1 (scalar_int_mode mode
, rtx dest
,
5973 rtx src
, HOST_WIDE_INT offset
, rtx temp1
,
5974 bool frame_related_p
, bool emit_move_imm
)
5976 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
5977 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
5979 unsigned HOST_WIDE_INT moffset
= absu_hwi (offset
);
5984 if (!rtx_equal_p (dest
, src
))
5986 insn
= emit_insn (gen_rtx_SET (dest
, src
));
5987 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
5992 /* Single instruction adjustment. */
5993 if (aarch64_uimm12_shift (moffset
))
5995 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (offset
)));
5996 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
6000 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
6003 a) the offset cannot be loaded by a 16-bit move or
6004 b) there is no spare register into which we can move it. */
6005 if (moffset
< 0x1000000
6006 && ((!temp1
&& !can_create_pseudo_p ())
6007 || !aarch64_move_imm (moffset
, mode
)))
6009 HOST_WIDE_INT low_off
= moffset
& 0xfff;
6011 low_off
= offset
< 0 ? -low_off
: low_off
;
6012 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (low_off
)));
6013 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
6014 insn
= emit_insn (gen_add2_insn (dest
, GEN_INT (offset
- low_off
)));
6015 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
6019 /* Emit a move immediate if required and an addition/subtraction. */
6022 gcc_assert (temp1
!= NULL_RTX
|| can_create_pseudo_p ());
6023 temp1
= aarch64_force_temporary (mode
, temp1
,
6024 gen_int_mode (moffset
, mode
));
6026 insn
= emit_insn (offset
< 0
6027 ? gen_sub3_insn (dest
, src
, temp1
)
6028 : gen_add3_insn (dest
, src
, temp1
));
6029 if (frame_related_p
)
6031 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
6032 rtx adj
= plus_constant (mode
, src
, offset
);
6033 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, gen_rtx_SET (dest
, adj
));
6037 /* Return the number of temporary registers that aarch64_add_offset
6038 would need to move OFFSET into a register or add OFFSET to a register;
6039 ADD_P is true if we want the latter rather than the former. */
6042 aarch64_offset_temporaries (bool add_p
, poly_int64 offset
)
6044 /* This follows the same structure as aarch64_add_offset. */
6045 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
6048 unsigned int count
= 0;
6049 HOST_WIDE_INT factor
= offset
.coeffs
[1];
6050 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
6051 poly_int64
poly_offset (factor
, factor
);
6052 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
6053 /* Need one register for the ADDVL/ADDPL result. */
6055 else if (factor
!= 0)
6057 factor
= abs (factor
);
6058 if (factor
> 16 * (factor
& -factor
))
6059 /* Need one register for the CNT result and one for the multiplication
6060 factor. If necessary, the second temporary can be reused for the
6061 constant part of the offset. */
6063 /* Need one register for the CNT result (which might then
6067 return count
+ aarch64_add_offset_1_temporaries (constant
);
6070 /* If X can be represented as a poly_int64, return the number
6071 of temporaries that are required to add it to a register.
6072 Return -1 otherwise. */
6075 aarch64_add_offset_temporaries (rtx x
)
6078 if (!poly_int_rtx_p (x
, &offset
))
6080 return aarch64_offset_temporaries (true, offset
);
6083 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
6084 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
6085 be set and CFA adjustments added to the generated instructions.
6087 TEMP1, if nonnull, is a register of mode MODE that can be used as a
6088 temporary if register allocation is already complete. This temporary
6089 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
6090 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
6091 false to avoid emitting the immediate again.
6093 TEMP2, if nonnull, is a second temporary register that doesn't
6094 overlap either DEST or REG.
6096 Since this function may be used to adjust the stack pointer, we must
6097 ensure that it cannot cause transient stack deallocation (for example
6098 by first incrementing SP and then decrementing when adjusting by a
6099 large immediate). */
6102 aarch64_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
6103 poly_int64 offset
, rtx temp1
, rtx temp2
,
6104 bool frame_related_p
, bool emit_move_imm
= true)
6106 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
6107 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
6108 gcc_assert (temp1
== NULL_RTX
6110 || !reg_overlap_mentioned_p (temp1
, dest
));
6111 gcc_assert (temp2
== NULL_RTX
|| !reg_overlap_mentioned_p (dest
, temp2
));
6113 /* Try using ADDVL or ADDPL to add the whole value. */
6114 if (src
!= const0_rtx
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
6116 rtx offset_rtx
= gen_int_mode (offset
, mode
);
6117 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
6118 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
6122 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
6123 SVE vector register, over and above the minimum size of 128 bits.
6124 This is equivalent to half the value returned by CNTD with a
6125 vector shape of ALL. */
6126 HOST_WIDE_INT factor
= offset
.coeffs
[1];
6127 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
6129 /* Try using ADDVL or ADDPL to add the VG-based part. */
6130 poly_int64
poly_offset (factor
, factor
);
6131 if (src
!= const0_rtx
6132 && aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
6134 rtx offset_rtx
= gen_int_mode (poly_offset
, mode
);
6135 if (frame_related_p
)
6137 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
6138 RTX_FRAME_RELATED_P (insn
) = true;
6143 rtx addr
= gen_rtx_PLUS (mode
, src
, offset_rtx
);
6144 src
= aarch64_force_temporary (mode
, temp1
, addr
);
6149 /* Otherwise use a CNT-based sequence. */
6150 else if (factor
!= 0)
6152 /* Use a subtraction if we have a negative factor. */
6153 rtx_code code
= PLUS
;
6160 /* Calculate CNTD * FACTOR / 2. First try to fold the division
6161 into the multiplication. */
6165 /* Use a right shift by 1. */
6169 HOST_WIDE_INT low_bit
= factor
& -factor
;
6170 if (factor
<= 16 * low_bit
)
6172 if (factor
> 16 * 8)
6174 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
6175 the value with the minimum multiplier and shift it into
6177 int extra_shift
= exact_log2 (low_bit
);
6178 shift
+= extra_shift
;
6179 factor
>>= extra_shift
;
6181 val
= gen_int_mode (poly_int64 (factor
* 2, factor
* 2), mode
);
6185 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
6186 directly, since that should increase the chances of being
6187 able to use a shift and add sequence. If LOW_BIT itself
6188 is out of range, just use CNTD. */
6189 if (low_bit
<= 16 * 8)
6194 val
= gen_int_mode (poly_int64 (low_bit
* 2, low_bit
* 2), mode
);
6195 val
= aarch64_force_temporary (mode
, temp1
, val
);
6197 if (can_create_pseudo_p ())
6199 rtx coeff1
= gen_int_mode (factor
, mode
);
6200 val
= expand_mult (mode
, val
, coeff1
, NULL_RTX
, true, true);
6204 /* Go back to using a negative multiplication factor if we have
6205 no register from which to subtract. */
6206 if (code
== MINUS
&& src
== const0_rtx
)
6211 rtx coeff1
= gen_int_mode (factor
, mode
);
6212 coeff1
= aarch64_force_temporary (mode
, temp2
, coeff1
);
6213 val
= gen_rtx_MULT (mode
, val
, coeff1
);
6219 /* Multiply by 1 << SHIFT. */
6220 val
= aarch64_force_temporary (mode
, temp1
, val
);
6221 val
= gen_rtx_ASHIFT (mode
, val
, GEN_INT (shift
));
6223 else if (shift
== -1)
6226 val
= aarch64_force_temporary (mode
, temp1
, val
);
6227 val
= gen_rtx_ASHIFTRT (mode
, val
, const1_rtx
);
6230 /* Calculate SRC +/- CNTD * FACTOR / 2. */
6231 if (src
!= const0_rtx
)
6233 val
= aarch64_force_temporary (mode
, temp1
, val
);
6234 val
= gen_rtx_fmt_ee (code
, mode
, src
, val
);
6236 else if (code
== MINUS
)
6238 val
= aarch64_force_temporary (mode
, temp1
, val
);
6239 val
= gen_rtx_NEG (mode
, val
);
6242 if (constant
== 0 || frame_related_p
)
6244 rtx_insn
*insn
= emit_insn (gen_rtx_SET (dest
, val
));
6245 if (frame_related_p
)
6247 RTX_FRAME_RELATED_P (insn
) = true;
6248 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
6249 gen_rtx_SET (dest
, plus_constant (Pmode
, src
,
6258 src
= aarch64_force_temporary (mode
, temp1
, val
);
6263 emit_move_imm
= true;
6266 aarch64_add_offset_1 (mode
, dest
, src
, constant
, temp1
,
6267 frame_related_p
, emit_move_imm
);
}

/* Like aarch64_add_offset, but the offset is given as an rtx rather
   than a poly_int64.  */

void
aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
			  rtx offset_rtx, rtx temp1, rtx temp2)
{
  aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
		      temp1, temp2, false);
}
/* Add DELTA to the stack pointer, marking the instructions frame-related.
   TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
   if TEMP1 already contains abs (DELTA).  */

static inline void
aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
{
  aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
		      temp1, temp2, true, emit_move_imm);
}

/* Subtract DELTA from the stack pointer, marking the instructions
   frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
   if nonnull.  EMIT_MOVE_IMM is false if TEMP1 already contains
   abs (DELTA).  */

static inline void
aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
		bool emit_move_imm = true)
{
  aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
		      temp1, temp2, frame_related_p, emit_move_imm);
}
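
/* As a rough usage sketch (illustrative only, not the actual prologue
   code): a caller that wants to allocate FRAME_SIZE bytes of stack,
   where FRAME_SIZE may be a poly_int64 once SVE locals are involved,
   could do something like:

     aarch64_sub_sp (tmp_reg, NULL_RTX, frame_size, true);

   which emits a frame-related "sub sp, sp, ..." sequence and uses
   TMP_REG for any scratch value that the offset requires.  */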
/* Set DEST to (vec_series BASE STEP).  */

static void
aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
{
  machine_mode mode = GET_MODE (dest);
  scalar_mode inner = GET_MODE_INNER (mode);

  /* Each operand can be a register or an immediate in the range [-16, 15].  */
  if (!aarch64_sve_index_immediate_p (base))
    base = force_reg (inner, base);
  if (!aarch64_sve_index_immediate_p (step))
    step = force_reg (inner, step);

  emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
}
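
/* The vec_series typically becomes a single SVE INDEX instruction; for
   example a VNx4SI series with base 1 and step 2 is roughly:

     index	z0.s, #1, #2	// { 1, 3, 5, 7, ... }

   with out-of-range bases or steps first forced into scalar registers
   by the code above.  (Register choice is illustrative.)  */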
/* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
   register of mode MODE.  Use TARGET for the result if it's nonnull
   and convenient.

   The two vector modes must have the same element mode.  The behavior
   is to duplicate architectural lane N of SRC into architectural lanes
   N + I * STEP of the result.  On big-endian targets, architectural
   lane 0 of an Advanced SIMD vector is the last element of the vector
   in memory layout, so for big-endian targets this operation has the
   effect of reversing SRC before duplicating it.  Callers need to
   account for this.  */
rtx
aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
{
  machine_mode src_mode = GET_MODE (src);
  gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
  insn_code icode = (BYTES_BIG_ENDIAN
		     ? code_for_aarch64_vec_duplicate_vq_be (mode)
		     : code_for_aarch64_vec_duplicate_vq_le (mode));

  unsigned int i = 0;
  expand_operand ops[3];
  create_output_operand (&ops[i++], target, mode);
  create_output_operand (&ops[i++], src, src_mode);
  if (BYTES_BIG_ENDIAN)
    {
      /* Create a PARALLEL describing the reversal of SRC.  */
      unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
      rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
						  nelts_per_vq - 1, -1);
      create_fixed_operand (&ops[i++], sel);
    }
  expand_insn (icode, i, ops);
  return ops[0].value;
}
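
/* On little-endian targets the expansion is essentially a single

     dup	z0.q, z0.q[0]

   which broadcasts the low 128 bits across the whole SVE register;
   big-endian targets additionally apply the reversing permute built
   from the PARALLEL above.  (Illustrative only; the port's patterns
   choose the real operands.)  */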
/* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
   the memory image into DEST.  Return true on success.  */

static bool
aarch64_expand_sve_ld1rq (rtx dest, rtx src)
{
  src = force_const_mem (GET_MODE (src), src);
  if (!src)
    return false;

  /* Make sure that the address is legitimate.  */
  if (!aarch64_sve_ld1rq_operand_p (src))
    {
      rtx addr = force_reg (Pmode, XEXP (src, 0));
      src = replace_equiv_address (src, addr);
    }

  machine_mode mode = GET_MODE (dest);
  machine_mode pred_mode = aarch64_sve_pred_mode (mode);
  rtx ptrue = aarch64_ptrue_reg (pred_mode);
  emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
  return true;
}
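
/* A typical sequence for a repeating 128-bit constant is therefore
   roughly (illustrative only):

     adrp	x0, .LCn
     add	x0, x0, :lo12:.LCn
     ptrue	p0.b
     ld1rqb	{ z0.b }, p0/z, [x0]

   i.e. the quadword is loaded once from the constant pool and
   replicated across the whole vector.  */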
/* SRC is an SVE CONST_VECTOR that contains N "foreground" values followed
   by N "background" values.  Try to move it into TARGET using:

	PTRUE PRED.<T>, VL<N>
	MOV TRUE.<T>, #<foreground>
	MOV FALSE.<T>, #<background>
	SEL TARGET.<T>, PRED.<T>, TRUE.<T>, FALSE.<T>

   The PTRUE is always a single instruction but the MOVs might need a
   longer sequence.  If the background value is zero (as it often is),
   the sequence can sometimes collapse to a PTRUE followed by a
   zero-predicated move.

   Return the target on success, otherwise return null.  */
static rtx
aarch64_expand_sve_const_vector_sel (rtx target, rtx src)
{
  gcc_assert (CONST_VECTOR_NELTS_PER_PATTERN (src) == 2);

  /* Make sure that the PTRUE is valid.  */
  machine_mode mode = GET_MODE (src);
  machine_mode pred_mode = aarch64_sve_pred_mode (mode);
  unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
  if (aarch64_svpattern_for_vl (pred_mode, npatterns)
      == AARCH64_NUM_SVPATTERNS)
    return NULL_RTX;

  rtx_vector_builder pred_builder (pred_mode, npatterns, 2);
  rtx_vector_builder true_builder (mode, npatterns, 1);
  rtx_vector_builder false_builder (mode, npatterns, 1);
  for (unsigned int i = 0; i < npatterns; ++i)
    {
      true_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
      pred_builder.quick_push (CONST1_RTX (BImode));
    }
  for (unsigned int i = 0; i < npatterns; ++i)
    {
      false_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i + npatterns));
      pred_builder.quick_push (CONST0_RTX (BImode));
    }
  expand_operand ops[4];
  create_output_operand (&ops[0], target, mode);
  create_input_operand (&ops[1], true_builder.build (), mode);
  create_input_operand (&ops[2], false_builder.build (), mode);
  create_input_operand (&ops[3], pred_builder.build (), pred_mode);
  expand_insn (code_for_vcond_mask (mode, mode), 4, ops);
  return target;
}
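
/* For example, a VNx4SI constant of the form { 5, 0, 0, ... } (one
   foreground value followed by one background value) could become
   something like:

     ptrue	p0.s, vl1
     mov	z0.s, #5
     mov	z1.s, #0
     sel	z0.s, p0, z0.s, z1.s

   (illustrative only; a zero background can often collapse further,
   as noted above).  */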
/* Return a register containing CONST_VECTOR SRC, given that SRC has an
   SVE data mode and isn't a legitimate constant.  Use TARGET for the
   result if convenient.

   The returned register can have whatever mode seems most natural
   given the contents of SRC.  */
6440 aarch64_expand_sve_const_vector (rtx target
, rtx src
)
6442 machine_mode mode
= GET_MODE (src
);
6443 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (src
);
6444 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (src
);
6445 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
6446 unsigned int elt_bits
= GET_MODE_BITSIZE (elt_mode
);
6447 unsigned int container_bits
= aarch64_sve_container_bits (mode
);
6448 unsigned int encoded_bits
= npatterns
* nelts_per_pattern
* container_bits
;
6450 if (nelts_per_pattern
== 1
6451 && encoded_bits
<= 128
6452 && container_bits
!= elt_bits
)
6454 /* We have a partial vector mode and a constant whose full-vector
6455 equivalent would occupy a repeating 128-bit sequence. Build that
6456 full-vector equivalent instead, so that we have the option of
6457 using LD1RQ and Advanced SIMD operations. */
6458 unsigned int repeat
= container_bits
/ elt_bits
;
6459 machine_mode full_mode
= aarch64_full_sve_mode (elt_mode
).require ();
6460 rtx_vector_builder
builder (full_mode
, npatterns
* repeat
, 1);
6461 for (unsigned int i
= 0; i
< npatterns
; ++i
)
6462 for (unsigned int j
= 0; j
< repeat
; ++j
)
6463 builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, i
));
6464 target
= aarch64_target_reg (target
, full_mode
);
6465 return aarch64_expand_sve_const_vector (target
, builder
.build ());
6468 if (nelts_per_pattern
== 1 && encoded_bits
== 128)
6470 /* The constant is a duplicated quadword but can't be narrowed
6471 beyond a quadword. Get the memory image of the first quadword
6472 as a 128-bit vector and try using LD1RQ to load it from memory.
6474 The effect for both endiannesses is to load memory lane N into
6475 architectural lanes N + I * STEP of the result. On big-endian
6476 targets, the layout of the 128-bit vector in an Advanced SIMD
6477 register would be different from its layout in an SVE register,
6478 but this 128-bit vector is a memory value only. */
6479 machine_mode vq_mode
= aarch64_vq_mode (elt_mode
).require ();
6480 rtx vq_value
= simplify_gen_subreg (vq_mode
, src
, mode
, 0);
6481 if (vq_value
&& aarch64_expand_sve_ld1rq (target
, vq_value
))
6485 if (nelts_per_pattern
== 1 && encoded_bits
< 128)
6487 /* The vector is a repeating sequence of 64 bits or fewer.
6488 See if we can load them using an Advanced SIMD move and then
6489 duplicate it to fill a vector. This is better than using a GPR
6490 move because it keeps everything in the same register file. */
6491 machine_mode vq_mode
= aarch64_vq_mode (elt_mode
).require ();
6492 rtx_vector_builder
builder (vq_mode
, npatterns
, 1);
6493 for (unsigned int i
= 0; i
< npatterns
; ++i
)
6495 /* We want memory lane N to go into architectural lane N,
6496 so reverse for big-endian targets. The DUP .Q pattern
6497 has a compensating reverse built-in. */
6498 unsigned int srci
= BYTES_BIG_ENDIAN
? npatterns
- i
- 1 : i
;
6499 builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, srci
));
6501 rtx vq_src
= builder
.build ();
6502 if (aarch64_simd_valid_immediate (vq_src
, NULL
))
6504 vq_src
= force_reg (vq_mode
, vq_src
);
6505 return aarch64_expand_sve_dupq (target
, mode
, vq_src
);
6508 /* Get an integer representation of the repeating part of Advanced
6509 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
6510 which for big-endian targets is lane-swapped wrt a normal
6511 Advanced SIMD vector. This means that for both endiannesses,
6512 memory lane N of SVE vector SRC corresponds to architectural
6513 lane N of a register holding VQ_SRC. This in turn means that
6514 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
6515 as a single 128-bit value) and thus that memory lane 0 of SRC is
6516 in the lsb of the integer. Duplicating the integer therefore
6517 ensures that memory lane N of SRC goes into architectural lane
6518 N + I * INDEX of the SVE register. */
6519 scalar_mode int_mode
= int_mode_for_size (encoded_bits
, 0).require ();
6520 rtx elt_value
= simplify_gen_subreg (int_mode
, vq_src
, vq_mode
, 0);
6523 /* Pretend that we had a vector of INT_MODE to start with. */
6524 elt_mode
= int_mode
;
6525 mode
= aarch64_full_sve_mode (int_mode
).require ();
6527 /* If the integer can be moved into a general register by a
6528 single instruction, do that and duplicate the result. */
6529 if (CONST_INT_P (elt_value
)
6530 && aarch64_move_imm (INTVAL (elt_value
),
6531 encoded_bits
<= 32 ? SImode
: DImode
))
6533 elt_value
= force_reg (elt_mode
, elt_value
);
6534 return expand_vector_broadcast (mode
, elt_value
);
6537 else if (npatterns
== 1)
6538 /* We're duplicating a single value, but can't do better than
6539 force it to memory and load from there. This handles things
6540 like symbolic constants. */
6541 elt_value
= CONST_VECTOR_ENCODED_ELT (src
, 0);
6545 /* Load the element from memory if we can, otherwise move it into
6546 a register and use a DUP. */
6547 rtx op
= force_const_mem (elt_mode
, elt_value
);
6549 op
= force_reg (elt_mode
, elt_value
);
6550 return expand_vector_broadcast (mode
, op
);
6554 /* Try using INDEX. */
6556 if (const_vec_series_p (src
, &base
, &step
))
6558 aarch64_expand_vec_series (target
, base
, step
);
6562 /* From here on, it's better to force the whole constant to memory
6564 if (GET_MODE_NUNITS (mode
).is_constant ())
6567 if (nelts_per_pattern
== 2)
6568 if (rtx res
= aarch64_expand_sve_const_vector_sel (target
, src
))
6571 /* Expand each pattern individually. */
6572 gcc_assert (npatterns
> 1);
6573 rtx_vector_builder builder
;
6574 auto_vec
<rtx
, 16> vectors (npatterns
);
6575 for (unsigned int i
= 0; i
< npatterns
; ++i
)
6577 builder
.new_vector (mode
, 1, nelts_per_pattern
);
6578 for (unsigned int j
= 0; j
< nelts_per_pattern
; ++j
)
6579 builder
.quick_push (CONST_VECTOR_ELT (src
, i
+ j
* npatterns
));
6580 vectors
.quick_push (force_reg (mode
, builder
.build ()));
6583 /* Use permutes to interleave the separate vectors. */
6584 while (npatterns
> 1)
6587 for (unsigned int i
= 0; i
< npatterns
; ++i
)
6589 rtx tmp
= (npatterns
== 1 ? target
: gen_reg_rtx (mode
));
6590 rtvec v
= gen_rtvec (2, vectors
[i
], vectors
[i
+ npatterns
]);
6591 emit_set_insn (tmp
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
6595 gcc_assert (vectors
[0] == target
);
/* Use WHILE to set a predicate register of mode MODE in which the first
   VL bits are set and the rest are clear.  Use TARGET for the register
   if it's nonnull and convenient.  */

static rtx
aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
				 unsigned int vl)
{
  rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
  target = aarch64_target_reg (target, mode);
  emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
			target, const0_rtx, limit));
  return target;
}
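
/* For example, with MODE == VNx4BI and VL == 3 the expansion is
   roughly (illustrative only):

     mov	x0, #3
     whilelo	p0.s, xzr, x0

   which sets the first three .S lanes of the predicate and clears
   the rest.  */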
static rtx
aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
/* BUILDER is a constant predicate in which the index of every set bit
   is a multiple of ELT_SIZE (which is <= 8).  Try to load the constant
   by inverting every element at a multiple of ELT_SIZE and EORing the
   result with an ELT_SIZE PTRUE.

   Return a register that contains the constant on success, otherwise
   return null.  Use TARGET as the register if it is nonnull and
   convenient.  */

static rtx
aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
				   unsigned int elt_size)
{
  /* Invert every element at a multiple of ELT_SIZE, keeping the
     other bits zero.  */
  rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
				  builder.nelts_per_pattern ());
  for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
    if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
      inv_builder.quick_push (const1_rtx);
    else
      inv_builder.quick_push (const0_rtx);
  inv_builder.finalize ();

  /* See if we can load the constant cheaply.  */
  rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
  if (!inv)
    return NULL_RTX;

  /* EOR the result with an ELT_SIZE PTRUE.  */
  rtx mask = aarch64_ptrue_all (elt_size);
  mask = force_reg (VNx16BImode, mask);
  inv = gen_lowpart (VNx16BImode, inv);
  target = aarch64_target_reg (target, VNx16BImode);
  emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
  return target;
}
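
/* For example (illustrative register choices): the VNx16BI constant
   { 0, 1, 0, 1, ... }, i.e. every odd byte lane set, has the inverse
   { 1, 0, 1, 0, ... }, which is just a PTRUE .H ALL viewed as bytes,
   so the constant can in principle be built as:

     ptrue	p0.b
     ptrue	p1.h
     eor	p2.b, p0/z, p1.b, p0.b  */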
/* BUILDER is a constant predicate in which the index of every set bit
   is a multiple of ELT_SIZE (which is <= 8).  Try to load the constant
   using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE.  Return the
   register on success, otherwise return null.  Use TARGET as the register
   if nonnull and convenient.  */
6662 aarch64_expand_sve_const_pred_trn (rtx target
, rtx_vector_builder
&builder
,
6663 unsigned int elt_size
,
6664 unsigned int permute_size
)
6666 /* We're going to split the constant into two new constants A and B,
6667 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
6668 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
6670 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
6671 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
6673 where _ indicates elements that will be discarded by the permute.
6675 First calculate the ELT_SIZEs for A and B. */
6676 unsigned int a_elt_size
= GET_MODE_SIZE (DImode
);
6677 unsigned int b_elt_size
= GET_MODE_SIZE (DImode
);
6678 for (unsigned int i
= 0; i
< builder
.encoded_nelts (); i
+= elt_size
)
6679 if (INTVAL (builder
.elt (i
)) != 0)
6681 if (i
& permute_size
)
6682 b_elt_size
|= i
- permute_size
;
6686 a_elt_size
&= -a_elt_size
;
6687 b_elt_size
&= -b_elt_size
;
6689 /* Now construct the vectors themselves. */
6690 rtx_vector_builder
a_builder (VNx16BImode
, builder
.npatterns (),
6691 builder
.nelts_per_pattern ());
6692 rtx_vector_builder
b_builder (VNx16BImode
, builder
.npatterns (),
6693 builder
.nelts_per_pattern ());
6694 unsigned int nelts
= builder
.encoded_nelts ();
6695 for (unsigned int i
= 0; i
< nelts
; ++i
)
6696 if (i
& (elt_size
- 1))
6698 a_builder
.quick_push (const0_rtx
);
6699 b_builder
.quick_push (const0_rtx
);
6701 else if ((i
& permute_size
) == 0)
6703 /* The A and B elements are significant. */
6704 a_builder
.quick_push (builder
.elt (i
));
6705 b_builder
.quick_push (builder
.elt (i
+ permute_size
));
6709 /* The A and B elements are going to be discarded, so pick whatever
6710 is likely to give a nice constant. We are targeting element
6711 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
6712 with the aim of each being a sequence of ones followed by
6713 a sequence of zeros. So:
6715 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
6716 duplicate the last X_ELT_SIZE element, to extend the
6717 current sequence of ones or zeros.
6719 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
6720 zero, so that the constant really does have X_ELT_SIZE and
6721 not a smaller size. */
6722 if (a_elt_size
> permute_size
)
6723 a_builder
.quick_push (const0_rtx
);
6725 a_builder
.quick_push (a_builder
.elt (i
- a_elt_size
));
6726 if (b_elt_size
> permute_size
)
6727 b_builder
.quick_push (const0_rtx
);
6729 b_builder
.quick_push (b_builder
.elt (i
- b_elt_size
));
6731 a_builder
.finalize ();
6732 b_builder
.finalize ();
6734 /* Try loading A into a register. */
6735 rtx_insn
*last
= get_last_insn ();
6736 rtx a
= aarch64_expand_sve_const_pred_1 (NULL_RTX
, a_builder
, false);
6740 /* Try loading B into a register. */
6742 if (a_builder
!= b_builder
)
6744 b
= aarch64_expand_sve_const_pred_1 (NULL_RTX
, b_builder
, false);
6747 delete_insns_since (last
);
6752 /* Emit the TRN1 itself. We emit a TRN that operates on VNx16BI
6753 operands but permutes them as though they had mode MODE. */
6754 machine_mode mode
= aarch64_sve_pred_mode (permute_size
).require ();
6755 target
= aarch64_target_reg (target
, GET_MODE (a
));
6756 rtx type_reg
= CONST0_RTX (mode
);
6757 emit_insn (gen_aarch64_sve_trn1_conv (mode
, target
, a
, b
, type_reg
));
/* Subroutine of aarch64_expand_sve_const_pred.  Try to load the VNx16BI
   constant in BUILDER into an SVE predicate register.  Return the register
   on success, otherwise return null.  Use TARGET for the register if
   nonnull and convenient.

   ALLOW_RECURSE_P is true if we can use methods that would call this
   function recursively.  */

static rtx
aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
				 bool allow_recurse_p)
{
  if (builder.encoded_nelts () == 1)
    /* A PFALSE or a PTRUE .B ALL.  */
    return aarch64_emit_set_immediate (target, builder);

  unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
  if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
    {
      /* If we can load the constant using PTRUE, use it as-is.  */
      machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
      if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
	return aarch64_emit_set_immediate (target, builder);

      /* Otherwise use WHILE to set the first VL bits.  */
      return aarch64_sve_move_pred_via_while (target, mode, vl);
    }

  if (!allow_recurse_p)
    return NULL_RTX;

  /* Try inverting the vector in element size ELT_SIZE and then EORing
     the result with an ELT_SIZE PTRUE.  */
  if (INTVAL (builder.elt (0)) == 0)
    if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
						     elt_size))
      return res;

  /* Try using TRN1 to permute two simpler constants.  */
  for (unsigned int i = elt_size; i <= 8; i *= 2)
    if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
						     elt_size, i))
      return res;

  return NULL_RTX;
}
/* Return an SVE predicate register that contains the VNx16BImode
   constant in BUILDER, without going through the move expanders.

   The returned register can have whatever mode seems most natural
   given the contents of BUILDER.  Use TARGET for the result if
   convenient.  */

static rtx
aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
{
  /* Try loading the constant using pure predicate operations.  */
  if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
    return res;

  /* Try forcing the constant to memory.  */
  if (builder.full_nelts ().is_constant ())
    if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
      {
	target = aarch64_target_reg (target, VNx16BImode);
	emit_move_insn (target, mem);
	return target;
      }

  /* The last resort is to load the constant as an integer and then
     compare it against zero.  Use -1 for set bits in order to increase
     the chances of using SVE DUPM or an Advanced SIMD byte mask.  */
  rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
				  builder.nelts_per_pattern ());
  for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
    int_builder.quick_push (INTVAL (builder.elt (i))
			    ? constm1_rtx : const0_rtx);
  return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
					   int_builder.build ());
}
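
/* In the last-resort case the predicate is therefore materialized as a
   byte vector of 0 / -1 values and converted back with a compare
   against zero, roughly (illustrative only):

     <load 0/-1 byte mask into z0>
     ptrue	p1.b
     cmpne	p0.b, p1/z, z0.b, #0

   see aarch64_convert_sve_data_to_pred for the actual expansion.  */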
6843 /* Set DEST to immediate IMM. */
6846 aarch64_expand_mov_immediate (rtx dest
, rtx imm
)
6848 machine_mode mode
= GET_MODE (dest
);
6850 /* Check on what type of symbol it is. */
6851 scalar_int_mode int_mode
;
6852 if ((SYMBOL_REF_P (imm
)
6853 || LABEL_REF_P (imm
)
6854 || GET_CODE (imm
) == CONST
6855 || GET_CODE (imm
) == CONST_POLY_INT
)
6856 && is_a
<scalar_int_mode
> (mode
, &int_mode
))
6860 HOST_WIDE_INT const_offset
;
6861 enum aarch64_symbol_type sty
;
6863 /* If we have (const (plus symbol offset)), separate out the offset
6864 before we start classifying the symbol. */
6865 rtx base
= strip_offset (imm
, &offset
);
6867 /* We must always add an offset involving VL separately, rather than
6868 folding it into the relocation. */
6869 if (!offset
.is_constant (&const_offset
))
6873 aarch64_report_sve_required ();
6876 if (base
== const0_rtx
&& aarch64_sve_cnt_immediate_p (offset
))
6877 emit_insn (gen_rtx_SET (dest
, imm
));
6880 /* Do arithmetic on 32-bit values if the result is smaller
6882 if (partial_subreg_p (int_mode
, SImode
))
6884 /* It is invalid to do symbol calculations in modes
6885 narrower than SImode. */
6886 gcc_assert (base
== const0_rtx
);
6887 dest
= gen_lowpart (SImode
, dest
);
6890 if (base
!= const0_rtx
)
6892 base
= aarch64_force_temporary (int_mode
, dest
, base
);
6893 aarch64_add_offset (int_mode
, dest
, base
, offset
,
6894 NULL_RTX
, NULL_RTX
, false);
6897 aarch64_add_offset (int_mode
, dest
, base
, offset
,
6898 dest
, NULL_RTX
, false);
6903 sty
= aarch64_classify_symbol (base
, const_offset
);
6906 case SYMBOL_FORCE_TO_MEM
:
6907 if (int_mode
!= ptr_mode
)
6908 imm
= convert_memory_address (ptr_mode
, imm
);
6910 if (const_offset
!= 0
6911 && targetm
.cannot_force_const_mem (ptr_mode
, imm
))
6913 gcc_assert (can_create_pseudo_p ());
6914 base
= aarch64_force_temporary (int_mode
, dest
, base
);
6915 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
6916 NULL_RTX
, NULL_RTX
, false);
6920 mem
= force_const_mem (ptr_mode
, imm
);
6923 /* If we aren't generating PC relative literals, then
6924 we need to expand the literal pool access carefully.
6925 This is something that needs to be done in a number
6926 of places, so could well live as a separate function. */
6927 if (!aarch64_pcrelative_literal_loads
)
6929 gcc_assert (can_create_pseudo_p ());
6930 base
= gen_reg_rtx (ptr_mode
);
6931 aarch64_expand_mov_immediate (base
, XEXP (mem
, 0));
6932 if (ptr_mode
!= Pmode
)
6933 base
= convert_memory_address (Pmode
, base
);
6934 mem
= gen_rtx_MEM (ptr_mode
, base
);
6937 if (int_mode
!= ptr_mode
)
6938 mem
= gen_rtx_ZERO_EXTEND (int_mode
, mem
);
6940 emit_insn (gen_rtx_SET (dest
, mem
));
6944 case SYMBOL_SMALL_TLSGD
:
6945 case SYMBOL_SMALL_TLSDESC
:
6946 case SYMBOL_SMALL_TLSIE
:
6947 case SYMBOL_SMALL_GOT_28K
:
6948 case SYMBOL_SMALL_GOT_4G
:
6949 case SYMBOL_TINY_GOT
:
6950 case SYMBOL_TINY_TLSIE
:
6951 if (const_offset
!= 0)
6953 gcc_assert(can_create_pseudo_p ());
6954 base
= aarch64_force_temporary (int_mode
, dest
, base
);
6955 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
6956 NULL_RTX
, NULL_RTX
, false);
6961 case SYMBOL_SMALL_ABSOLUTE
:
6962 case SYMBOL_TINY_ABSOLUTE
:
6963 case SYMBOL_TLSLE12
:
6964 case SYMBOL_TLSLE24
:
6965 case SYMBOL_TLSLE32
:
6966 case SYMBOL_TLSLE48
:
6967 aarch64_load_symref_appropriately (dest
, imm
, sty
);
6975 if (!CONST_INT_P (imm
))
6977 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
)
6979 /* Only the low bit of each .H, .S and .D element is defined,
6980 so we can set the upper bits to whatever we like. If the
6981 predicate is all-true in MODE, prefer to set all the undefined
6982 bits as well, so that we can share a single .B predicate for
6984 if (imm
== CONSTM1_RTX (mode
))
6985 imm
= CONSTM1_RTX (VNx16BImode
);
6987 /* All methods for constructing predicate modes wider than VNx16BI
6988 will set the upper bits of each element to zero. Expose this
6989 by moving such constants as a VNx16BI, so that all bits are
6990 significant and so that constants for different modes can be
6991 shared. The wider constant will still be available as a
6993 rtx_vector_builder builder
;
6994 if (aarch64_get_sve_pred_bits (builder
, imm
))
6996 rtx res
= aarch64_expand_sve_const_pred (dest
, builder
);
6998 emit_move_insn (dest
, gen_lowpart (mode
, res
));
7003 if (GET_CODE (imm
) == HIGH
7004 || aarch64_simd_valid_immediate (imm
, NULL
))
7006 emit_insn (gen_rtx_SET (dest
, imm
));
7010 if (CONST_VECTOR_P (imm
) && aarch64_sve_data_mode_p (mode
))
7011 if (rtx res
= aarch64_expand_sve_const_vector (dest
, imm
))
7014 emit_insn (gen_aarch64_sve_reinterpret (mode
, dest
, res
));
7018 rtx mem
= force_const_mem (mode
, imm
);
7020 emit_move_insn (dest
, mem
);
7024 aarch64_internal_mov_immediate (dest
, imm
, true, mode
);
7027 /* Return the MEM rtx that provides the canary value that should be used
7028 for stack-smashing protection. MODE is the mode of the memory.
7029 For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable
7030 (__stack_chk_guard), otherwise it has no useful value. SALT_TYPE
7031 indicates whether the caller is performing a SET or a TEST operation. */
7034 aarch64_stack_protect_canary_mem (machine_mode mode
, rtx decl_rtl
,
7035 aarch64_salt_type salt_type
)
7038 if (aarch64_stack_protector_guard
== SSP_GLOBAL
)
7040 gcc_assert (MEM_P (decl_rtl
));
7041 addr
= XEXP (decl_rtl
, 0);
7043 rtx base
= strip_offset_and_salt (addr
, &offset
);
7044 if (!SYMBOL_REF_P (base
))
7047 rtvec v
= gen_rtvec (2, base
, GEN_INT (salt_type
));
7048 addr
= gen_rtx_UNSPEC (Pmode
, v
, UNSPEC_SALT_ADDR
);
7049 addr
= gen_rtx_CONST (Pmode
, addr
);
7050 addr
= plus_constant (Pmode
, addr
, offset
);
7054 /* Calculate the address from the system register. */
7055 rtx salt
= GEN_INT (salt_type
);
7056 addr
= gen_reg_rtx (mode
);
7058 emit_insn (gen_reg_stack_protect_address_di (addr
, salt
));
7061 emit_insn (gen_reg_stack_protect_address_si (addr
, salt
));
7062 addr
= convert_memory_address (Pmode
, addr
);
7064 addr
= plus_constant (Pmode
, addr
, aarch64_stack_protector_guard_offset
);
7066 return gen_rtx_MEM (mode
, force_reg (Pmode
, addr
));
/* Emit an SVE predicated move from SRC to DEST.  PRED is a predicate
   that is known to contain PTRUE.  */

void
aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
{
  expand_operand ops[3];
  machine_mode mode = GET_MODE (dest);
  create_output_operand (&ops[0], dest, mode);
  create_input_operand (&ops[1], pred, GET_MODE (pred));
  create_input_operand (&ops[2], src, mode);
  temporary_volatile_ok v (true);
  expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
}
/* Expand a pre-RA SVE data move from SRC to DEST in which at least one
   operand is in memory.  In this case we need to use the predicated LD1
   and ST1 instead of LDR and STR, both for correctness on big-endian
   targets and because LD1 and ST1 support a wider range of addressing modes.
   PRED_MODE is the mode of the predicate.

   See the comment at the head of aarch64-sve.md for details about the
   big-endian handling.  */

void
aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
{
  machine_mode mode = GET_MODE (dest);
  rtx ptrue = aarch64_ptrue_reg (pred_mode);
  if (!register_operand (src, mode)
      && !register_operand (dest, mode))
    {
      rtx tmp = gen_reg_rtx (mode);
      if (MEM_P (src))
	aarch64_emit_sve_pred_move (tmp, ptrue, src);
      else
	emit_move_insn (tmp, src);
      src = tmp;
    }
  aarch64_emit_sve_pred_move (dest, ptrue, src);
}
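
/* For example, a memory-to-memory VNx4SI copy becomes two predicated
   moves through a temporary, roughly (illustrative only):

     ptrue	p0.s
     ld1w	{ z0.s }, p0/z, [x0]
     st1w	{ z0.s }, p0, [x1]

   see the comment referenced above for why the predicated forms are
   used here instead of LDR/STR.  */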
/* Called only on big-endian targets.  See whether an SVE vector move
   from SRC to DEST is effectively a REV[BHW] instruction, because at
   least one operand is a subreg of an SVE vector that has wider or
   narrower elements.  Return true and emit the instruction if so.

   For example:

     (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))

   represents a VIEW_CONVERT between the following vectors, viewed
   in memory order:

     R2: { [0].high, [0].low,  [1].high, [1].low, ... }
     R1: { [0],      [1],      [2],      [3],     ... }

   The high part of lane X in R2 should therefore correspond to lane X*2
   of R1, but the register representations are:

	 msb					  lsb
     R2: ...... [1].high  [1].low   [0].high  [0].low
     R1: ...... [3]	  [2]	    [1]	      [0]

   where the low part of lane X in R2 corresponds to lane X*2 in R1.
   We therefore need a reverse operation to swap the high and low values
   around.

   This is purely an optimization.  Without it we would spill the
   subreg operand to the stack in one mode and reload it in the
   other mode, which has the same effect as the REV.  */
7142 aarch64_maybe_expand_sve_subreg_move (rtx dest
, rtx src
)
7144 gcc_assert (BYTES_BIG_ENDIAN
);
7146 /* Do not try to optimize subregs that LRA has created for matched
7147 reloads. These subregs only exist as a temporary measure to make
7148 the RTL well-formed, but they are exempt from the usual
7149 TARGET_CAN_CHANGE_MODE_CLASS rules.
7151 For example, if we have:
7153 (set (reg:VNx8HI R1) (foo:VNx8HI (reg:VNx4SI R2)))
7155 and the constraints require R1 and R2 to be in the same register,
7156 LRA may need to create RTL such as:
7158 (set (subreg:VNx4SI (reg:VNx8HI TMP) 0) (reg:VNx4SI R2))
7159 (set (reg:VNx8HI TMP) (foo:VNx8HI (subreg:VNx4SI (reg:VNx8HI TMP) 0)))
7160 (set (reg:VNx8HI R1) (reg:VNx8HI TMP))
7162 which forces both the input and output of the original instruction
7163 to use the same hard register. But for this to work, the normal
7164 rules have to be suppressed on the subreg input, otherwise LRA
7165 would need to reload that input too, meaning that the process
7166 would never terminate. To compensate for this, the normal rules
7167 are also suppressed for the subreg output of the first move.
7168 Ignoring the special case and handling the first move normally
7169 would therefore generate wrong code: we would reverse the elements
7170 for the first subreg but not reverse them back for the second subreg. */
7171 if (SUBREG_P (dest
) && !LRA_SUBREG_P (dest
))
7172 dest
= SUBREG_REG (dest
);
7173 if (SUBREG_P (src
) && !LRA_SUBREG_P (src
))
7174 src
= SUBREG_REG (src
);
7176 /* The optimization handles two single SVE REGs with different element
7180 || aarch64_classify_vector_mode (GET_MODE (dest
)) != VEC_SVE_DATA
7181 || aarch64_classify_vector_mode (GET_MODE (src
)) != VEC_SVE_DATA
7182 || (GET_MODE_UNIT_SIZE (GET_MODE (dest
))
7183 == GET_MODE_UNIT_SIZE (GET_MODE (src
))))
7186 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
7187 rtx ptrue
= aarch64_ptrue_reg (VNx16BImode
);
7188 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (dest
), gen_rtvec (2, ptrue
, src
),
7190 emit_insn (gen_rtx_SET (dest
, unspec
));
/* Return a copy of X with mode MODE, without changing its other
   attributes.  Unlike gen_lowpart, this doesn't care whether the
   mode change is valid.  */

rtx
aarch64_replace_reg_mode (rtx x, machine_mode mode)
{
  if (GET_MODE (x) == mode)
    return x;

  x = shallow_copy_rtx (x);
  set_mode_and_regno (x, mode, REGNO (x));
  return x;
}
/* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
   stored in wider integer containers.  */

static unsigned int
aarch64_sve_rev_unspec (machine_mode mode)
{
  switch (GET_MODE_UNIT_SIZE (mode))
    {
    case 1: return UNSPEC_REVB;
    case 2: return UNSPEC_REVH;
    case 4: return UNSPEC_REVW;
    default: gcc_unreachable ();
    }
}
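
/* These unspecs map onto the SVE REVB/REVH/REVW instructions, which
   reverse 1-, 2- or 4-byte quantities within each wider container
   element, e.g. (illustrative):

     revb	z0.h, p0/m, z1.h	// swap the two bytes of each halfword  */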
/* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
   operands.  */

void
aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
{
  /* Decide which REV operation we need.  The mode with wider elements
     determines the mode of the operands and the mode with the narrower
     elements determines the reverse width.  */
  machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
  machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
  if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
      < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
    std::swap (mode_with_wider_elts, mode_with_narrower_elts);

  unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
  machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);

  /* Get the operands in the appropriate modes and emit the instruction.  */
  ptrue = gen_lowpart (pred_mode, ptrue);
  dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
  src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
  emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
			       dest, ptrue, src));
}
/* Implement TARGET_FUNCTION_OK_FOR_SIBCALL.  */

static bool
aarch64_function_ok_for_sibcall (tree, tree exp)
{
  if (crtl->abi->id () != expr_callee_abi (exp).id ())
    return false;

  return true;
}
/* Subroutine of aarch64_pass_by_reference for arguments that are not
   passed in SVE registers.  */

static bool
aarch64_pass_by_reference_1 (CUMULATIVE_ARGS *pcum,
			     const function_arg_info &arg)
{
  HOST_WIDE_INT size;
  machine_mode dummymode;
  int nregs;

  /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
  if (arg.mode == BLKmode && arg.type)
    size = int_size_in_bytes (arg.type);
  else
    /* No frontends can create types with variable-sized modes, so we
       shouldn't be asked to pass or return them.  */
    size = GET_MODE_SIZE (arg.mode).to_constant ();

  /* Aggregates are passed by reference based on their size.  */
  if (arg.aggregate_type_p ())
    size = int_size_in_bytes (arg.type);

  /* Variable sized arguments are always returned by reference.  */
  if (size < 0)
    return true;

  /* Can this be a candidate to be passed in fp/simd register(s)?  */
  if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
					       &dummymode, &nregs, NULL,
					       !pcum || pcum->silent_p))
    return false;

  /* Arguments which are variable sized or larger than 2 registers are
     passed by reference unless they are a homogenous floating point
     aggregate.  */
  return size > 2 * UNITS_PER_WORD;
}
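
/* For example, a homogeneous aggregate of four doubles (32 bytes) is
   still a candidate for the FP/SIMD registers and so is not passed by
   reference, whereas a plain 32-byte struct of integers exceeds the
   2-register limit and is.  (Illustrative of the checks above.)  */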
7298 /* Implement TARGET_PASS_BY_REFERENCE. */
7301 aarch64_pass_by_reference (cumulative_args_t pcum_v
,
7302 const function_arg_info
&arg
)
7304 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
7307 return aarch64_pass_by_reference_1 (pcum
, arg
);
7309 pure_scalable_type_info pst_info
;
7310 switch (pst_info
.analyze (arg
.type
))
7312 case pure_scalable_type_info::IS_PST
:
7313 if (pcum
&& !pcum
->silent_p
&& !TARGET_SVE
)
7314 /* We can't gracefully recover at this point, so make this a
7316 fatal_error (input_location
, "arguments of type %qT require"
7317 " the SVE ISA extension", arg
.type
);
7319 /* Variadic SVE types are passed by reference. Normal non-variadic
7320 arguments are too if we've run out of registers. */
7322 || pcum
->aapcs_nvrn
+ pst_info
.num_zr () > NUM_FP_ARG_REGS
7323 || pcum
->aapcs_nprn
+ pst_info
.num_pr () > NUM_PR_ARG_REGS
);
7325 case pure_scalable_type_info::DOESNT_MATTER
:
7326 gcc_assert (aarch64_pass_by_reference_1 (pcum
, arg
));
7329 case pure_scalable_type_info::NO_ABI_IDENTITY
:
7330 case pure_scalable_type_info::ISNT_PST
:
7331 return aarch64_pass_by_reference_1 (pcum
, arg
);
/* Return TRUE if VALTYPE is padded to its least significant bits.  */

static bool
aarch64_return_in_msb (const_tree valtype)
{
  machine_mode dummy_mode;
  int dummy_int;

  /* Never happens in little-endian mode.  */
  if (!BYTES_BIG_ENDIAN)
    return false;

  /* Only composite types smaller than or equal to 16 bytes can
     be potentially returned in registers.  */
  if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
      || int_size_in_bytes (valtype) <= 0
      || int_size_in_bytes (valtype) > 16)
    return false;

  /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
     or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
     is always passed/returned in the least significant bits of fp/simd
     register(s).  */
  if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
					       &dummy_mode, &dummy_int, NULL,
					       false))
    return false;

  /* Likewise pure scalable types for SVE vector and predicate registers.  */
  pure_scalable_type_info pst_info;
  if (pst_info.analyze_registers (valtype))
    return false;

  return true;
}
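
/* In other words, on big-endian targets a small composite such as a
   12-byte struct of three ints has its value placed at the most
   significant end of the return registers, whereas HFAs/HVAs and pure
   scalable types keep their usual layout; little-endian targets never
   take this path.  (Summary only; the checks above are authoritative.)  */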
7371 /* Implement TARGET_FUNCTION_VALUE.
7372 Define how to find the value returned by a function. */
7375 aarch64_function_value (const_tree type
, const_tree func
,
7376 bool outgoing ATTRIBUTE_UNUSED
)
7381 mode
= TYPE_MODE (type
);
7382 if (INTEGRAL_TYPE_P (type
))
7383 mode
= promote_function_mode (type
, mode
, &unsignedp
, func
, 1);
7385 pure_scalable_type_info pst_info
;
7386 if (type
&& pst_info
.analyze_registers (type
))
7387 return pst_info
.get_rtx (mode
, V0_REGNUM
, P0_REGNUM
);
7389 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
7390 are returned in memory, not by value. */
7391 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
7392 bool sve_p
= (vec_flags
& VEC_ANY_SVE
);
7394 if (aarch64_return_in_msb (type
))
7396 HOST_WIDE_INT size
= int_size_in_bytes (type
);
7398 if (size
% UNITS_PER_WORD
!= 0)
7400 size
+= UNITS_PER_WORD
- size
% UNITS_PER_WORD
;
7401 mode
= int_mode_for_size (size
* BITS_PER_UNIT
, 0).require ();
7406 machine_mode ag_mode
;
7407 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
, &ag_mode
, &count
,
7410 gcc_assert (!sve_p
);
7411 if (!aarch64_composite_type_p (type
, mode
))
7413 gcc_assert (count
== 1 && mode
== ag_mode
);
7414 return gen_rtx_REG (mode
, V0_REGNUM
);
7416 else if (aarch64_advsimd_full_struct_mode_p (mode
)
7417 && known_eq (GET_MODE_SIZE (ag_mode
), 16))
7418 return gen_rtx_REG (mode
, V0_REGNUM
);
7419 else if (aarch64_advsimd_partial_struct_mode_p (mode
)
7420 && known_eq (GET_MODE_SIZE (ag_mode
), 8))
7421 return gen_rtx_REG (mode
, V0_REGNUM
);
7427 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (count
));
7428 for (i
= 0; i
< count
; i
++)
7430 rtx tmp
= gen_rtx_REG (ag_mode
, V0_REGNUM
+ i
);
7431 rtx offset
= gen_int_mode (i
* GET_MODE_SIZE (ag_mode
), Pmode
);
7432 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
7433 XVECEXP (par
, 0, i
) = tmp
;
7442 /* Vector types can acquire a partial SVE mode using things like
7443 __attribute__((vector_size(N))), and this is potentially useful.
7444 However, the choice of mode doesn't affect the type's ABI
7445 identity, so we should treat the types as though they had
7446 the associated integer mode, just like they did before SVE
7449 We know that the vector must be 128 bits or smaller,
7450 otherwise we'd have returned it in memory instead. */
7452 && (aarch64_some_values_include_pst_objects_p (type
)
7453 || (vec_flags
& VEC_PARTIAL
)));
7455 scalar_int_mode int_mode
= int_mode_for_mode (mode
).require ();
7456 rtx reg
= gen_rtx_REG (int_mode
, R0_REGNUM
);
7457 rtx pair
= gen_rtx_EXPR_LIST (VOIDmode
, reg
, const0_rtx
);
7458 return gen_rtx_PARALLEL (mode
, gen_rtvec (1, pair
));
7460 return gen_rtx_REG (mode
, R0_REGNUM
);
/* Implements TARGET_FUNCTION_VALUE_REGNO_P.
   Return true if REGNO is the number of a hard register in which the values
   of called function may come back.  */

static bool
aarch64_function_value_regno_p (const unsigned int regno)
{
  /* Maximum of 16 bytes can be returned in the general registers.  Examples
     of 16-byte return values are: 128-bit integers and 16-byte small
     structures (excluding homogeneous floating-point aggregates).  */
  if (regno == R0_REGNUM || regno == R1_REGNUM)
    return true;

  /* Up to four fp/simd registers can return a function value, e.g. a
     homogeneous floating-point aggregate having four members.  */
  if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
    return TARGET_FLOAT;

  return false;
}
/* Subroutine for aarch64_return_in_memory for types that are not returned
   in SVE registers.  */

static bool
aarch64_return_in_memory_1 (const_tree type)
{
  HOST_WIDE_INT size;
  machine_mode ag_mode;
  int count;

  if (!AGGREGATE_TYPE_P (type)
      && TREE_CODE (type) != COMPLEX_TYPE
      && TREE_CODE (type) != VECTOR_TYPE)
    /* Simple scalar types always returned in registers.  */
    return false;

  if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
					       &ag_mode, &count, NULL, false))
    return false;

  /* Types larger than 2 registers returned in memory.  */
  size = int_size_in_bytes (type);
  return (size < 0 || size > 2 * UNITS_PER_WORD);
}
/* Implement TARGET_RETURN_IN_MEMORY.

   If the type T of the result of a function is such that
     void func (T arg)
   would require that arg be passed as a value in a register (or set of
   registers) according to the parameter passing rules, then the result
   is returned in the same registers as would be used for such an
   argument.  */

static bool
aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
{
  pure_scalable_type_info pst_info;
  switch (pst_info.analyze (type))
    {
    case pure_scalable_type_info::IS_PST:
      return (pst_info.num_zr () > NUM_FP_ARG_REGS
	      || pst_info.num_pr () > NUM_PR_ARG_REGS);

    case pure_scalable_type_info::DOESNT_MATTER:
      gcc_assert (aarch64_return_in_memory_1 (type));
      return true;

    case pure_scalable_type_info::NO_ABI_IDENTITY:
    case pure_scalable_type_info::ISNT_PST:
      return aarch64_return_in_memory_1 (type);
    }
  gcc_unreachable ();
}
static bool
aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
			       const_tree type, int *nregs)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  return aarch64_vfp_is_call_or_return_candidate (mode, type,
						  &pcum->aapcs_vfp_rmode,
						  nregs, NULL, pcum->silent_p);
}
/* Given MODE and TYPE of a function argument, return the alignment in
   bits.  The idea is to suppress any stronger alignment requested by
   the user and opt for the natural alignment (specified in AAPCS64 \S
   4.1).  ABI_BREAK is set to the old alignment if the alignment was
   incorrectly calculated in versions of GCC prior to GCC-9.
   ABI_BREAK_PACKED is set to the old alignment if it was incorrectly
   calculated in versions between GCC-9 and GCC-13.  This is a helper
   function for local use only.  */
7560 aarch64_function_arg_alignment (machine_mode mode
, const_tree type
,
7561 unsigned int *abi_break
,
7562 unsigned int *abi_break_packed
)
7565 *abi_break_packed
= 0;
7567 return GET_MODE_ALIGNMENT (mode
);
7569 if (integer_zerop (TYPE_SIZE (type
)))
7572 gcc_assert (TYPE_MODE (type
) == mode
);
7574 if (!AGGREGATE_TYPE_P (type
))
7575 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type
));
7577 if (TREE_CODE (type
) == ARRAY_TYPE
)
7578 return TYPE_ALIGN (TREE_TYPE (type
));
7580 unsigned int alignment
= 0;
7581 unsigned int bitfield_alignment_with_packed
= 0;
7582 unsigned int bitfield_alignment
= 0;
7583 for (tree field
= TYPE_FIELDS (type
); field
; field
= DECL_CHAIN (field
))
7584 if (TREE_CODE (field
) == FIELD_DECL
)
7586 /* Note that we explicitly consider zero-sized fields here,
7587 even though they don't map to AAPCS64 machine types.
7590 struct __attribute__((aligned(8))) empty {};
7593 [[no_unique_address]] empty e;
7597 "s" contains only one Fundamental Data Type (the int field)
7598 but gains 8-byte alignment and size thanks to "e". */
7599 alignment
= std::max (alignment
, DECL_ALIGN (field
));
7600 if (DECL_BIT_FIELD_TYPE (field
))
7602 /* Take the bit-field type's alignment into account only
7603 if the user didn't reduce this field's alignment with
7604 the packed attribute. */
7605 if (!DECL_PACKED (field
))
7607 = std::max (bitfield_alignment
,
7608 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field
)));
7610 /* Compute the alignment even if the bit-field is
7611 packed, so that we can emit a warning in case the
7612 alignment changed between GCC versions. */
7613 bitfield_alignment_with_packed
7614 = std::max (bitfield_alignment_with_packed
,
7615 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field
)));
7619 /* Emit a warning if the alignment is different when taking the
7620 'packed' attribute into account. */
7621 if (bitfield_alignment
!= bitfield_alignment_with_packed
7622 && bitfield_alignment_with_packed
> alignment
)
7623 *abi_break_packed
= bitfield_alignment_with_packed
;
7625 if (bitfield_alignment
> alignment
)
7627 *abi_break
= alignment
;
7628 return bitfield_alignment
;
7634 /* Layout a function argument according to the AAPCS64 rules. The rule
7635 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
7636 mode that was originally given to us by the target hook, whereas the
7637 mode in ARG might be the result of replacing partial SVE modes with
7638 the equivalent integer mode. */
7641 aarch64_layout_arg (cumulative_args_t pcum_v
, const function_arg_info
&arg
)
7643 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
7644 tree type
= arg
.type
;
7645 machine_mode mode
= arg
.mode
;
7646 int ncrn
, nvrn
, nregs
;
7647 bool allocate_ncrn
, allocate_nvrn
;
7649 unsigned int abi_break
;
7650 unsigned int abi_break_packed
;
7652 /* We need to do this once per argument. */
7653 if (pcum
->aapcs_arg_processed
)
7656 bool warn_pcs_change
7659 && (currently_expanding_function_start
7660 || currently_expanding_gimple_stmt
));
7662 /* There are several things to note here:
7664 - Both the C and AAPCS64 interpretations of a type's alignment should
7665 give a value that is no greater than the type's size.
7667 - Types bigger than 16 bytes are passed indirectly.
7669 - If an argument of type T is passed indirectly, TYPE and MODE describe
7670 a pointer to T rather than T iself.
7672 It follows that the AAPCS64 alignment of TYPE must be no greater
7675 Versions prior to GCC 9.1 ignored a bitfield's underlying type
7676 and so could calculate an alignment that was too small. If this
7677 happened for TYPE then ABI_BREAK is this older, too-small alignment.
7679 Although GCC 9.1 fixed that bug, it introduced a different one:
7680 it would consider the alignment of a bitfield's underlying type even
7681 if the field was packed (which should have the effect of overriding
7682 the alignment of the underlying type). This was fixed in GCC 13.1.
7684 As a result of this bug, GCC 9 to GCC 12 could calculate an alignment
7685 that was too big. If this happened for TYPE, ABI_BREAK_PACKED is
7686 this older, too-big alignment.
7688 Also, the fact that GCC 9 to GCC 12 considered irrelevant
7689 alignments meant they could calculate type alignments that were
7690 bigger than the type's size, contrary to the assumption above.
7691 The handling of register arguments was nevertheless (and justifiably)
7692 written to follow the assumption that the alignment can never be
7693 greater than the size. The same was not true for stack arguments;
7694 their alignment was instead handled by MIN bounds in
7695 aarch64_function_arg_boundary.
7697 The net effect is that, if GCC 9 to GCC 12 incorrectly calculated
7698 an alignment of more than 16 bytes for TYPE then:
7700 - If the argument was passed in registers, these GCC versions
7701 would treat the alignment as though it was *less than* 16 bytes.
7703 - If the argument was passed on the stack, these GCC versions
7704 would treat the alignment as though it was *equal to* 16 bytes.
7706 Both behaviors were wrong, but in different cases. */
7707 unsigned int alignment
7708 = aarch64_function_arg_alignment (mode
, type
, &abi_break
,
7710 gcc_assert (alignment
<= 16 * BITS_PER_UNIT
7711 && (!alignment
|| abi_break
< alignment
)
7712 && (!abi_break_packed
|| alignment
< abi_break_packed
));
7714 pcum
->aapcs_arg_processed
= true;
7716 pure_scalable_type_info pst_info
;
7717 if (type
&& pst_info
.analyze_registers (type
))
7719 /* aarch64_function_arg_alignment has never had an effect on
7722 /* The PCS says that it is invalid to pass an SVE value to an
7723 unprototyped function. There is no ABI-defined location we
7724 can return in this case, so we have no real choice but to raise
7725 an error immediately, even though this is only a query function. */
7726 if (arg
.named
&& pcum
->pcs_variant
!= ARM_PCS_SVE
)
7728 gcc_assert (!pcum
->silent_p
);
7729 error ("SVE type %qT cannot be passed to an unprototyped function",
7731 /* Avoid repeating the message, and avoid tripping the assert
7733 pcum
->pcs_variant
= ARM_PCS_SVE
;
7736 /* We would have converted the argument into pass-by-reference
7737 form if it didn't fit in registers. */
7738 pcum
->aapcs_nextnvrn
= pcum
->aapcs_nvrn
+ pst_info
.num_zr ();
7739 pcum
->aapcs_nextnprn
= pcum
->aapcs_nprn
+ pst_info
.num_pr ();
7740 gcc_assert (arg
.named
7741 && pcum
->pcs_variant
== ARM_PCS_SVE
7742 && pcum
->aapcs_nextnvrn
<= NUM_FP_ARG_REGS
7743 && pcum
->aapcs_nextnprn
<= NUM_PR_ARG_REGS
);
7744 pcum
->aapcs_reg
= pst_info
.get_rtx (mode
, V0_REGNUM
+ pcum
->aapcs_nvrn
,
7745 P0_REGNUM
+ pcum
->aapcs_nprn
);
7749 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
7750 are passed by reference, not by value. */
7751 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
7752 bool sve_p
= (vec_flags
& VEC_ANY_SVE
);
7754 /* Vector types can acquire a partial SVE mode using things like
7755 __attribute__((vector_size(N))), and this is potentially useful.
7756 However, the choice of mode doesn't affect the type's ABI
7757 identity, so we should treat the types as though they had
7758 the associated integer mode, just like they did before SVE
7761 We know that the vector must be 128 bits or smaller,
7762 otherwise we'd have passed it in memory instead. */
7764 && (aarch64_some_values_include_pst_objects_p (type
)
7765 || (vec_flags
& VEC_PARTIAL
)));
7767 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
7769 size
= int_size_in_bytes (type
);
7771 /* No frontends can create types with variable-sized modes, so we
7772 shouldn't be asked to pass or return them. */
7773 size
= GET_MODE_SIZE (mode
).to_constant ();
7774 size
= ROUND_UP (size
, UNITS_PER_WORD
);
7776 allocate_ncrn
= (type
) ? !(FLOAT_TYPE_P (type
)) : !FLOAT_MODE_P (mode
);
7777 allocate_nvrn
= aarch64_vfp_is_call_candidate (pcum_v
,
7781 gcc_assert (!sve_p
|| !allocate_nvrn
);
7783 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
7784 The following code thus handles passing by SIMD/FP registers first. */
7786 nvrn
= pcum
->aapcs_nvrn
;
7788 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
7789 and homogenous short-vector aggregates (HVA). */
7792 /* aarch64_function_arg_alignment has never had an effect on
7794 if (!pcum
->silent_p
&& !TARGET_FLOAT
)
7795 aarch64_err_no_fpadvsimd (mode
);
7797 if (nvrn
+ nregs
<= NUM_FP_ARG_REGS
)
7799 pcum
->aapcs_nextnvrn
= nvrn
+ nregs
;
7800 if (!aarch64_composite_type_p (type
, mode
))
7802 gcc_assert (nregs
== 1);
7803 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
7805 else if (aarch64_advsimd_full_struct_mode_p (mode
)
7806 && known_eq (GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
), 16))
7807 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
      else if (aarch64_advsimd_partial_struct_mode_p (mode)
	       && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 8))
	pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
      else
	{
	  par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
	  for (i = 0; i < nregs; i++)
	    {
	      rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
				     V0_REGNUM + nvrn + i);
	      rtx offset = gen_int_mode
		(i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
	      tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
	      XVECEXP (par, 0, i) = tmp;
	    }
	  pcum->aapcs_reg = par;
	}

      /* C.3 NSRN is set to 8.  */
      pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;

  ncrn = pcum->aapcs_ncrn;
  nregs = size / UNITS_PER_WORD;

  /* C6 - C9, though the sign and zero extension semantics are
     handled elsewhere.  This is the case where the argument fits
     entirely in general registers.  */
  if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
    {
      gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);

      /* C.8 if the argument has an alignment of 16 then the NGRN is
	 rounded up to the next even number.  */

      /* Emit a warning if the alignment changed when taking the
	 'packed' attribute into account.  */
	  && ((abi_break_packed == 16 * BITS_PER_UNIT)
	      != (alignment == 16 * BITS_PER_UNIT)))
	inform (input_location, "parameter passing for argument of type "
		"%qT changed in GCC 13.1", type);

      /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
	 comparison is there because for > 16 * BITS_PER_UNIT
	 alignment nregs should be > 2 and therefore it should be
	 passed by reference rather than value.  */
      if (alignment == 16 * BITS_PER_UNIT)
	{
	  if (warn_pcs_change && abi_break)
	    inform (input_location, "parameter passing for argument of type "
		    "%qT changed in GCC 9.1", type);

	  gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
	}

      /* If an argument with an SVE mode needs to be shifted up to the
	 high part of the register, treat it as though it had an integer mode.
	 Using the normal (parallel [...]) would suppress the shifting.  */
	  && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
	  && aarch64_pad_reg_upward (mode, type, false))
	mode = int_mode_for_mode (mode).require ();

      /* NREGS can be 0 when e.g. an empty structure is to be passed.
	 A reg is still generated for it, but the caller should be smart
	 enough not to use it.  */
	  || (nregs == 1 && !sve_p)
	  || GET_MODE_CLASS (mode) == MODE_INT)
	pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
      else
	{
	  par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
	  for (i = 0; i < nregs; i++)
	    {
	      scalar_int_mode reg_mode = word_mode;
	      reg_mode = int_mode_for_mode (mode).require ();
	      rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
	      tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
				       GEN_INT (i * UNITS_PER_WORD));
	      XVECEXP (par, 0, i) = tmp;
	    }
	  pcum->aapcs_reg = par;
	}

      pcum->aapcs_nextncrn = ncrn + nregs;
      return;
    }

  pcum->aapcs_nextncrn = NUM_ARG_REGS;

  /* The argument is passed on stack; record the needed number of words for
     this argument and align the total size if necessary.  */
  pcum->aapcs_stack_words = size / UNITS_PER_WORD;

      && ((abi_break_packed >= 16 * BITS_PER_UNIT)
	  != (alignment >= 16 * BITS_PER_UNIT)))
    inform (input_location, "parameter passing for argument of type "
	    "%qT changed in GCC 13.1", type);

  if (alignment == 16 * BITS_PER_UNIT)
    {
      int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
      if (pcum->aapcs_stack_size != new_size)
	{
	  if (warn_pcs_change && abi_break)
	    inform (input_location, "parameter passing for argument of type "
		    "%qT changed in GCC 9.1", type);
	  pcum->aapcs_stack_size = new_size;
	}
    }
}
/* Implement TARGET_FUNCTION_ARG.  */

static rtx
aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
	      || pcum->pcs_variant == ARM_PCS_SIMD
	      || pcum->pcs_variant == ARM_PCS_SVE);

  if (arg.end_marker_p ())
    return gen_int_mode (pcum->pcs_variant, DImode);

  aarch64_layout_arg (pcum_v, arg);
  return pcum->aapcs_reg;
}
void
aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
			      const_tree fntype,
			      rtx libname ATTRIBUTE_UNUSED,
			      const_tree fndecl ATTRIBUTE_UNUSED,
			      unsigned n_named ATTRIBUTE_UNUSED,
			      bool silent_p)
{
  pcum->aapcs_ncrn = 0;
  pcum->aapcs_nvrn = 0;
  pcum->aapcs_nprn = 0;
  pcum->aapcs_nextncrn = 0;
  pcum->aapcs_nextnvrn = 0;
  pcum->aapcs_nextnprn = 0;
  if (fntype)
    pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
  else
    pcum->pcs_variant = ARM_PCS_AAPCS64;
  pcum->aapcs_reg = NULL_RTX;
  pcum->aapcs_arg_processed = false;
  pcum->aapcs_stack_words = 0;
  pcum->aapcs_stack_size = 0;
  pcum->silent_p = silent_p;

      && fntype && fntype != error_mark_node)
    {
      const_tree type = TREE_TYPE (fntype);
      machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
      int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
      if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
						   &mode, &nregs, NULL, false))
	aarch64_err_no_fpadvsimd (TYPE_MODE (type));
    }

      && pcum->pcs_variant == ARM_PCS_SVE)
    {
      /* We can't gracefully recover at this point, so make this a
	 fatal error.  */
      if (fndecl)
	fatal_error (input_location, "%qE requires the SVE ISA extension",
		     fndecl);
      else
	fatal_error (input_location, "calls to functions of type %qT require"
		     " the SVE ISA extension", fntype);
    }
}
static void
aarch64_function_arg_advance (cumulative_args_t pcum_v,
			      const function_arg_info &arg)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  if (pcum->pcs_variant == ARM_PCS_AAPCS64
      || pcum->pcs_variant == ARM_PCS_SIMD
      || pcum->pcs_variant == ARM_PCS_SVE)
    {
      aarch64_layout_arg (pcum_v, arg);
      gcc_assert ((pcum->aapcs_reg != NULL_RTX)
		  != (pcum->aapcs_stack_words != 0));
      pcum->aapcs_arg_processed = false;
      pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
      pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
      pcum->aapcs_nprn = pcum->aapcs_nextnprn;
      pcum->aapcs_stack_size += pcum->aapcs_stack_words;
      pcum->aapcs_stack_words = 0;
      pcum->aapcs_reg = NULL_RTX;
    }
}
bool
aarch64_function_arg_regno_p (unsigned regno)
{
  return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
	  || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
}
/* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
   PARM_BOUNDARY bits of alignment, but will be given anything up
   to STACK_BOUNDARY bits if the type requires it.  This makes sure
   that both before and after the layout of each argument, the Next
   Stacked Argument Address (NSAA) will have a minimum alignment of
   8 bytes.  */

static unsigned int
aarch64_function_arg_boundary (machine_mode mode, const_tree type)
{
  unsigned int abi_break;
  unsigned int abi_break_packed;
  unsigned int alignment = aarch64_function_arg_alignment (mode, type,
							    &abi_break,
							    &abi_break_packed);
  /* We rely on aarch64_layout_arg and aarch64_gimplify_va_arg_expr
     to emit warnings about ABI incompatibility.  */
  alignment = MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
  return alignment;
}
/* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE.  */

static fixed_size_mode
aarch64_get_reg_raw_mode (int regno)
{
  if (TARGET_SVE && FP_REGNUM_P (regno))
    /* Don't use the SVE part of the register for __builtin_apply and
       __builtin_return.  The SVE registers aren't used by the normal PCS,
       so using them there would be a waste of time.  The PCS extensions
       for SVE types are fundamentally incompatible with the
       __builtin_return/__builtin_apply interface.  */
    return as_a <fixed_size_mode> (V16QImode);
  return default_get_reg_raw_mode (regno);
}
/* Implement TARGET_FUNCTION_ARG_PADDING.

   Small aggregate types are placed in the lowest memory address.

   The related parameter passing rules are B.4, C.3, C.5 and C.14.  */

static pad_direction
aarch64_function_arg_padding (machine_mode mode, const_tree type)
{
  /* On little-endian targets, the least significant byte of every stack
     argument is passed at the lowest byte address of the stack slot.  */
  if (!BYTES_BIG_ENDIAN)
    return PAD_UPWARD;

  /* Otherwise, integral, floating-point and pointer types are padded downward:
     the least significant byte of a stack argument is passed at the highest
     byte address of the stack slot.  */
  if (type
      ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
	 || POINTER_TYPE_P (type))
      : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
    return PAD_DOWNWARD;

  /* Everything else padded upward, i.e. data in first byte of stack slot.  */
  return PAD_UPWARD;
}
/* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).

   It specifies padding for the last (may also be the only)
   element of a block move between registers and memory.  If
   assuming the block is in the memory, padding upward means that
   the last element is padded after its highest significant byte,
   while in downward padding, the last element is padded at its
   least significant byte side.

   Small aggregates and small complex types are always padded
   upwards.

   We don't need to worry about homogeneous floating-point or
   short-vector aggregates; their move is not affected by the
   padding direction determined here.  Regardless of endianness,
   each element of such an aggregate is put in the least
   significant bits of a fp/simd register.

   Return !BYTES_BIG_ENDIAN if the least significant byte of the
   register has useful data, and return the opposite if the most
   significant byte does.  */

bool
aarch64_pad_reg_upward (machine_mode mode, const_tree type,
			bool first ATTRIBUTE_UNUSED)
{
  /* Aside from pure scalable types, small composite types are always
     padded upward.  */
  if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
    {
      HOST_WIDE_INT size;
      if (type)
	size = int_size_in_bytes (type);
      else
	/* No frontends can create types with variable-sized modes, so we
	   shouldn't be asked to pass or return them.  */
	size = GET_MODE_SIZE (mode).to_constant ();
      if (size < 2 * UNITS_PER_WORD)
	{
	  pure_scalable_type_info pst_info;
	  if (pst_info.analyze_registers (type))
	    return false;
	  return true;
	}
    }

  /* Otherwise, use the default padding.  */
  return !BYTES_BIG_ENDIAN;
}
static scalar_int_mode
aarch64_libgcc_cmp_return_mode (void)
{
  return SImode;
}

#define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)

/* We use the 12-bit shifted immediate arithmetic instructions so values
   must be multiple of (1 << 12), i.e. 4096.  */
#define ARITH_FACTOR 4096

#if (PROBE_INTERVAL % ARITH_FACTOR) != 0
#error Cannot use simple address calculation for stack probing
#endif
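
/* Editorial note (sketch, not part of the original sources): with the
   default STACK_CHECK_PROBE_INTERVAL_EXP of 12, PROBE_INTERVAL is 4096 and
   matches ARITH_FACTOR exactly, so the check above passes:

     PROBE_INTERVAL % ARITH_FACTOR == 4096 % 4096 == 0

   Any interval that is not a multiple of 4096 could not be formed with a
   single 12-bit shifted immediate in the probing sequences below, which is
   what the #error guards against.  */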
/* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
   inclusive.  These are offsets from the current stack pointer.  */

static void
aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
{
  HOST_WIDE_INT size;
  if (!poly_size.is_constant (&size))
    {
      sorry ("stack probes for SVE frames");
      return;
    }

  rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REGNUM);

  /* See the same assertion on PROBE_INTERVAL above.  */
  gcc_assert ((first % ARITH_FACTOR) == 0);

  /* See if we have a constant small number of probes to generate.  If so,
     that's the easy case.  */
  if (size <= PROBE_INTERVAL)
    {
      const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);

      emit_set_insn (reg1,
		     plus_constant (Pmode,
				    stack_pointer_rtx, -(first + base)));
      emit_stack_probe (plus_constant (Pmode, reg1, base - size));
    }

  /* The run-time loop is made up of 8 insns in the generic case while the
     compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
  else if (size <= 4 * PROBE_INTERVAL)
    {
      HOST_WIDE_INT i, rem;

      emit_set_insn (reg1,
		     plus_constant (Pmode,
				    stack_pointer_rtx,
				    -(first + PROBE_INTERVAL)));
      emit_stack_probe (reg1);

      /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
	 it exceeds SIZE.  If only two probes are needed, this will not
	 generate any code.  Then probe at FIRST + SIZE.  */
      for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
	{
	  emit_set_insn (reg1,
			 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
	  emit_stack_probe (reg1);
	}

      rem = size - (i - PROBE_INTERVAL);
      if (rem > 256)
	{
	  const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);

	  emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
	  emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
	}
      else
	emit_stack_probe (plus_constant (Pmode, reg1, -rem));
    }

  /* Otherwise, do the same as above, but in a loop.  Note that we must be
     extra careful with variables wrapping around because we might be at
     the very top (or the very bottom) of the address space and we have
     to be able to handle this case properly; in particular, we use an
     equality test for the loop condition.  */
  else
    {
      rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REGNUM);

      /* Step 1: round SIZE to the previous multiple of the interval.  */

      HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;

      /* Step 2: compute initial and final value of the loop counter.  */

      /* TEST_ADDR = SP + FIRST.  */
      emit_set_insn (reg1,
		     plus_constant (Pmode, stack_pointer_rtx, -first));

      /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
      HOST_WIDE_INT adjustment = - (first + rounded_size);
      if (! aarch64_uimm12_shift (adjustment))
	{
	  aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
					  true, Pmode);
	  emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
	}
      else
	emit_set_insn (reg2,
		       plus_constant (Pmode, stack_pointer_rtx, adjustment));

      /* Step 3: the loop

	 do
	   {
	     TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
	     probe at TEST_ADDR
	   }
	 while (TEST_ADDR != LAST_ADDR)

	 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
	 until it is equal to ROUNDED_SIZE.  */

      emit_insn (gen_probe_stack_range (reg1, reg1, reg2));

      /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
	 that SIZE is equal to ROUNDED_SIZE.  */

      if (size != rounded_size)
	{
	  HOST_WIDE_INT rem = size - rounded_size;
	  if (rem > 256)
	    {
	      const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);

	      emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
	      emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
	    }
	  else
	    emit_stack_probe (plus_constant (Pmode, reg2, -rem));
	}
    }

  /* Make sure nothing is scheduled before we are done.  */
  emit_insn (gen_blockage ());
}
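
/* Editorial example (sketch, not part of the original sources): for a
   constant request with FIRST = 0 and SIZE = 10000 and the default
   PROBE_INTERVAL of 4096, the "small number of probes" path above emits
   roughly

     sub	x9, sp, #4096		// probe at FIRST + 1 * PROBE_INTERVAL
     str	xzr, [x9]
     sub	x9, x9, #4096		// probe at FIRST + 2 * PROBE_INTERVAL
     str	xzr, [x9]
     sub	x9, x9, #4096		// residual, rounded to ARITH_FACTOR
     str	xzr, [x9, #2288]	// final probe lands at FIRST + SIZE

   The register shown assumes PROBE_STACK_FIRST_REGNUM maps to x9 and the
   residual offset follows from 10000 - 8192 = 1808 rounded up to 4096; the
   listing is indicative only.  */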
/* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
   absolute addresses.  */

const char *
aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
{
  static int labelno = 0;
  char loop_lab[32];
  rtx xops[2];

  ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);

  /* Loop.  */
  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);

  HOST_WIDE_INT stack_clash_probe_interval
    = 1 << param_stack_clash_protection_guard_size;

  /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
  xops[0] = reg1;
  HOST_WIDE_INT interval;
  if (flag_stack_clash_protection)
    interval = stack_clash_probe_interval;
  else
    interval = PROBE_INTERVAL;

  gcc_assert (aarch64_uimm12_shift (interval));
  xops[1] = GEN_INT (interval);

  output_asm_insn ("sub\t%0, %0, %1", xops);

  /* If doing stack clash protection then we probe up by the ABI specified
     amount.  We do this because we're dropping full pages at a time in the
     loop.  But if we're doing non-stack clash probing, probe at SP 0.  */
  if (flag_stack_clash_protection)
    xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
  else
    xops[1] = CONST0_RTX (GET_MODE (xops[1]));

  /* Probe at TEST_ADDR.  If we're inside the loop it is always safe to probe
     by this amount for each iteration.  */
  output_asm_insn ("str\txzr, [%0, %1]", xops);

  /* Test if TEST_ADDR == LAST_ADDR.  */
  xops[1] = reg2;
  output_asm_insn ("cmp\t%0, %1", xops);

  /* Branch.  */
  fputs ("\tb.ne\t", asm_out_file);
  assemble_name_raw (asm_out_file, loop_lab);
  fputc ('\n', asm_out_file);

  return "";
}
/* Emit the probe loop for doing stack clash probes and stack adjustments for
   SVE.  This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
   of GUARD_SIZE.  When a probe is emitted it is done at most
   MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
   at most MIN_PROBE_THRESHOLD.  By the end of this function
   BASE = BASE - ADJUSTMENT.  */

static const char *
aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
				      rtx min_probe_threshold, rtx guard_size)
{
  /* This function is not allowed to use any instruction generation function
     like gen_ and friends.  If you do you'll likely ICE during CFG validation,
     so instead emit the code you want using output_asm_insn.  */
  gcc_assert (flag_stack_clash_protection);
  gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
  gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));

  /* The minimum required allocation before the residual requires probing.  */
  HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);

  /* Clamp the value down to the nearest value that can be used with a cmp.  */
  residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
  rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);

  gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
  gcc_assert (aarch64_uimm12_shift (residual_probe_guard));

  static int labelno = 0;
  char loop_start_lab[32];
  char loop_end_lab[32];
  rtx xops[2];

  ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
  ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);

  /* Emit loop start label.  */
  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);

  /* ADJUSTMENT < RESIDUAL_PROBE_GUARD.  */
  xops[0] = adjustment;
  xops[1] = probe_offset_value_rtx;
  output_asm_insn ("cmp\t%0, %1", xops);

  /* Branch to end if not enough adjustment to probe.  */
  fputs ("\tb.lt\t", asm_out_file);
  assemble_name_raw (asm_out_file, loop_end_lab);
  fputc ('\n', asm_out_file);

  /* BASE = BASE - RESIDUAL_PROBE_GUARD.  */
  xops[0] = base;
  xops[1] = probe_offset_value_rtx;
  output_asm_insn ("sub\t%0, %0, %1", xops);

  /* Probe at BASE.  */
  xops[1] = const0_rtx;
  output_asm_insn ("str\txzr, [%0, %1]", xops);

  /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD.  */
  xops[0] = adjustment;
  xops[1] = probe_offset_value_rtx;
  output_asm_insn ("sub\t%0, %0, %1", xops);

  /* Branch to start if still more bytes to allocate.  */
  fputs ("\tb\t", asm_out_file);
  assemble_name_raw (asm_out_file, loop_start_lab);
  fputc ('\n', asm_out_file);

  /* No probe leave.  */
  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);

  /* BASE = BASE - ADJUSTMENT.  */
  xops[0] = base;
  xops[1] = adjustment;
  output_asm_insn ("sub\t%0, %0, %1", xops);
  return "";
}
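
/* Editorial example (sketch, not part of the original sources): with a
   residual probe guard of 2048 bytes the loop printed above takes the form

     .SVLPSPL0:
	cmp	x11, #2048
	b.lt	.SVLPEND0
	sub	x10, x10, #2048
	str	xzr, [x10, 0]
	sub	x11, x11, #2048
	b	.SVLPSPL0
     .SVLPEND0:
	sub	x10, x10, x11

   where x10 stands in for BASE and x11 for ADJUSTMENT; the actual registers
   and the guard value depend entirely on the caller.  */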
/* Determine whether a frame chain needs to be generated.  */
static bool
aarch64_needs_frame_chain (void)
{
  /* Force a frame chain for EH returns so the return address is at FP+8.  */
  if (frame_pointer_needed || crtl->calls_eh_return)
    return true;

  /* A leaf function cannot have calls or write LR.  */
  bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);

  /* Don't use a frame chain in leaf functions if leaf frame pointers
     are disabled.  */
  if (flag_omit_leaf_frame_pointer && is_leaf)
    return false;

  return aarch64_use_frame_pointer;
}
/* Mark the registers that need to be saved by the callee and calculate
   the size of the callee-saved registers area and frame record (both FP
   and LR may be omitted).  */
static void
aarch64_layout_frame (void)
{
  poly_int64 offset = 0;
  int regno, last_fp_reg = INVALID_REGNUM;
  machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
  poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
  bool frame_related_fp_reg_p = false;
  aarch64_frame &frame = cfun->machine->frame;

  frame.emit_frame_chain = aarch64_needs_frame_chain ();

  /* Adjust the outgoing arguments size if required.  Keep it in sync with what
     the mid-end is doing.  */
  crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);

#define SLOT_NOT_REQUIRED (-2)
#define SLOT_REQUIRED     (-1)

  frame.wb_push_candidate1 = INVALID_REGNUM;
  frame.wb_push_candidate2 = INVALID_REGNUM;
  frame.spare_pred_reg = INVALID_REGNUM;

  /* First mark all the registers that really need to be saved...  */
  for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    frame.reg_offset[regno] = SLOT_NOT_REQUIRED;

  /* ... that includes the eh data registers (if needed)...  */
  if (crtl->calls_eh_return)
    for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
      frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;

  /* ... and any callee saved register that dataflow says is live.  */
  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
    if (df_regs_ever_live_p (regno)
	&& !fixed_regs[regno]
	&& (regno == R30_REGNUM
	    || !crtl->abi->clobbers_full_reg_p (regno)))
      frame.reg_offset[regno] = SLOT_REQUIRED;

  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
    if (df_regs_ever_live_p (regno)
	&& !fixed_regs[regno]
	&& !crtl->abi->clobbers_full_reg_p (regno))
      {
	frame.reg_offset[regno] = SLOT_REQUIRED;
	last_fp_reg = regno;
	if (aarch64_emit_cfi_for_reg_p (regno))
	  frame_related_fp_reg_p = true;
      }

  /* Big-endian SVE frames need a spare predicate register in order
     to save Z8-Z15.  Decide which register they should use.  Prefer
     an unused argument register if possible, so that we don't force P4
     to be saved unnecessarily.  */
  if (frame_related_fp_reg_p
      && crtl->abi->id () == ARM_PCS_SVE
      && BYTES_BIG_ENDIAN)
    {
      bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
      bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
      for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
	if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
	  break;
      gcc_assert (regno <= P7_REGNUM);
      frame.spare_pred_reg = regno;
      df_set_regs_ever_live (regno, true);
    }

  for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
    if (df_regs_ever_live_p (regno)
	&& !fixed_regs[regno]
	&& !crtl->abi->clobbers_full_reg_p (regno))
      frame.reg_offset[regno] = SLOT_REQUIRED;

  /* With stack-clash, LR must be saved in non-leaf functions.  The saving of
     LR counts as an implicit probe which allows us to maintain the invariant
     described in the comment at expand_prologue.  */
  gcc_assert (crtl->is_leaf
	      || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));

  /* Now assign stack slots for the registers.  Start with the predicate
     registers, since predicate LDR and STR have a relatively small
     offset range.  These saves happen below the hard frame pointer.  */
  for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
    if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
      {
	frame.reg_offset[regno] = offset;
	offset += BYTES_PER_SVE_PRED;
      }

  if (maybe_ne (offset, 0))
    {
      /* If we have any vector registers to save above the predicate registers,
	 the offset of the vector register save slots need to be a multiple
	 of the vector size.  This lets us use the immediate forms of LDR/STR
	 (or LD1/ST1 for big-endian).

	 A vector register is 8 times the size of a predicate register,
	 and we need to save a maximum of 12 predicate registers, so the
	 first vector register will be at either #1, MUL VL or #2, MUL VL.

	 If we don't have any vector registers to save, and we know how
	 big the predicate save area is, we can just round it up to the
	 next 16-byte boundary.  */
      if (last_fp_reg == (int) INVALID_REGNUM && offset.is_constant ())
	offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
      else
	{
	  if (known_le (offset, vector_save_size))
	    offset = vector_save_size;
	  else if (known_le (offset, vector_save_size * 2))
	    offset = vector_save_size * 2;
	  else
	    gcc_unreachable ();
	}
    }

  /* If we need to save any SVE vector registers, add them next.  */
  if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
    for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
      if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
	{
	  frame.reg_offset[regno] = offset;
	  offset += vector_save_size;
	}

  /* OFFSET is now the offset of the hard frame pointer from the bottom
     of the callee save area.  */
  bool saves_below_hard_fp_p = maybe_ne (offset, 0);
  frame.below_hard_fp_saved_regs_size = offset;
  if (frame.emit_frame_chain)
    {
      /* FP and LR are placed in the linkage record.  */
      frame.reg_offset[R29_REGNUM] = offset;
      frame.wb_push_candidate1 = R29_REGNUM;
      frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
      frame.wb_push_candidate2 = R30_REGNUM;
      offset += 2 * UNITS_PER_WORD;
    }

  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
    if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
      {
	frame.reg_offset[regno] = offset;
	if (frame.wb_push_candidate1 == INVALID_REGNUM)
	  frame.wb_push_candidate1 = regno;
	else if (frame.wb_push_candidate2 == INVALID_REGNUM)
	  frame.wb_push_candidate2 = regno;
	offset += UNITS_PER_WORD;
      }

  poly_int64 max_int_offset = offset;
  offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
  bool has_align_gap = maybe_ne (offset, max_int_offset);

  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
    if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
      {
	/* If there is an alignment gap between integer and fp callee-saves,
	   allocate the last fp register to it if possible.  */
	if (regno == last_fp_reg
	    && has_align_gap
	    && known_eq (vector_save_size, 8)
	    && multiple_p (offset, 16))
	  {
	    frame.reg_offset[regno] = max_int_offset;
	    continue;
	  }

	frame.reg_offset[regno] = offset;
	if (frame.wb_push_candidate1 == INVALID_REGNUM)
	  frame.wb_push_candidate1 = regno;
	else if (frame.wb_push_candidate2 == INVALID_REGNUM
		 && frame.wb_push_candidate1 >= V0_REGNUM)
	  frame.wb_push_candidate2 = regno;
	offset += vector_save_size;
      }

  offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);

  frame.saved_regs_size = offset;

  poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;

  poly_int64 above_outgoing_args
    = aligned_upper_bound (varargs_and_saved_regs_size
			   + get_frame_size (),
			   STACK_BOUNDARY / BITS_PER_UNIT);

  frame.hard_fp_offset
    = above_outgoing_args - frame.below_hard_fp_saved_regs_size;

  /* Both these values are already aligned.  */
  gcc_assert (multiple_p (crtl->outgoing_args_size,
			  STACK_BOUNDARY / BITS_PER_UNIT));
  frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;

  frame.locals_offset = frame.saved_varargs_size;

  frame.initial_adjust = 0;
  frame.final_adjust = 0;
  frame.callee_adjust = 0;
  frame.sve_callee_adjust = 0;
  frame.callee_offset = 0;

  frame.wb_pop_candidate1 = frame.wb_push_candidate1;
  frame.wb_pop_candidate2 = frame.wb_push_candidate2;

  /* Shadow call stack only deals with functions where the LR is pushed
     onto the stack and without specifying the "no_sanitize" attribute
     with the argument "shadow-call-stack".  */
  frame.is_scs_enabled
    = (!crtl->calls_eh_return
       && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK)
       && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0));

  /* When shadow call stack is enabled, the scs_pop in the epilogue will
     restore x30, and we don't need to pop x30 again in the traditional
     way.  Pop candidates record the registers that need to be popped
     eventually.  */
  if (frame.is_scs_enabled)
    {
      if (frame.wb_pop_candidate2 == R30_REGNUM)
	frame.wb_pop_candidate2 = INVALID_REGNUM;
      else if (frame.wb_pop_candidate1 == R30_REGNUM)
	frame.wb_pop_candidate1 = INVALID_REGNUM;
    }

  /* If candidate2 is INVALID_REGNUM, we need to adjust max_push_offset to
     256 to ensure that the offset meets the requirements of emit_move_insn.
     Similarly, if candidate1 is INVALID_REGNUM, we need to set
     max_push_offset to 0, because no registers are popped at this time,
     so callee_adjust cannot be adjusted.  */
  HOST_WIDE_INT max_push_offset = 0;
  if (frame.wb_pop_candidate2 != INVALID_REGNUM)
    max_push_offset = 512;
  else if (frame.wb_pop_candidate1 != INVALID_REGNUM)
    max_push_offset = 256;

  HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
  HOST_WIDE_INT const_saved_regs_size;
  if (frame.frame_size.is_constant (&const_size)
      && const_size < max_push_offset
      && known_eq (frame.hard_fp_offset, const_size))
    {
      /* Simple, small frame with no outgoing arguments:

	 stp reg1, reg2, [sp, -frame_size]!
	 stp reg3, reg4, [sp, 16]  */
      frame.callee_adjust = const_size;
    }
  else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
	   && frame.saved_regs_size.is_constant (&const_saved_regs_size)
	   && const_outgoing_args_size + const_saved_regs_size < 512
	   /* We could handle this case even with outgoing args, provided
	      that the number of args left us with valid offsets for all
	      predicate and vector save slots.  It's such a rare case that
	      it hardly seems worth the effort though.  */
	   && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
	   && !(cfun->calls_alloca
		&& frame.hard_fp_offset.is_constant (&const_fp_offset)
		&& const_fp_offset < max_push_offset))
    {
      /* Frame with small outgoing arguments:

	 sub sp, sp, frame_size
	 stp reg1, reg2, [sp, outgoing_args_size]
	 stp reg3, reg4, [sp, outgoing_args_size + 16]  */
      frame.initial_adjust = frame.frame_size;
      frame.callee_offset = const_outgoing_args_size;
    }
  else if (saves_below_hard_fp_p
	   && known_eq (frame.saved_regs_size,
			frame.below_hard_fp_saved_regs_size))
    {
      /* Frame in which all saves are SVE saves:

	 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
	 save SVE registers relative to SP
	 sub sp, sp, outgoing_args_size  */
      frame.initial_adjust = (frame.hard_fp_offset
			      + frame.below_hard_fp_saved_regs_size);
      frame.final_adjust = crtl->outgoing_args_size;
    }
  else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
	   && const_fp_offset < max_push_offset)
    {
      /* Frame with large outgoing arguments or SVE saves, but with
	 a small local area:

	 stp reg1, reg2, [sp, -hard_fp_offset]!
	 stp reg3, reg4, [sp, 16]
	 [sub sp, sp, below_hard_fp_saved_regs_size]
	 [save SVE registers relative to SP]
	 sub sp, sp, outgoing_args_size  */
      frame.callee_adjust = const_fp_offset;
      frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
      frame.final_adjust = crtl->outgoing_args_size;
    }
  else
    {
      /* Frame with large local area and outgoing arguments or SVE saves,
	 using frame pointer:

	 sub sp, sp, hard_fp_offset
	 stp x29, x30, [sp, 0]
	 add x29, sp, 0
	 stp reg3, reg4, [sp, 16]
	 [sub sp, sp, below_hard_fp_saved_regs_size]
	 [save SVE registers relative to SP]
	 sub sp, sp, outgoing_args_size  */
      frame.initial_adjust = frame.hard_fp_offset;
      frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
      frame.final_adjust = crtl->outgoing_args_size;
    }

  /* Make sure the individual adjustments add up to the full frame size.  */
  gcc_assert (known_eq (frame.initial_adjust
			+ frame.callee_adjust
			+ frame.sve_callee_adjust
			+ frame.final_adjust, frame.frame_size));

  if (!frame.emit_frame_chain && frame.callee_adjust == 0)
    {
      /* We've decided not to associate any register saves with the initial
	 stack allocation.  */
      frame.wb_pop_candidate1 = frame.wb_push_candidate1 = INVALID_REGNUM;
      frame.wb_pop_candidate2 = frame.wb_push_candidate2 = INVALID_REGNUM;
    }

  frame.laid_out = true;
}
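
/* Editorial worked example (sketch, not derived from any particular
   function): suppose a non-leaf function saves x29, x30, x19 and x20, has
   32 bytes of locals and no stack-passed outgoing arguments.  The layout
   above then gives roughly

     below_hard_fp_saved_regs_size = 0
     reg_offset[x29] = 0,  reg_offset[x30] = 8
     reg_offset[x19] = 16, reg_offset[x20] = 24
     saved_regs_size = 32, hard_fp_offset = 64, frame_size = 64

   and because frame_size (64) is below max_push_offset and equals
   hard_fp_offset, the first case is chosen: callee_adjust = 64, so the
   prologue becomes "stp x29, x30, [sp, -64]!" followed by
   "stp x19, x20, [sp, 16]".  Exact numbers depend on get_frame_size ()
   rounding and are only meant to illustrate how the fields relate.  */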
/* Return true if the register REGNO is saved on entry to
   the current function.  */

static bool
aarch64_register_saved_on_entry (int regno)
{
  return known_ge (cfun->machine->frame.reg_offset[regno], 0);
}

/* Return the next register up from REGNO up to LIMIT for the callee
   to save.  */

static unsigned
aarch64_next_callee_save (unsigned regno, unsigned limit)
{
  while (regno <= limit && !aarch64_register_saved_on_entry (regno))
    regno++;
  return regno;
}
/* Push the register number REGNO of mode MODE to the stack with write-back
   adjusting the stack by ADJUSTMENT.  */

static void
aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
			   HOST_WIDE_INT adjustment)
{
  rtx base_rtx = stack_pointer_rtx;
  rtx insn, reg, mem;

  reg = gen_rtx_REG (mode, regno);
  mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
			    plus_constant (Pmode, base_rtx, -adjustment));
  mem = gen_frame_mem (mode, mem);

  insn = emit_move_insn (mem, reg);
  RTX_FRAME_RELATED_P (insn) = 1;
}
/* Generate and return an instruction to store the pair of registers
   REG and REG2 of mode MODE to location BASE with write-back adjusting
   the stack location BASE by ADJUSTMENT.  */

static rtx
aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
			  HOST_WIDE_INT adjustment)
{
  switch (mode)
    {
    case E_DImode:
      return gen_storewb_pairdi_di (base, base, reg, reg2,
				    GEN_INT (-adjustment),
				    GEN_INT (UNITS_PER_WORD - adjustment));
    case E_DFmode:
      return gen_storewb_pairdf_di (base, base, reg, reg2,
				    GEN_INT (-adjustment),
				    GEN_INT (UNITS_PER_WORD - adjustment));
    case E_TFmode:
      return gen_storewb_pairtf_di (base, base, reg, reg2,
				    GEN_INT (-adjustment),
				    GEN_INT (UNITS_PER_VREG - adjustment));
    default:
      gcc_unreachable ();
    }
}
/* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
   stack pointer by ADJUSTMENT.  */

static void
aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
{
  rtx_insn *insn;
  machine_mode mode = aarch64_reg_save_mode (regno1);

  if (regno2 == INVALID_REGNUM)
    return aarch64_pushwb_single_reg (mode, regno1, adjustment);

  rtx reg1 = gen_rtx_REG (mode, regno1);
  rtx reg2 = gen_rtx_REG (mode, regno2);

  insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
					      reg2, adjustment));
  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
  RTX_FRAME_RELATED_P (insn) = 1;
}
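
/* Editorial example (sketch, not part of the original sources): pushing
   x19/x20 with an adjustment of 32 bytes ultimately produces the single
   instruction

     stp	x19, x20, [sp, #-32]!

   which the storewb pattern models as a parallel of the SP update plus the
   two stores; the two XVECEXP markings above flag the stores themselves as
   frame related so the unwinder records both save slots.  */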
/* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
   adjusting it by ADJUSTMENT afterwards.  */

static rtx
aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
			 HOST_WIDE_INT adjustment)
{
  switch (mode)
    {
    case E_DImode:
      return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
				   GEN_INT (UNITS_PER_WORD));
    case E_DFmode:
      return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
				   GEN_INT (UNITS_PER_WORD));
    case E_TFmode:
      return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
				   GEN_INT (UNITS_PER_VREG));
    default:
      gcc_unreachable ();
    }
}
/* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
   afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
   into CFI_OPS.  */

static void
aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
		  rtx *cfi_ops)
{
  machine_mode mode = aarch64_reg_save_mode (regno1);
  rtx reg1 = gen_rtx_REG (mode, regno1);

  *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);

  if (regno2 == INVALID_REGNUM)
    {
      rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
      mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
      emit_move_insn (reg1, gen_frame_mem (mode, mem));
    }
  else
    {
      rtx reg2 = gen_rtx_REG (mode, regno2);
      *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
      emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
					  reg2, adjustment));
    }
}
/* Generate and return a store pair instruction of mode MODE to store
   register REG1 to MEM1 and register REG2 to MEM2.  */

static rtx
aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
			rtx reg2)
{
  switch (mode)
    {
    case E_DImode:
      return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);

    case E_DFmode:
      return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);

    case E_TFmode:
      return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);

    case E_V4SImode:
      return gen_vec_store_pairv4siv4si (mem1, reg1, mem2, reg2);

    case E_V16QImode:
      return gen_vec_store_pairv16qiv16qi (mem1, reg1, mem2, reg2);

    default:
      gcc_unreachable ();
    }
}
/* Generate and return a load pair instruction of mode MODE to load register
   REG1 from MEM1 and register REG2 from MEM2.  */

static rtx
aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
		       rtx mem2)
{
  switch (mode)
    {
    case E_DImode:
      return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);

    case E_DFmode:
      return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);

    case E_TFmode:
      return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);

    case E_V4SImode:
      return gen_load_pairv4siv4si (reg1, mem1, reg2, mem2);

    default:
      gcc_unreachable ();
    }
}
/* Return TRUE if return address signing should be enabled for the current
   function, otherwise return FALSE.  */

bool
aarch64_return_address_signing_enabled (void)
{
  /* This function should only be called after frame laid out.  */
  gcc_assert (cfun->machine->frame.laid_out);

  /* Turn return address signing off in any function that uses
     __builtin_eh_return.  The address passed to __builtin_eh_return
     is not signed so either it has to be signed (with original sp)
     or the code path that uses it has to avoid authenticating it.
     Currently eh return introduces a return to anywhere gadget, no
     matter what we do here since it uses ret with user provided
     address.  An ideal fix for that is to use indirect branch which
     can be protected with BTI j (to some extent).  */
  if (crtl->calls_eh_return)
    return false;

  /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
     if its LR is pushed onto stack.  */
  return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
	  || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
	      && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
}
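
/* Editorial note (sketch, not part of the original sources): when this
   predicate returns true, the prologue signs LR before saving it and the
   epilogue authenticates it after the restore, e.g.

     paciasp
     stp	x29, x30, [sp, -16]!
     ...
     ldp	x29, x30, [sp], 16
     autiasp
     ret

   The actual instruction selection (including combined forms such as
   retaa) is handled elsewhere in this file; the snippet only illustrates
   the effect of the predicate.  */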
/* Return TRUE if Branch Target Identification Mechanism is enabled.  */
bool
aarch64_bti_enabled (void)
{
  return (aarch64_enable_bti == 1);
}
/* The caller is going to use ST1D or LD1D to save or restore an SVE
   register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
   the range [1, 16] * GET_MODE_SIZE (MODE).  Prepare for this by:

     (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
	 or LD1D address

     (2) setting PRED to a valid predicate register for the ST1D or LD1D,
	 if the variable isn't already nonnull

   (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
   Handle this case using a temporary base register that is suitable for
   all offsets in that range.  Use ANCHOR_REG as this base register if it
   is nonnull, otherwise create a new register and store it in ANCHOR_REG.  */

static void
aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
				     rtx &anchor_reg, poly_int64 &offset,
				     rtx &ptrue)
{
  if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
    {
      /* This is the maximum valid offset of the anchor from the base.
	 Lower values would be valid too.  */
      poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
      if (!anchor_reg)
	{
	  anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
	  emit_insn (gen_add3_insn (anchor_reg, base_rtx,
				    gen_int_mode (anchor_offset, Pmode)));
	}
      base_rtx = anchor_reg;
      offset -= anchor_offset;
    }
  if (!ptrue)
    {
      int pred_reg = cfun->machine->frame.spare_pred_reg;
      emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
		      CONSTM1_RTX (VNx16BImode));
      ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
    }
}
/* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
   is saved at BASE + OFFSET.  */

static void
aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
			    rtx base, poly_int64 offset)
{
  rtx mem = gen_frame_mem (GET_MODE (reg),
			   plus_constant (Pmode, base, offset));
  add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
}
/* Emit code to save the callee-saved registers from register number START
   to LIMIT to the stack at the location starting at offset START_OFFSET,
   skipping any write-back candidates if SKIP_WB is true.  HARD_FP_VALID_P
   is true if the hard frame pointer has been set up.  */

static void
aarch64_save_callee_saves (poly_int64 start_offset,
			   unsigned start, unsigned limit, bool skip_wb,
			   bool hard_fp_valid_p)
{
  rtx_insn *insn;
  unsigned regno;
  unsigned regno2;
  rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;

  for (regno = aarch64_next_callee_save (start, limit);
       regno <= limit;
       regno = aarch64_next_callee_save (regno + 1, limit))
    {
      rtx reg, mem;
      poly_int64 offset;
      bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);

      if (skip_wb
	  && (regno == cfun->machine->frame.wb_push_candidate1
	      || regno == cfun->machine->frame.wb_push_candidate2))
	continue;

      if (cfun->machine->reg_is_wrapped_separately[regno])
	continue;

      machine_mode mode = aarch64_reg_save_mode (regno);
      reg = gen_rtx_REG (mode, regno);
      offset = start_offset + cfun->machine->frame.reg_offset[regno];
      rtx base_rtx = stack_pointer_rtx;
      poly_int64 sp_offset = offset;

      HOST_WIDE_INT const_offset;
      if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
	aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
					     offset, ptrue);
      else if (GP_REGNUM_P (regno)
	       && (!offset.is_constant (&const_offset) || const_offset >= 512))
	{
	  gcc_assert (known_eq (start_offset, 0));
	  poly_int64 fp_offset
	    = cfun->machine->frame.below_hard_fp_saved_regs_size;
	  if (hard_fp_valid_p)
	    base_rtx = hard_frame_pointer_rtx;
	  else
	    {
	      if (!anchor_reg)
		{
		  anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
		  emit_insn (gen_add3_insn (anchor_reg, base_rtx,
					    gen_int_mode (fp_offset, Pmode)));
		}
	      base_rtx = anchor_reg;
	    }
	  offset -= fp_offset;
	}
      mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
      bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);

      if (!aarch64_sve_mode_p (mode)
	  && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
	  && !cfun->machine->reg_is_wrapped_separately[regno2]
	  && known_eq (GET_MODE_SIZE (mode),
		       cfun->machine->frame.reg_offset[regno2]
		       - cfun->machine->frame.reg_offset[regno]))
	{
	  rtx reg2 = gen_rtx_REG (mode, regno2);
	  rtx mem2;

	  offset += GET_MODE_SIZE (mode);
	  mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
	  insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
						    reg2));

	  /* The first part of a frame-related parallel insn is
	     always assumed to be relevant to the frame
	     calculations; subsequent parts, are only
	     frame-related if explicitly marked.  */
	  if (aarch64_emit_cfi_for_reg_p (regno2))
	    {
	      if (need_cfa_note_p)
		aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
					    sp_offset + GET_MODE_SIZE (mode));
	      else
		RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
	    }

	  regno = regno2;
	}
      else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
	{
	  insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
	  need_cfa_note_p = true;
	}
      else if (aarch64_sve_mode_p (mode))
	insn = emit_insn (gen_rtx_SET (mem, reg));
      else
	insn = emit_move_insn (mem, reg);

      RTX_FRAME_RELATED_P (insn) = frame_related_p;
      if (frame_related_p && need_cfa_note_p)
	aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
    }
}
/* Emit code to restore the callee registers from register number START
   up to and including LIMIT.  Restore from the stack offset START_OFFSET,
   skipping any write-back candidates if SKIP_WB is true.  Write the
   appropriate REG_CFA_RESTORE notes into CFI_OPS.  */

static void
aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
			      unsigned limit, bool skip_wb, rtx *cfi_ops)
{
  unsigned regno;
  unsigned regno2;
  poly_int64 offset;
  rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;

  for (regno = aarch64_next_callee_save (start, limit);
       regno <= limit;
       regno = aarch64_next_callee_save (regno + 1, limit))
    {
      bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
      if (cfun->machine->reg_is_wrapped_separately[regno])
	continue;

      rtx reg, mem;

      if (skip_wb
	  && (regno == cfun->machine->frame.wb_pop_candidate1
	      || regno == cfun->machine->frame.wb_pop_candidate2))
	continue;

      machine_mode mode = aarch64_reg_save_mode (regno);
      reg = gen_rtx_REG (mode, regno);
      offset = start_offset + cfun->machine->frame.reg_offset[regno];
      rtx base_rtx = stack_pointer_rtx;
      if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
	aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
					     offset, ptrue);
      mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));

      if (!aarch64_sve_mode_p (mode)
	  && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
	  && !cfun->machine->reg_is_wrapped_separately[regno2]
	  && known_eq (GET_MODE_SIZE (mode),
		       cfun->machine->frame.reg_offset[regno2]
		       - cfun->machine->frame.reg_offset[regno]))
	{
	  rtx reg2 = gen_rtx_REG (mode, regno2);
	  rtx mem2;

	  offset += GET_MODE_SIZE (mode);
	  mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
	  emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));

	  *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
	  regno = regno2;
	}
      else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
	emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
      else if (aarch64_sve_mode_p (mode))
	emit_insn (gen_rtx_SET (reg, mem));
      else
	emit_move_insn (reg, mem);
      if (frame_related_p)
	*cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
    }
}
/* Return true if OFFSET is a signed 4-bit value multiplied by the size
   of mode MODE.  */

static bool
offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, -8, 7));
}

/* Return true if OFFSET is a signed 6-bit value multiplied by the size
   of mode MODE.  */

static bool
offset_6bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, -32, 31));
}

/* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
   of mode MODE.  */

static bool
offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, 0, 63));
}

/* Return true if OFFSET is a signed 7-bit value multiplied by the size
   of mode MODE.  */

bool
aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, -64, 63));
}

/* Return true if OFFSET is a signed 9-bit value.  */

bool
aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
				       poly_int64 offset)
{
  HOST_WIDE_INT const_offset;
  return (offset.is_constant (&const_offset)
	  && IN_RANGE (const_offset, -256, 255));
}

/* Return true if OFFSET is a signed 9-bit value multiplied by the size
   of mode MODE.  */

static bool
offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, -256, 255));
}

/* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
   of mode MODE.  */

static bool
offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, 0, 4095));
}
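
/* Editorial example (sketch, not part of the original sources): for DImode
   (8-byte) accesses, offset_12bit_unsigned_scaled_p accepts offsets 0, 8,
   16, ..., 32760, since 32760 / 8 = 4095 is the largest in-range multiple,
   while offset_9bit_signed_scaled_p accepts -2048 .. 2040 in steps of 8.
   An offset such as 12 is rejected by both because it is not a multiple of
   the mode size.  */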
/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */

static sbitmap
aarch64_get_separate_components (void)
{
  sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
  bitmap_clear (components);

  /* The registers we need saved to the frame.  */
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if (aarch64_register_saved_on_entry (regno))
      {
	/* Punt on saves and restores that use ST1D and LD1D.  We could
	   try to be smarter, but it would involve making sure that the
	   spare predicate register itself is safe to use at the save
	   and restore points.  Also, when a frame pointer is being used,
	   the slots are often out of reach of ST1D and LD1D anyway.  */
	machine_mode mode = aarch64_reg_save_mode (regno);
	if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
	  continue;

	poly_int64 offset = cfun->machine->frame.reg_offset[regno];

	/* If the register is saved in the first SVE save slot, we use
	   it as a stack probe for -fstack-clash-protection.  */
	if (flag_stack_clash_protection
	    && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
	    && known_eq (offset, 0))
	  continue;

	/* Get the offset relative to the register we'll use.  */
	if (frame_pointer_needed)
	  offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
	else
	  offset += crtl->outgoing_args_size;

	/* Check that we can access the stack slot of the register with one
	   direct load with no adjustments needed.  */
	if (aarch64_sve_mode_p (mode)
	    ? offset_9bit_signed_scaled_p (mode, offset)
	    : offset_12bit_unsigned_scaled_p (mode, offset))
	  bitmap_set_bit (components, regno);
      }

  /* Don't mess with the hard frame pointer.  */
  if (frame_pointer_needed)
    bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);

  /* If the spare predicate register used by big-endian SVE code
     is call-preserved, it must be saved in the main prologue
     before any saves that use it.  */
  if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
    bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);

  unsigned reg1 = cfun->machine->frame.wb_push_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_push_candidate2;
  /* If registers have been chosen to be stored/restored with
     writeback don't interfere with them to avoid having to output explicit
     stack adjustment instructions.  */
  if (reg2 != INVALID_REGNUM)
    bitmap_clear_bit (components, reg2);
  if (reg1 != INVALID_REGNUM)
    bitmap_clear_bit (components, reg1);

  bitmap_clear_bit (components, LR_REGNUM);
  bitmap_clear_bit (components, SP_REGNUM);

  return components;
}
/* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */

static sbitmap
aarch64_components_for_bb (basic_block bb)
{
  bitmap in = DF_LIVE_IN (bb);
  bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
  bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;

  sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
  bitmap_clear (components);

  /* Clobbered registers don't generate values in any meaningful sense,
     since nothing after the clobber can rely on their value.  And we can't
     say that partially-clobbered registers are unconditionally killed,
     because whether they're killed or not depends on the mode of the
     value they're holding.  Thus partially call-clobbered registers
     appear in neither the kill set nor the gen set.

     Check manually for any calls that clobber more of a register than the
     current function can.  */
  function_abi_aggregator callee_abis;
  rtx_insn *insn;
  FOR_BB_INSNS (bb, insn)
    if (CALL_P (insn))
      callee_abis.note_callee_abi (insn_callee_abi (insn));
  HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);

  /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if (!fixed_regs[regno]
	&& !crtl->abi->clobbers_full_reg_p (regno)
	&& (TEST_HARD_REG_BIT (extra_caller_saves, regno)
	    || bitmap_bit_p (in, regno)
	    || bitmap_bit_p (gen, regno)
	    || bitmap_bit_p (kill, regno)))
      {
	bitmap_set_bit (components, regno);

	/* If there is a callee-save at an adjacent offset, add it too
	   to increase the use of LDP/STP.  */
	poly_int64 offset = cfun->machine->frame.reg_offset[regno];
	unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;

	if (regno2 <= LAST_SAVED_REGNUM)
	  {
	    poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
	    if (multiple_p (offset, 16)
		? known_eq (offset + 8, offset2)
		: multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
	      bitmap_set_bit (components, regno2);
	  }
      }

  return components;
}
/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
   Nothing to do for aarch64.  */

static void
aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
{
}

/* Return the next set bit in BMP from START onwards.  Return the total number
   of bits in BMP if no set bit is found at or after START.  */

static unsigned int
aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
{
  unsigned int nbits = SBITMAP_SIZE (bmp);
  if (start == nbits)
    return start;

  gcc_assert (start < nbits);
  for (unsigned int i = start; i < nbits; i++)
    if (bitmap_bit_p (bmp, i))
      return i;

  return nbits;
}
/* Do the work for aarch64_emit_prologue_components and
   aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
   to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
   for these components or the epilogue sequence.  That is, it determines
   whether we should emit stores or loads and what kind of CFA notes to attach
   to the insns.  Otherwise the logic for the two sequences is very
   similar.  */

static void
aarch64_process_components (sbitmap components, bool prologue_p)
{
  rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
			     ? HARD_FRAME_POINTER_REGNUM
			     : STACK_POINTER_REGNUM);

  unsigned last_regno = SBITMAP_SIZE (components);
  unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
  rtx_insn *insn = NULL;

  while (regno != last_regno)
    {
      bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
      machine_mode mode = aarch64_reg_save_mode (regno);

      rtx reg = gen_rtx_REG (mode, regno);
      poly_int64 offset = cfun->machine->frame.reg_offset[regno];
      if (frame_pointer_needed)
	offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
      else
	offset += crtl->outgoing_args_size;

      rtx addr = plus_constant (Pmode, ptr_reg, offset);
      rtx mem = gen_frame_mem (mode, addr);

      rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
      unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
      /* No more registers to handle after REGNO.
	 Emit a single save/restore and exit.  */
      if (regno2 == last_regno)
	{
	  insn = emit_insn (set);
	  if (frame_related_p)
	    {
	      RTX_FRAME_RELATED_P (insn) = 1;
	      if (prologue_p)
		add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
	      else
		add_reg_note (insn, REG_CFA_RESTORE, reg);
	    }
	  break;
	}

      poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
      /* The next register is not of the same class or its offset is not
	 mergeable with the current one into a pair.  */
      if (aarch64_sve_mode_p (mode)
	  || !satisfies_constraint_Ump (mem)
	  || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
	  || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
	  || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
		       GET_MODE_SIZE (mode)))
	{
	  insn = emit_insn (set);
	  if (frame_related_p)
	    {
	      RTX_FRAME_RELATED_P (insn) = 1;
	      if (prologue_p)
		add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
	      else
		add_reg_note (insn, REG_CFA_RESTORE, reg);
	    }

	  regno = regno2;
	  continue;
	}

      bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);

      /* REGNO2 can be saved/restored in a pair with REGNO.  */
      rtx reg2 = gen_rtx_REG (mode, regno2);
      if (frame_pointer_needed)
	offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
      else
	offset2 += crtl->outgoing_args_size;
      rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
      rtx mem2 = gen_frame_mem (mode, addr2);
      rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
			    : gen_rtx_SET (reg2, mem2);

      if (prologue_p)
	insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
      else
	insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));

      if (frame_related_p || frame_related2_p)
	{
	  RTX_FRAME_RELATED_P (insn) = 1;
	  if (prologue_p)
	    {
	      if (frame_related_p)
		add_reg_note (insn, REG_CFA_OFFSET, set);
	      if (frame_related2_p)
		add_reg_note (insn, REG_CFA_OFFSET, set2);
	    }
	  else
	    {
	      if (frame_related_p)
		add_reg_note (insn, REG_CFA_RESTORE, reg);
	      if (frame_related2_p)
		add_reg_note (insn, REG_CFA_RESTORE, reg2);
	    }
	}

      regno = aarch64_get_next_set_bit (components, regno2 + 1);
    }
}
/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */

static void
aarch64_emit_prologue_components (sbitmap components)
{
  aarch64_process_components (components, true);
}

/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */

static void
aarch64_emit_epilogue_components (sbitmap components)
{
  aarch64_process_components (components, false);
}

/* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */

static void
aarch64_set_handled_components (sbitmap components)
{
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if (bitmap_bit_p (components, regno))
      cfun->machine->reg_is_wrapped_separately[regno] = true;
}
/* On AArch64 we have an ABI defined safe buffer.  This constant is used to
   determine the probe offset for alloca.  */

static HOST_WIDE_INT
aarch64_stack_clash_protection_alloca_probe_range (void)
{
  return STACK_CLASH_CALLER_GUARD;
}
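
/* Editorial note (sketch, not part of the original sources):
   STACK_CLASH_CALLER_GUARD is the small region (1KiB on AArch64) that the
   ABI allows a function to touch below the incoming SP without an explicit
   probe.  Returning it from this hook tells the middle end that alloca
   allocations may leave at most that many bytes unprobed, since the caller
   is already required to have probed within that distance of SP.  */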
/* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
   registers.  If POLY_SIZE is not large enough to require a probe this function
   will only adjust the stack.  When allocating the stack space
   FRAME_RELATED_P is then used to indicate if the allocation is frame related.
   FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
   arguments.  If we are then we ensure that any allocation larger than the ABI
   defined buffer needs a probe so that the invariant of having a 1KB buffer is
   maintained.

   We emit barriers after each stack adjustment to prevent optimizations from
   breaking the invariant that we never drop the stack more than a page.  This
   invariant is needed to make it easier to correctly handle asynchronous
   events, e.g. if we were to allow the stack to be dropped by more than a page
   and then have multiple probes up and we take a signal somewhere in between
   then the signal handler doesn't know the state of the stack and can make no
   assumptions about which pages have been probed.  */

static void
aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
					poly_int64 poly_size,
					bool frame_related_p,
					bool final_adjustment_p)
{
  HOST_WIDE_INT guard_size
    = 1 << param_stack_clash_protection_guard_size;
  HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
  HOST_WIDE_INT min_probe_threshold
    = (final_adjustment_p
       ? guard_used_by_caller
       : guard_size - guard_used_by_caller);
  /* When doing the final adjustment for the outgoing arguments, take into
     account any unprobed space there is above the current SP.  There are
     two cases:

     - When saving SVE registers below the hard frame pointer, we force
       the lowest save to take place in the prologue before doing the final
       adjustment (i.e. we don't allow the save to be shrink-wrapped).
       This acts as a probe at SP, so there is no unprobed space.

     - When there are no SVE register saves, we use the store of the link
       register as a probe.  We can't assume that LR was saved at position 0
       though, so treat any space below it as unprobed.  */
  if (final_adjustment_p
      && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
    {
      poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
      if (known_ge (lr_offset, 0))
	min_probe_threshold -= lr_offset.to_constant ();
      else
	gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
    }

  poly_int64 frame_size = cfun->machine->frame.frame_size;

  /* We should always have a positive probe threshold.  */
  gcc_assert (min_probe_threshold > 0);

  if (flag_stack_clash_protection && !final_adjustment_p)
    {
      poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
      poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
      poly_int64 final_adjust = cfun->machine->frame.final_adjust;

      if (known_eq (frame_size, 0))
	dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
      else if (known_lt (initial_adjust + sve_callee_adjust,
			 guard_size - guard_used_by_caller)
	       && known_lt (final_adjust, guard_used_by_caller))
	dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
    }

  /* If SIZE is not large enough to require probing, just adjust the stack and
     exit.  */
  if (known_lt (poly_size, min_probe_threshold)
      || !flag_stack_clash_protection)
    {
      aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
      return;
    }

  HOST_WIDE_INT size;
  /* Handle the SVE non-constant case first.  */
  if (!poly_size.is_constant (&size))
    {
      if (dump_file)
	{
	  fprintf (dump_file, "Stack clash SVE prologue: ");
	  print_dec (poly_size, dump_file);
	  fprintf (dump_file, " bytes, dynamic probing will be required.\n");
	}

      /* First calculate the amount of bytes we're actually spilling.  */
      aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
			  poly_size, temp1, temp2, false, true);

      rtx_insn *insn = get_last_insn ();

      if (frame_related_p)
	{
	  /* This is done to provide unwinding information for the stack
	     adjustments we're about to do, however to prevent the optimizers
	     from removing the R11 move and leaving the CFA note (which would be
	     very wrong) we tie the old and new stack pointer together.
	     The tie will expand to nothing but the optimizers will not touch
	     the instruction.  */
	  rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
	  emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
	  emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));

	  /* We want the CFA independent of the stack pointer for the
	     duration of the loop.  */
	  add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
	  RTX_FRAME_RELATED_P (insn) = 1;
	}

      rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
      rtx guard_const = gen_int_mode (guard_size, Pmode);

      insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
						   stack_pointer_rtx, temp1,
						   probe_const, guard_const));

      /* Now reset the CFA register if needed.  */
      if (frame_related_p)
	{
	  add_reg_note (insn, REG_CFA_DEF_CFA,
			gen_rtx_PLUS (Pmode, stack_pointer_rtx,
				      gen_int_mode (poly_size, Pmode)));
	  RTX_FRAME_RELATED_P (insn) = 1;
	}

      return;
    }

  if (dump_file)
    fprintf (dump_file,
	     "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
	     " bytes, probing will be required.\n", size);

  /* Round size to the nearest multiple of guard_size, and calculate the
     residual as the difference between the original size and the rounded
     size.  */
  HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
  HOST_WIDE_INT residual = size - rounded_size;

  /* We can handle a small number of allocations/probes inline.  Otherwise
9789 if (rounded_size
<= STACK_CLASH_MAX_UNROLL_PAGES
* guard_size
)
9791 for (HOST_WIDE_INT i
= 0; i
< rounded_size
; i
+= guard_size
)
9793 aarch64_sub_sp (NULL
, temp2
, guard_size
, true);
9794 emit_stack_probe (plus_constant (Pmode
, stack_pointer_rtx
,
9795 guard_used_by_caller
));
9796 emit_insn (gen_blockage ());
9798 dump_stack_clash_frame_info (PROBE_INLINE
, size
!= rounded_size
);
9802 /* Compute the ending address. */
9803 aarch64_add_offset (Pmode
, temp1
, stack_pointer_rtx
, -rounded_size
,
9804 temp1
, NULL
, false, true);
9805 rtx_insn
*insn
= get_last_insn ();
9807 /* For the initial allocation, we don't have a frame pointer
9808 set up, so we always need CFI notes. If we're doing the
9809 final allocation, then we may have a frame pointer, in which
9810 case it is the CFA, otherwise we need CFI notes.
9812 We can determine which allocation we are doing by looking at
9813 the value of FRAME_RELATED_P since the final allocations are not
9815 if (frame_related_p
)
9817 /* We want the CFA independent of the stack pointer for the
9818 duration of the loop. */
9819 add_reg_note (insn
, REG_CFA_DEF_CFA
,
9820 plus_constant (Pmode
, temp1
, rounded_size
));
9821 RTX_FRAME_RELATED_P (insn
) = 1;
9824 /* This allocates and probes the stack. Note that this re-uses some of
9825 the existing Ada stack protection code. However we are guaranteed not
9826 to enter the non loop or residual branches of that code.
9828 The non-loop part won't be entered because if our allocation amount
9829 doesn't require a loop, the case above would handle it.
9831 The residual amount won't be entered because TEMP1 is a mutliple of
9832 the allocation size. The residual will always be 0. As such, the only
9833 part we are actually using from that code is the loop setup. The
9834 actual probing is done in aarch64_output_probe_stack_range. */
9835 insn
= emit_insn (gen_probe_stack_range (stack_pointer_rtx
,
9836 stack_pointer_rtx
, temp1
));
9838 /* Now reset the CFA register if needed. */
9839 if (frame_related_p
)
9841 add_reg_note (insn
, REG_CFA_DEF_CFA
,
9842 plus_constant (Pmode
, stack_pointer_rtx
, rounded_size
));
9843 RTX_FRAME_RELATED_P (insn
) = 1;
9846 emit_insn (gen_blockage ());
9847 dump_stack_clash_frame_info (PROBE_LOOP
, size
!= rounded_size
);
9850 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
9851 be probed. This maintains the requirement that each page is probed at
9852 least once. For initial probing we probe only if the allocation is
9853 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
9854 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
9855 GUARD_SIZE. This works that for any allocation that is large enough to
9856 trigger a probe here, we'll have at least one, and if they're not large
9857 enough for this code to emit anything for them, The page would have been
9858 probed by the saving of FP/LR either by this function or any callees. If
9859 we don't have any callees then we won't have more stack adjustments and so
9863 HOST_WIDE_INT residual_probe_offset
= guard_used_by_caller
;
9864 /* If we're doing final adjustments, and we've done any full page
9865 allocations then any residual needs to be probed. */
9866 if (final_adjustment_p
&& rounded_size
!= 0)
9867 min_probe_threshold
= 0;
9868 /* If doing a small final adjustment, we always probe at offset 0.
9869 This is done to avoid issues when LR is not at position 0 or when
9870 the final adjustment is smaller than the probing offset. */
9871 else if (final_adjustment_p
&& rounded_size
== 0)
9872 residual_probe_offset
= 0;
9874 aarch64_sub_sp (temp1
, temp2
, residual
, frame_related_p
);
9875 if (residual
>= min_probe_threshold
)
9879 "Stack clash AArch64 prologue residuals: "
9880 HOST_WIDE_INT_PRINT_DEC
" bytes, probing will be required."
9883 emit_stack_probe (plus_constant (Pmode
, stack_pointer_rtx
,
9884 residual_probe_offset
));
9885 emit_insn (gen_blockage ());
9890 /* Return 1 if the register is used by the epilogue. We need to say the
9891 return register is used, but only after epilogue generation is complete.
9892 Note that in the case of sibcalls, the values "used by the epilogue" are
9893 considered live at the start of the called function.
9895 For SIMD functions we need to return 1 for FP registers that are saved and
9896 restored by a function but are not zero in call_used_regs. If we do not do
9897 this optimizations may remove the restore of the register. */
9900 aarch64_epilogue_uses (int regno
)
9902 if (epilogue_completed
)
9904 if (regno
== LR_REGNUM
)
9910 /* AArch64 stack frames generated by this compiler look like:
9912 +-------------------------------+
9914 | incoming stack arguments |
9916 +-------------------------------+
9917 | | <-- incoming stack pointer (aligned)
9918 | callee-allocated save area |
9919 | for register varargs |
9921 +-------------------------------+
9922 | local variables | <-- frame_pointer_rtx
9924 +-------------------------------+
9926 +-------------------------------+ |
9927 | callee-saved registers | | frame.saved_regs_size
9928 +-------------------------------+ |
9930 +-------------------------------+ |
9932 +-------------------------------+ |<- hard_frame_pointer_rtx (aligned)
9933 | SVE vector registers | | \
9934 +-------------------------------+ | | below_hard_fp_saved_regs_size
9935 | SVE predicate registers | / /
9936 +-------------------------------+
9937 | dynamic allocation |
9938 +-------------------------------+
9940 +-------------------------------+
9941 | outgoing stack arguments | <-- arg_pointer
9943 +-------------------------------+
9944 | | <-- stack_pointer_rtx (aligned)
9946 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
9947 but leave frame_pointer_rtx and hard_frame_pointer_rtx
9950 By default for stack-clash we assume the guard is at least 64KB, but this
9951 value is configurable to either 4KB or 64KB. We also force the guard size to
9952 be the same as the probing interval and both values are kept in sync.
9954 With those assumptions the callee can allocate up to 63KB (or 3KB depending
9955 on the guard size) of stack space without probing.
9957 When probing is needed, we emit a probe at the start of the prologue
9958 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
9960 We have to track how much space has been allocated and the only stores
9961 to the stack we track as implicit probes are the FP/LR stores.
9963 For outgoing arguments we probe if the size is larger than 1KB, such that
9964 the ABI specified buffer is maintained for the next callee.
9966 The following registers are reserved during frame layout and should not be
9967 used for any other purpose:
9969 - r11: Used by stack clash protection when SVE is enabled, and also
9970 as an anchor register when saving and restoring registers
9971 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
9972 - r14 and r15: Used for speculation tracking.
9973 - r16(IP0), r17(IP1): Used by indirect tailcalls.
9974 - r30(LR), r29(FP): Used by standard frame layout.
9976 These registers must be avoided in frame layout related code unless the
9977 explicit intention is to interact with one of the features listed above. */
9979 /* Generate the prologue instructions for entry into a function.
9980 Establish the stack frame by decreasing the stack pointer with a
9981 properly calculated size and, if necessary, create a frame record
9982 filled with the values of LR and previous frame pointer. The
9983 current FP is also set up if it is in use. */
9986 aarch64_expand_prologue (void)
9988 poly_int64 frame_size
= cfun
->machine
->frame
.frame_size
;
9989 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
9990 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
9991 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
9992 poly_int64 callee_offset
= cfun
->machine
->frame
.callee_offset
;
9993 poly_int64 sve_callee_adjust
= cfun
->machine
->frame
.sve_callee_adjust
;
9994 poly_int64 below_hard_fp_saved_regs_size
9995 = cfun
->machine
->frame
.below_hard_fp_saved_regs_size
;
9996 unsigned reg1
= cfun
->machine
->frame
.wb_push_candidate1
;
9997 unsigned reg2
= cfun
->machine
->frame
.wb_push_candidate2
;
9998 bool emit_frame_chain
= cfun
->machine
->frame
.emit_frame_chain
;
10001 if (flag_stack_clash_protection
&& known_eq (callee_adjust
, 0))
10003 /* Fold the SVE allocation into the initial allocation.
10004 We don't do this in aarch64_layout_arg to avoid pessimizing
10005 the epilogue code. */
10006 initial_adjust
+= sve_callee_adjust
;
10007 sve_callee_adjust
= 0;
10010 /* Sign return address for functions. */
10011 if (aarch64_return_address_signing_enabled ())
10013 switch (aarch64_ra_sign_key
)
10015 case AARCH64_KEY_A
:
10016 insn
= emit_insn (gen_paciasp ());
10018 case AARCH64_KEY_B
:
10019 insn
= emit_insn (gen_pacibsp ());
10022 gcc_unreachable ();
10024 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
10025 RTX_FRAME_RELATED_P (insn
) = 1;
10028 /* Push return address to shadow call stack. */
10029 if (cfun
->machine
->frame
.is_scs_enabled
)
10030 emit_insn (gen_scs_push ());
10032 if (flag_stack_usage_info
)
10033 current_function_static_stack_size
= constant_lower_bound (frame_size
);
10035 if (flag_stack_check
== STATIC_BUILTIN_STACK_CHECK
)
10037 if (crtl
->is_leaf
&& !cfun
->calls_alloca
)
10039 if (maybe_gt (frame_size
, PROBE_INTERVAL
)
10040 && maybe_gt (frame_size
, get_stack_check_protect ()))
10041 aarch64_emit_probe_stack_range (get_stack_check_protect (),
10043 - get_stack_check_protect ()));
10045 else if (maybe_gt (frame_size
, 0))
10046 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size
);
10049 rtx tmp0_rtx
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
10050 rtx tmp1_rtx
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
10052 /* In theory we should never have both an initial adjustment
10053 and a callee save adjustment. Verify that is the case since the
10054 code below does not handle it for -fstack-clash-protection. */
10055 gcc_assert (known_eq (initial_adjust
, 0) || callee_adjust
== 0);
10057 /* Will only probe if the initial adjustment is larger than the guard
10058 less the amount of the guard reserved for use by the caller's
10060 aarch64_allocate_and_probe_stack_space (tmp0_rtx
, tmp1_rtx
, initial_adjust
,
10063 if (callee_adjust
!= 0)
10064 aarch64_push_regs (reg1
, reg2
, callee_adjust
);
10066 /* The offset of the frame chain record (if any) from the current SP. */
10067 poly_int64 chain_offset
= (initial_adjust
+ callee_adjust
10068 - cfun
->machine
->frame
.hard_fp_offset
);
10069 gcc_assert (known_ge (chain_offset
, 0));
10071 /* The offset of the bottom of the save area from the current SP. */
10072 poly_int64 saved_regs_offset
= chain_offset
- below_hard_fp_saved_regs_size
;
10074 if (emit_frame_chain
)
10076 if (callee_adjust
== 0)
10080 aarch64_save_callee_saves (saved_regs_offset
, reg1
, reg2
,
10084 gcc_assert (known_eq (chain_offset
, 0));
10085 aarch64_add_offset (Pmode
, hard_frame_pointer_rtx
,
10086 stack_pointer_rtx
, chain_offset
,
10087 tmp1_rtx
, tmp0_rtx
, frame_pointer_needed
);
10088 if (frame_pointer_needed
&& !frame_size
.is_constant ())
10090 /* Variable-sized frames need to describe the save slot
10091 address using DW_CFA_expression rather than DW_CFA_offset.
10092 This means that, without taking further action, the
10093 locations of the registers that we've already saved would
10094 remain based on the stack pointer even after we redefine
10095 the CFA based on the frame pointer. We therefore need new
10096 DW_CFA_expressions to re-express the save slots with addresses
10097 based on the frame pointer. */
10098 rtx_insn
*insn
= get_last_insn ();
10099 gcc_assert (RTX_FRAME_RELATED_P (insn
));
10101 /* Add an explicit CFA definition if this was previously
10103 if (!find_reg_note (insn
, REG_CFA_ADJUST_CFA
, NULL_RTX
))
10105 rtx src
= plus_constant (Pmode
, stack_pointer_rtx
,
10107 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
10108 gen_rtx_SET (hard_frame_pointer_rtx
, src
));
10111 /* Change the save slot expressions for the registers that
10112 we've already saved. */
10113 aarch64_add_cfa_expression (insn
, regno_reg_rtx
[reg2
],
10114 hard_frame_pointer_rtx
, UNITS_PER_WORD
);
10115 aarch64_add_cfa_expression (insn
, regno_reg_rtx
[reg1
],
10116 hard_frame_pointer_rtx
, 0);
10118 emit_insn (gen_stack_tie (stack_pointer_rtx
, hard_frame_pointer_rtx
));
10121 aarch64_save_callee_saves (saved_regs_offset
, R0_REGNUM
, R30_REGNUM
,
10122 callee_adjust
!= 0 || emit_frame_chain
,
10124 if (maybe_ne (sve_callee_adjust
, 0))
10126 gcc_assert (!flag_stack_clash_protection
10127 || known_eq (initial_adjust
, 0));
10128 aarch64_allocate_and_probe_stack_space (tmp1_rtx
, tmp0_rtx
,
10130 !frame_pointer_needed
, false);
10131 saved_regs_offset
+= sve_callee_adjust
;
10133 aarch64_save_callee_saves (saved_regs_offset
, P0_REGNUM
, P15_REGNUM
,
10134 false, emit_frame_chain
);
10135 aarch64_save_callee_saves (saved_regs_offset
, V0_REGNUM
, V31_REGNUM
,
10136 callee_adjust
!= 0 || emit_frame_chain
,
10139 /* We may need to probe the final adjustment if it is larger than the guard
10140 that is assumed by the called. */
10141 aarch64_allocate_and_probe_stack_space (tmp1_rtx
, tmp0_rtx
, final_adjust
,
10142 !frame_pointer_needed
, true);
10145 /* Return TRUE if we can use a simple_return insn.
10147 This function checks whether the callee saved stack is empty, which
10148 means no restore actions are need. The pro_and_epilogue will use
10149 this to check whether shrink-wrapping opt is feasible. */
10152 aarch64_use_return_insn_p (void)
10154 if (!reload_completed
)
10160 return known_eq (cfun
->machine
->frame
.frame_size
, 0);
10163 /* Generate the epilogue instructions for returning from a function.
10164 This is almost exactly the reverse of the prolog sequence, except
10165 that we need to insert barriers to avoid scheduling loads that read
10166 from a deallocated stack, and we optimize the unwind records by
10167 emitting them all together if possible. */
10169 aarch64_expand_epilogue (bool for_sibcall
)
10171 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
10172 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
10173 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
10174 poly_int64 callee_offset
= cfun
->machine
->frame
.callee_offset
;
10175 poly_int64 sve_callee_adjust
= cfun
->machine
->frame
.sve_callee_adjust
;
10176 poly_int64 below_hard_fp_saved_regs_size
10177 = cfun
->machine
->frame
.below_hard_fp_saved_regs_size
;
10178 unsigned reg1
= cfun
->machine
->frame
.wb_pop_candidate1
;
10179 unsigned reg2
= cfun
->machine
->frame
.wb_pop_candidate2
;
10180 unsigned int last_gpr
= (cfun
->machine
->frame
.is_scs_enabled
10181 ? R29_REGNUM
: R30_REGNUM
);
10182 rtx cfi_ops
= NULL
;
10184 /* A stack clash protection prologue may not have left EP0_REGNUM or
10185 EP1_REGNUM in a usable state. The same is true for allocations
10186 with an SVE component, since we then need both temporary registers
10187 for each allocation. For stack clash we are in a usable state if
10188 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
10189 HOST_WIDE_INT guard_size
10190 = 1 << param_stack_clash_protection_guard_size
;
10191 HOST_WIDE_INT guard_used_by_caller
= STACK_CLASH_CALLER_GUARD
;
10193 /* We can re-use the registers when:
10195 (a) the deallocation amount is the same as the corresponding
10196 allocation amount (which is false if we combine the initial
10197 and SVE callee save allocations in the prologue); and
10199 (b) the allocation amount doesn't need a probe (which is false
10200 if the amount is guard_size - guard_used_by_caller or greater).
10202 In such situations the register should remain live with the correct
10204 bool can_inherit_p
= (initial_adjust
.is_constant ()
10205 && final_adjust
.is_constant ()
10206 && (!flag_stack_clash_protection
10207 || (known_lt (initial_adjust
,
10208 guard_size
- guard_used_by_caller
)
10209 && known_eq (sve_callee_adjust
, 0))));
10211 /* We need to add memory barrier to prevent read from deallocated stack. */
10212 bool need_barrier_p
10213 = maybe_ne (get_frame_size ()
10214 + cfun
->machine
->frame
.saved_varargs_size
, 0);
10216 /* Emit a barrier to prevent loads from a deallocated stack. */
10217 if (maybe_gt (final_adjust
, crtl
->outgoing_args_size
)
10218 || cfun
->calls_alloca
10219 || crtl
->calls_eh_return
)
10221 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
10222 need_barrier_p
= false;
10225 /* Restore the stack pointer from the frame pointer if it may not
10226 be the same as the stack pointer. */
10227 rtx tmp0_rtx
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
10228 rtx tmp1_rtx
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
10229 if (frame_pointer_needed
10230 && (maybe_ne (final_adjust
, 0) || cfun
->calls_alloca
))
10231 /* If writeback is used when restoring callee-saves, the CFA
10232 is restored on the instruction doing the writeback. */
10233 aarch64_add_offset (Pmode
, stack_pointer_rtx
,
10234 hard_frame_pointer_rtx
,
10235 -callee_offset
- below_hard_fp_saved_regs_size
,
10236 tmp1_rtx
, tmp0_rtx
, callee_adjust
== 0);
10238 /* The case where we need to re-use the register here is very rare, so
10239 avoid the complicated condition and just always emit a move if the
10240 immediate doesn't fit. */
10241 aarch64_add_sp (tmp1_rtx
, tmp0_rtx
, final_adjust
, true);
10243 /* Restore the vector registers before the predicate registers,
10244 so that we can use P4 as a temporary for big-endian SVE frames. */
10245 aarch64_restore_callee_saves (callee_offset
, V0_REGNUM
, V31_REGNUM
,
10246 callee_adjust
!= 0, &cfi_ops
);
10247 aarch64_restore_callee_saves (callee_offset
, P0_REGNUM
, P15_REGNUM
,
10249 if (maybe_ne (sve_callee_adjust
, 0))
10250 aarch64_add_sp (NULL_RTX
, NULL_RTX
, sve_callee_adjust
, true);
10252 /* When shadow call stack is enabled, the scs_pop in the epilogue will
10253 restore x30, we don't need to restore x30 again in the traditional
10255 aarch64_restore_callee_saves (callee_offset
- sve_callee_adjust
,
10256 R0_REGNUM
, last_gpr
,
10257 callee_adjust
!= 0, &cfi_ops
);
10259 if (need_barrier_p
)
10260 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
10262 if (callee_adjust
!= 0)
10263 aarch64_pop_regs (reg1
, reg2
, callee_adjust
, &cfi_ops
);
10265 /* If we have no register restore information, the CFA must have been
10266 defined in terms of the stack pointer since the end of the prologue. */
10267 gcc_assert (cfi_ops
|| !frame_pointer_needed
);
10269 if (cfi_ops
&& (callee_adjust
!= 0 || maybe_gt (initial_adjust
, 65536)))
10271 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
10272 insn
= get_last_insn ();
10273 rtx new_cfa
= plus_constant (Pmode
, stack_pointer_rtx
, initial_adjust
);
10274 REG_NOTES (insn
) = alloc_reg_note (REG_CFA_DEF_CFA
, new_cfa
, cfi_ops
);
10275 RTX_FRAME_RELATED_P (insn
) = 1;
10279 /* Liveness of EP0_REGNUM can not be trusted across function calls either, so
10280 add restriction on emit_move optimization to leaf functions. */
10281 aarch64_add_sp (tmp0_rtx
, tmp1_rtx
, initial_adjust
,
10282 (!can_inherit_p
|| !crtl
->is_leaf
10283 || df_regs_ever_live_p (EP0_REGNUM
)));
10287 /* Emit delayed restores and reset the CFA to be SP. */
10288 insn
= get_last_insn ();
10289 cfi_ops
= alloc_reg_note (REG_CFA_DEF_CFA
, stack_pointer_rtx
, cfi_ops
);
10290 REG_NOTES (insn
) = cfi_ops
;
10291 RTX_FRAME_RELATED_P (insn
) = 1;
10294 /* Pop return address from shadow call stack. */
10295 if (cfun
->machine
->frame
.is_scs_enabled
)
10297 machine_mode mode
= aarch64_reg_save_mode (R30_REGNUM
);
10298 rtx reg
= gen_rtx_REG (mode
, R30_REGNUM
);
10300 insn
= emit_insn (gen_scs_pop ());
10301 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
10302 RTX_FRAME_RELATED_P (insn
) = 1;
10305 /* We prefer to emit the combined return/authenticate instruction RETAA,
10306 however there are three cases in which we must instead emit an explicit
10307 authentication instruction.
10309 1) Sibcalls don't return in a normal way, so if we're about to call one
10310 we must authenticate.
10312 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
10313 generating code for !TARGET_ARMV8_3 we can't use it and must
10314 explicitly authenticate.
10316 if (aarch64_return_address_signing_enabled ()
10317 && (for_sibcall
|| !TARGET_ARMV8_3
))
10319 switch (aarch64_ra_sign_key
)
10321 case AARCH64_KEY_A
:
10322 insn
= emit_insn (gen_autiasp ());
10324 case AARCH64_KEY_B
:
10325 insn
= emit_insn (gen_autibsp ());
10328 gcc_unreachable ();
10330 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
10331 RTX_FRAME_RELATED_P (insn
) = 1;
10334 /* Stack adjustment for exception handler. */
10335 if (crtl
->calls_eh_return
&& !for_sibcall
)
10337 /* We need to unwind the stack by the offset computed by
10338 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
10339 to be SP; letting the CFA move during this adjustment
10340 is just as correct as retaining the CFA from the body
10341 of the function. Therefore, do nothing special. */
10342 emit_insn (gen_add2_insn (stack_pointer_rtx
, EH_RETURN_STACKADJ_RTX
));
10345 emit_use (gen_rtx_REG (DImode
, LR_REGNUM
));
10347 emit_jump_insn (ret_rtx
);
10350 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
10351 normally or return to a previous frame after unwinding.
10353 An EH return uses a single shared return sequence. The epilogue is
10354 exactly like a normal epilogue except that it has an extra input
10355 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
10356 that must be applied after the frame has been destroyed. An extra label
10357 is inserted before the epilogue which initializes this register to zero,
10358 and this is the entry point for a normal return.
10360 An actual EH return updates the return address, initializes the stack
10361 adjustment and jumps directly into the epilogue (bypassing the zeroing
10362 of the adjustment). Since the return address is typically saved on the
10363 stack when a function makes a call, the saved LR must be updated outside
10366 This poses problems as the store is generated well before the epilogue,
10367 so the offset of LR is not known yet. Also optimizations will remove the
10368 store as it appears dead, even after the epilogue is generated (as the
10369 base or offset for loading LR is different in many cases).
10371 To avoid these problems this implementation forces the frame pointer
10372 in eh_return functions so that the location of LR is fixed and known early.
10373 It also marks the store volatile, so no optimization is permitted to
10374 remove the store. */
10376 aarch64_eh_return_handler_rtx (void)
10378 rtx tmp
= gen_frame_mem (Pmode
,
10379 plus_constant (Pmode
, hard_frame_pointer_rtx
, UNITS_PER_WORD
));
10381 /* Mark the store volatile, so no optimization is permitted to remove it. */
10382 MEM_VOLATILE_P (tmp
) = true;
10386 /* Output code to add DELTA to the first argument, and then jump
10387 to FUNCTION. Used for C++ multiple inheritance. */
10389 aarch64_output_mi_thunk (FILE *file
, tree thunk ATTRIBUTE_UNUSED
,
10390 HOST_WIDE_INT delta
,
10391 HOST_WIDE_INT vcall_offset
,
10394 /* The this pointer is always in x0. Note that this differs from
10395 Arm where the this pointer maybe bumped to r1 if r0 is required
10396 to return a pointer to an aggregate. On AArch64 a result value
10397 pointer will be in x8. */
10398 int this_regno
= R0_REGNUM
;
10399 rtx this_rtx
, temp0
, temp1
, addr
, funexp
;
10401 const char *fnname
= IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk
));
10403 if (aarch64_bti_enabled ())
10404 emit_insn (gen_bti_c());
10406 reload_completed
= 1;
10407 emit_note (NOTE_INSN_PROLOGUE_END
);
10409 this_rtx
= gen_rtx_REG (Pmode
, this_regno
);
10410 temp0
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
10411 temp1
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
10413 if (vcall_offset
== 0)
10414 aarch64_add_offset (Pmode
, this_rtx
, this_rtx
, delta
, temp1
, temp0
, false);
10417 gcc_assert ((vcall_offset
& (POINTER_BYTES
- 1)) == 0);
10422 if (delta
>= -256 && delta
< 256)
10423 addr
= gen_rtx_PRE_MODIFY (Pmode
, this_rtx
,
10424 plus_constant (Pmode
, this_rtx
, delta
));
10426 aarch64_add_offset (Pmode
, this_rtx
, this_rtx
, delta
,
10427 temp1
, temp0
, false);
10430 if (Pmode
== ptr_mode
)
10431 aarch64_emit_move (temp0
, gen_rtx_MEM (ptr_mode
, addr
));
10433 aarch64_emit_move (temp0
,
10434 gen_rtx_ZERO_EXTEND (Pmode
,
10435 gen_rtx_MEM (ptr_mode
, addr
)));
10437 if (vcall_offset
>= -256 && vcall_offset
< 4096 * POINTER_BYTES
)
10438 addr
= plus_constant (Pmode
, temp0
, vcall_offset
);
10441 aarch64_internal_mov_immediate (temp1
, GEN_INT (vcall_offset
), true,
10443 addr
= gen_rtx_PLUS (Pmode
, temp0
, temp1
);
10446 if (Pmode
== ptr_mode
)
10447 aarch64_emit_move (temp1
, gen_rtx_MEM (ptr_mode
,addr
));
10449 aarch64_emit_move (temp1
,
10450 gen_rtx_SIGN_EXTEND (Pmode
,
10451 gen_rtx_MEM (ptr_mode
, addr
)));
10453 emit_insn (gen_add2_insn (this_rtx
, temp1
));
10456 /* Generate a tail call to the target function. */
10457 if (!TREE_USED (function
))
10459 assemble_external (function
);
10460 TREE_USED (function
) = 1;
10462 funexp
= XEXP (DECL_RTL (function
), 0);
10463 funexp
= gen_rtx_MEM (FUNCTION_MODE
, funexp
);
10464 rtx callee_abi
= gen_int_mode (fndecl_abi (function
).id (), DImode
);
10465 insn
= emit_call_insn (gen_sibcall (funexp
, const0_rtx
, callee_abi
));
10466 SIBLING_CALL_P (insn
) = 1;
10468 insn
= get_insns ();
10469 shorten_branches (insn
);
10471 assemble_start_function (thunk
, fnname
);
10472 final_start_function (insn
, file
, 1);
10473 final (insn
, file
, 1);
10474 final_end_function ();
10475 assemble_end_function (thunk
, fnname
);
10477 /* Stop pretending to be a post-reload pass. */
10478 reload_completed
= 0;
10482 aarch64_tls_referenced_p (rtx x
)
10484 if (!TARGET_HAVE_TLS
)
10486 subrtx_iterator::array_type array
;
10487 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
10489 const_rtx x
= *iter
;
10490 if (SYMBOL_REF_P (x
) && SYMBOL_REF_TLS_MODEL (x
) != 0)
10492 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
10493 TLS offsets, not real symbol references. */
10494 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
10495 iter
.skip_subrtxes ();
10502 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED
, rtx x
)
10504 if (GET_CODE (x
) == HIGH
)
10507 /* There's no way to calculate VL-based values using relocations. */
10508 subrtx_iterator::array_type array
;
10509 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
10510 if (GET_CODE (*iter
) == CONST_POLY_INT
)
10514 rtx base
= strip_offset_and_salt (x
, &offset
);
10515 if (SYMBOL_REF_P (base
) || LABEL_REF_P (base
))
10517 /* We checked for POLY_INT_CST offsets above. */
10518 if (aarch64_classify_symbol (base
, offset
.to_constant ())
10519 != SYMBOL_FORCE_TO_MEM
)
10522 /* Avoid generating a 64-bit relocation in ILP32; leave
10523 to aarch64_expand_mov_immediate to handle it properly. */
10524 return mode
!= ptr_mode
;
10527 return aarch64_tls_referenced_p (x
);
10530 /* Implement TARGET_CASE_VALUES_THRESHOLD.
10531 The expansion for a table switch is quite expensive due to the number
10532 of instructions, the table lookup and hard to predict indirect jump.
10533 When optimizing for speed, and -O3 enabled, use the per-core tuning if
10534 set, otherwise use tables for >= 11 cases as a tradeoff between size and
10535 performance. When optimizing for size, use 8 for smallest codesize. */
10537 static unsigned int
10538 aarch64_case_values_threshold (void)
10540 /* Use the specified limit for the number of cases before using jump
10541 tables at higher optimization levels. */
10543 && aarch64_tune_params
.max_case_values
!= 0)
10544 return aarch64_tune_params
.max_case_values
;
10546 return optimize_size
? 8 : 11;
10549 /* Return true if register REGNO is a valid index register.
10550 STRICT_P is true if REG_OK_STRICT is in effect. */
10553 aarch64_regno_ok_for_index_p (int regno
, bool strict_p
)
10555 if (!HARD_REGISTER_NUM_P (regno
))
10563 regno
= reg_renumber
[regno
];
10565 return GP_REGNUM_P (regno
);
10568 /* Return true if register REGNO is a valid base register for mode MODE.
10569 STRICT_P is true if REG_OK_STRICT is in effect. */
10572 aarch64_regno_ok_for_base_p (int regno
, bool strict_p
)
10574 if (!HARD_REGISTER_NUM_P (regno
))
10582 regno
= reg_renumber
[regno
];
10585 /* The fake registers will be eliminated to either the stack or
10586 hard frame pointer, both of which are usually valid base registers.
10587 Reload deals with the cases where the eliminated form isn't valid. */
10588 return (GP_REGNUM_P (regno
)
10589 || regno
== SP_REGNUM
10590 || regno
== FRAME_POINTER_REGNUM
10591 || regno
== ARG_POINTER_REGNUM
);
10594 /* Return true if X is a valid base register for mode MODE.
10595 STRICT_P is true if REG_OK_STRICT is in effect. */
10598 aarch64_base_register_rtx_p (rtx x
, bool strict_p
)
10602 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (x
))])
10603 x
= SUBREG_REG (x
);
10605 return (REG_P (x
) && aarch64_regno_ok_for_base_p (REGNO (x
), strict_p
));
10608 /* Return true if address offset is a valid index. If it is, fill in INFO
10609 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
10612 aarch64_classify_index (struct aarch64_address_info
*info
, rtx x
,
10613 machine_mode mode
, bool strict_p
)
10615 enum aarch64_address_type type
;
10620 if ((REG_P (x
) || SUBREG_P (x
))
10621 && GET_MODE (x
) == Pmode
)
10623 type
= ADDRESS_REG_REG
;
10627 /* (sign_extend:DI (reg:SI)) */
10628 else if ((GET_CODE (x
) == SIGN_EXTEND
10629 || GET_CODE (x
) == ZERO_EXTEND
)
10630 && GET_MODE (x
) == DImode
10631 && GET_MODE (XEXP (x
, 0)) == SImode
)
10633 type
= (GET_CODE (x
) == SIGN_EXTEND
)
10634 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
10635 index
= XEXP (x
, 0);
10638 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
10639 else if (GET_CODE (x
) == MULT
10640 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
10641 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
10642 && GET_MODE (XEXP (x
, 0)) == DImode
10643 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
10644 && CONST_INT_P (XEXP (x
, 1)))
10646 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
10647 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
10648 index
= XEXP (XEXP (x
, 0), 0);
10649 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
10651 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
10652 else if (GET_CODE (x
) == ASHIFT
10653 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
10654 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
10655 && GET_MODE (XEXP (x
, 0)) == DImode
10656 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
10657 && CONST_INT_P (XEXP (x
, 1)))
10659 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
10660 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
10661 index
= XEXP (XEXP (x
, 0), 0);
10662 shift
= INTVAL (XEXP (x
, 1));
10664 /* (and:DI (mult:DI (reg:DI) (const_int scale))
10665 (const_int 0xffffffff<<shift)) */
10666 else if (GET_CODE (x
) == AND
10667 && GET_MODE (x
) == DImode
10668 && GET_CODE (XEXP (x
, 0)) == MULT
10669 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
10670 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
10671 && CONST_INT_P (XEXP (x
, 1)))
10673 type
= ADDRESS_REG_UXTW
;
10674 index
= XEXP (XEXP (x
, 0), 0);
10675 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
10676 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
10679 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
10680 (const_int 0xffffffff<<shift)) */
10681 else if (GET_CODE (x
) == AND
10682 && GET_MODE (x
) == DImode
10683 && GET_CODE (XEXP (x
, 0)) == ASHIFT
10684 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
10685 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
10686 && CONST_INT_P (XEXP (x
, 1)))
10688 type
= ADDRESS_REG_UXTW
;
10689 index
= XEXP (XEXP (x
, 0), 0);
10690 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
10691 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
10694 /* (mult:P (reg:P) (const_int scale)) */
10695 else if (GET_CODE (x
) == MULT
10696 && GET_MODE (x
) == Pmode
10697 && GET_MODE (XEXP (x
, 0)) == Pmode
10698 && CONST_INT_P (XEXP (x
, 1)))
10700 type
= ADDRESS_REG_REG
;
10701 index
= XEXP (x
, 0);
10702 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
10704 /* (ashift:P (reg:P) (const_int shift)) */
10705 else if (GET_CODE (x
) == ASHIFT
10706 && GET_MODE (x
) == Pmode
10707 && GET_MODE (XEXP (x
, 0)) == Pmode
10708 && CONST_INT_P (XEXP (x
, 1)))
10710 type
= ADDRESS_REG_REG
;
10711 index
= XEXP (x
, 0);
10712 shift
= INTVAL (XEXP (x
, 1));
10718 && SUBREG_P (index
)
10719 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (index
))])
10720 index
= SUBREG_REG (index
);
10722 if (aarch64_sve_data_mode_p (mode
))
10724 if (type
!= ADDRESS_REG_REG
10725 || (1 << shift
) != GET_MODE_UNIT_SIZE (mode
))
10731 && !(IN_RANGE (shift
, 1, 3)
10732 && known_eq (1 << shift
, GET_MODE_SIZE (mode
))))
10737 && aarch64_regno_ok_for_index_p (REGNO (index
), strict_p
))
10740 info
->offset
= index
;
10741 info
->shift
= shift
;
10748 /* Return true if MODE is one of the modes for which we
10749 support LDP/STP operations. */
10752 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode
)
10754 return mode
== SImode
|| mode
== DImode
10755 || mode
== SFmode
|| mode
== DFmode
10756 || mode
== SDmode
|| mode
== DDmode
10757 || (aarch64_vector_mode_supported_p (mode
)
10758 && (known_eq (GET_MODE_SIZE (mode
), 8)
10759 || (known_eq (GET_MODE_SIZE (mode
), 16)
10760 && (aarch64_tune_params
.extra_tuning_flags
10761 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
) == 0)));
10764 /* Return true if REGNO is a virtual pointer register, or an eliminable
10765 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
10766 include stack_pointer or hard_frame_pointer. */
10768 virt_or_elim_regno_p (unsigned regno
)
10770 return ((regno
>= FIRST_VIRTUAL_REGISTER
10771 && regno
<= LAST_VIRTUAL_POINTER_REGISTER
)
10772 || regno
== FRAME_POINTER_REGNUM
10773 || regno
== ARG_POINTER_REGNUM
);
10776 /* Return true if X is a valid address of type TYPE for machine mode MODE.
10777 If it is, fill in INFO appropriately. STRICT_P is true if
10778 REG_OK_STRICT is in effect. */
10781 aarch64_classify_address (struct aarch64_address_info
*info
,
10782 rtx x
, machine_mode mode
, bool strict_p
,
10783 aarch64_addr_query_type type
)
10785 enum rtx_code code
= GET_CODE (x
);
10789 HOST_WIDE_INT const_size
;
10791 /* Whether a vector mode is partial doesn't affect address legitimacy.
10792 Partial vectors like VNx8QImode allow the same indexed addressing
10793 mode and MUL VL addressing mode as full vectors like VNx16QImode;
10794 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
10795 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
10796 vec_flags
&= ~VEC_PARTIAL
;
10798 /* On BE, we use load/store pair for all large int mode load/stores.
10799 TI/TF/TDmode may also use a load/store pair. */
10800 bool advsimd_struct_p
= (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
));
10801 bool load_store_pair_p
= (type
== ADDR_QUERY_LDP_STP
10802 || type
== ADDR_QUERY_LDP_STP_N
10806 || ((!TARGET_SIMD
|| BYTES_BIG_ENDIAN
)
10807 && advsimd_struct_p
));
10808 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
10809 corresponds to the actual size of the memory being loaded/stored and the
10810 mode of the corresponding addressing mode is half of that. */
10811 if (type
== ADDR_QUERY_LDP_STP_N
)
10813 if (known_eq (GET_MODE_SIZE (mode
), 16))
10815 else if (known_eq (GET_MODE_SIZE (mode
), 8))
10821 bool allow_reg_index_p
= (!load_store_pair_p
10822 && ((vec_flags
== 0
10823 && known_lt (GET_MODE_SIZE (mode
), 16))
10824 || vec_flags
== VEC_ADVSIMD
10825 || vec_flags
& VEC_SVE_DATA
));
10827 /* For SVE, only accept [Rn], [Rn, #offset, MUL VL] and [Rn, Rm, LSL #shift].
10828 The latter is not valid for SVE predicates, and that's rejected through
10829 allow_reg_index_p above. */
10830 if ((vec_flags
& (VEC_SVE_DATA
| VEC_SVE_PRED
)) != 0
10831 && (code
!= REG
&& code
!= PLUS
))
10834 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
10836 if (advsimd_struct_p
10838 && !BYTES_BIG_ENDIAN
10839 && (code
!= POST_INC
&& code
!= REG
))
10842 gcc_checking_assert (GET_MODE (x
) == VOIDmode
10843 || SCALAR_INT_MODE_P (GET_MODE (x
)));
10849 info
->type
= ADDRESS_REG_IMM
;
10851 info
->offset
= const0_rtx
;
10852 info
->const_offset
= 0;
10853 return aarch64_base_register_rtx_p (x
, strict_p
);
10861 && virt_or_elim_regno_p (REGNO (op0
))
10862 && poly_int_rtx_p (op1
, &offset
))
10864 info
->type
= ADDRESS_REG_IMM
;
10866 info
->offset
= op1
;
10867 info
->const_offset
= offset
;
10872 if (maybe_ne (GET_MODE_SIZE (mode
), 0)
10873 && aarch64_base_register_rtx_p (op0
, strict_p
)
10874 && poly_int_rtx_p (op1
, &offset
))
10876 info
->type
= ADDRESS_REG_IMM
;
10878 info
->offset
= op1
;
10879 info
->const_offset
= offset
;
10881 /* TImode, TFmode and TDmode values are allowed in both pairs of X
10882 registers and individual Q registers. The available
10884 X,X: 7-bit signed scaled offset
10885 Q: 9-bit signed offset
10886 We conservatively require an offset representable in either mode.
10887 When performing the check for pairs of X registers i.e. LDP/STP
10888 pass down DImode since that is the natural size of the LDP/STP
10889 instruction memory accesses. */
10890 if (mode
== TImode
|| mode
== TFmode
|| mode
== TDmode
)
10891 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
10892 && (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
10893 || offset_12bit_unsigned_scaled_p (mode
, offset
)));
10895 if (mode
== V8DImode
)
10896 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
10897 && aarch64_offset_7bit_signed_scaled_p (DImode
, offset
+ 48));
10899 /* A 7bit offset check because OImode will emit a ldp/stp
10900 instruction (only !TARGET_SIMD or big endian will get here).
10901 For ldp/stp instructions, the offset is scaled for the size of a
10902 single element of the pair. */
10903 if (aarch64_advsimd_partial_struct_mode_p (mode
)
10904 && known_eq (GET_MODE_SIZE (mode
), 16))
10905 return aarch64_offset_7bit_signed_scaled_p (DImode
, offset
);
10906 if (aarch64_advsimd_full_struct_mode_p (mode
)
10907 && known_eq (GET_MODE_SIZE (mode
), 32))
10908 return aarch64_offset_7bit_signed_scaled_p (TImode
, offset
);
10910 /* Three 9/12 bit offsets checks because CImode will emit three
10911 ldr/str instructions (only !TARGET_SIMD or big endian will
10913 if (aarch64_advsimd_partial_struct_mode_p (mode
)
10914 && known_eq (GET_MODE_SIZE (mode
), 24))
10915 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
10916 && (aarch64_offset_9bit_signed_unscaled_p (DImode
,
10918 || offset_12bit_unsigned_scaled_p (DImode
,
10920 if (aarch64_advsimd_full_struct_mode_p (mode
)
10921 && known_eq (GET_MODE_SIZE (mode
), 48))
10922 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
10923 && (aarch64_offset_9bit_signed_unscaled_p (TImode
,
10925 || offset_12bit_unsigned_scaled_p (TImode
,
10928 /* Two 7bit offsets checks because XImode will emit two ldp/stp
10929 instructions (only big endian will get here). */
10930 if (aarch64_advsimd_partial_struct_mode_p (mode
)
10931 && known_eq (GET_MODE_SIZE (mode
), 32))
10932 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
10933 && aarch64_offset_7bit_signed_scaled_p (DImode
,
10935 if (aarch64_advsimd_full_struct_mode_p (mode
)
10936 && known_eq (GET_MODE_SIZE (mode
), 64))
10937 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
10938 && aarch64_offset_7bit_signed_scaled_p (TImode
,
10941 /* Make "m" use the LD1 offset range for SVE data modes, so
10942 that pre-RTL optimizers like ivopts will work to that
10943 instead of the wider LDR/STR range. */
10944 if (vec_flags
== VEC_SVE_DATA
)
10945 return (type
== ADDR_QUERY_M
10946 ? offset_4bit_signed_scaled_p (mode
, offset
)
10947 : offset_9bit_signed_scaled_p (mode
, offset
));
10949 if (vec_flags
== (VEC_SVE_DATA
| VEC_STRUCT
))
10951 poly_int64 end_offset
= (offset
10952 + GET_MODE_SIZE (mode
)
10953 - BYTES_PER_SVE_VECTOR
);
10954 return (type
== ADDR_QUERY_M
10955 ? offset_4bit_signed_scaled_p (mode
, offset
)
10956 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE
, offset
)
10957 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE
,
10961 if (vec_flags
== VEC_SVE_PRED
)
10962 return offset_9bit_signed_scaled_p (mode
, offset
);
10964 if (load_store_pair_p
)
10965 return ((known_eq (GET_MODE_SIZE (mode
), 4)
10966 || known_eq (GET_MODE_SIZE (mode
), 8)
10967 || known_eq (GET_MODE_SIZE (mode
), 16))
10968 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
10970 return (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
10971 || offset_12bit_unsigned_scaled_p (mode
, offset
));
10974 if (allow_reg_index_p
)
10976 /* Look for base + (scaled/extended) index register. */
10977 if (aarch64_base_register_rtx_p (op0
, strict_p
)
10978 && aarch64_classify_index (info
, op1
, mode
, strict_p
))
10983 if (aarch64_base_register_rtx_p (op1
, strict_p
)
10984 && aarch64_classify_index (info
, op0
, mode
, strict_p
))
10997 info
->type
= ADDRESS_REG_WB
;
10998 info
->base
= XEXP (x
, 0);
10999 info
->offset
= NULL_RTX
;
11000 return aarch64_base_register_rtx_p (info
->base
, strict_p
);
11004 info
->type
= ADDRESS_REG_WB
;
11005 info
->base
= XEXP (x
, 0);
11006 if (GET_CODE (XEXP (x
, 1)) == PLUS
11007 && poly_int_rtx_p (XEXP (XEXP (x
, 1), 1), &offset
)
11008 && rtx_equal_p (XEXP (XEXP (x
, 1), 0), info
->base
)
11009 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
11011 info
->offset
= XEXP (XEXP (x
, 1), 1);
11012 info
->const_offset
= offset
;
11014 /* TImode, TFmode and TDmode values are allowed in both pairs of X
11015 registers and individual Q registers. The available
11017 X,X: 7-bit signed scaled offset
11018 Q: 9-bit signed offset
11019 We conservatively require an offset representable in either mode.
11021 if (mode
== TImode
|| mode
== TFmode
|| mode
== TDmode
)
11022 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
11023 && aarch64_offset_9bit_signed_unscaled_p (mode
, offset
));
11025 if (load_store_pair_p
)
11026 return ((known_eq (GET_MODE_SIZE (mode
), 4)
11027 || known_eq (GET_MODE_SIZE (mode
), 8)
11028 || known_eq (GET_MODE_SIZE (mode
), 16))
11029 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
11031 return aarch64_offset_9bit_signed_unscaled_p (mode
, offset
);
11038 /* load literal: pc-relative constant pool entry. Only supported
11039 for SI mode or larger. */
11040 info
->type
= ADDRESS_SYMBOLIC
;
11042 if (!load_store_pair_p
11043 && GET_MODE_SIZE (mode
).is_constant (&const_size
)
11044 && const_size
>= 4)
11047 rtx sym
= strip_offset_and_salt (x
, &offset
);
11048 return ((LABEL_REF_P (sym
)
11049 || (SYMBOL_REF_P (sym
)
11050 && CONSTANT_POOL_ADDRESS_P (sym
)
11051 && aarch64_pcrelative_literal_loads
)));
11056 info
->type
= ADDRESS_LO_SUM
;
11057 info
->base
= XEXP (x
, 0);
11058 info
->offset
= XEXP (x
, 1);
11059 if (allow_reg_index_p
11060 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
11063 HOST_WIDE_INT const_offset
;
11064 rtx sym
= strip_offset_and_salt (info
->offset
, &offset
);
11065 if (SYMBOL_REF_P (sym
)
11066 && offset
.is_constant (&const_offset
)
11067 && (aarch64_classify_symbol (sym
, const_offset
)
11068 == SYMBOL_SMALL_ABSOLUTE
))
11070 /* The symbol and offset must be aligned to the access size. */
11071 unsigned int align
;
11073 if (CONSTANT_POOL_ADDRESS_P (sym
))
11074 align
= GET_MODE_ALIGNMENT (get_pool_mode (sym
));
11075 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym
))
11077 tree exp
= SYMBOL_REF_DECL (sym
);
11078 align
= TYPE_ALIGN (TREE_TYPE (exp
));
11079 align
= aarch64_constant_alignment (exp
, align
);
11081 else if (SYMBOL_REF_DECL (sym
))
11082 align
= DECL_ALIGN (SYMBOL_REF_DECL (sym
));
11083 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym
)
11084 && SYMBOL_REF_BLOCK (sym
) != NULL
)
11085 align
= SYMBOL_REF_BLOCK (sym
)->alignment
;
11087 align
= BITS_PER_UNIT
;
11089 poly_int64 ref_size
= GET_MODE_SIZE (mode
);
11090 if (known_eq (ref_size
, 0))
11091 ref_size
= GET_MODE_SIZE (DImode
);
11093 return (multiple_p (const_offset
, ref_size
)
11094 && multiple_p (align
/ BITS_PER_UNIT
, ref_size
));
11104 /* Return true if the address X is valid for a PRFM instruction.
11105 STRICT_P is true if we should do strict checking with
11106 aarch64_classify_address. */
11109 aarch64_address_valid_for_prefetch_p (rtx x
, bool strict_p
)
11111 struct aarch64_address_info addr
;
11113 /* PRFM accepts the same addresses as DImode... */
11114 bool res
= aarch64_classify_address (&addr
, x
, DImode
, strict_p
);
11118 /* ... except writeback forms. */
11119 return addr
.type
!= ADDRESS_REG_WB
;
11123 aarch64_symbolic_address_p (rtx x
)
11126 x
= strip_offset_and_salt (x
, &offset
);
11127 return SYMBOL_REF_P (x
) || LABEL_REF_P (x
);
11130 /* Classify the base of symbolic expression X. */
11132 enum aarch64_symbol_type
11133 aarch64_classify_symbolic_expression (rtx x
)
11137 split_const (x
, &x
, &offset
);
11138 return aarch64_classify_symbol (x
, INTVAL (offset
));
11142 /* Return TRUE if X is a legitimate address for accessing memory in
11145 aarch64_legitimate_address_hook_p (machine_mode mode
, rtx x
, bool strict_p
)
11147 struct aarch64_address_info addr
;
11149 return aarch64_classify_address (&addr
, x
, mode
, strict_p
);
11152 /* Return TRUE if X is a legitimate address of type TYPE for accessing
11153 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
11155 aarch64_legitimate_address_p (machine_mode mode
, rtx x
, bool strict_p
,
11156 aarch64_addr_query_type type
)
11158 struct aarch64_address_info addr
;
11160 return aarch64_classify_address (&addr
, x
, mode
, strict_p
, type
);
11163 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
11166 aarch64_legitimize_address_displacement (rtx
*offset1
, rtx
*offset2
,
11167 poly_int64 orig_offset
,
11170 HOST_WIDE_INT size
;
11171 if (GET_MODE_SIZE (mode
).is_constant (&size
))
11173 HOST_WIDE_INT const_offset
, second_offset
;
11175 /* A general SVE offset is A * VQ + B. Remove the A component from
11176 coefficient 0 in order to get the constant B. */
11177 const_offset
= orig_offset
.coeffs
[0] - orig_offset
.coeffs
[1];
11179 /* Split an out-of-range address displacement into a base and
11180 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
11181 range otherwise to increase opportunities for sharing the base
11182 address of different sizes. Unaligned accesses use the signed
11183 9-bit range, TImode/TFmode/TDmode use the intersection of signed
11184 scaled 7-bit and signed 9-bit offset. */
11185 if (mode
== TImode
|| mode
== TFmode
|| mode
== TDmode
)
11186 second_offset
= ((const_offset
+ 0x100) & 0x1f8) - 0x100;
11187 else if ((const_offset
& (size
- 1)) != 0)
11188 second_offset
= ((const_offset
+ 0x100) & 0x1ff) - 0x100;
11190 second_offset
= const_offset
& (size
< 4 ? 0xfff : 0x3ffc);
11192 if (second_offset
== 0 || known_eq (orig_offset
, second_offset
))
11195 /* Split the offset into second_offset and the rest. */
11196 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
11197 *offset2
= gen_int_mode (second_offset
, Pmode
);
11202 /* Get the mode we should use as the basis of the range. For structure
11203 modes this is the mode of one vector. */
11204 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
11205 machine_mode step_mode
11206 = (vec_flags
& VEC_STRUCT
) != 0 ? SVE_BYTE_MODE
: mode
;
11208 /* Get the "mul vl" multiplier we'd like to use. */
11209 HOST_WIDE_INT factor
= GET_MODE_SIZE (step_mode
).coeffs
[1];
11210 HOST_WIDE_INT vnum
= orig_offset
.coeffs
[1] / factor
;
11211 if (vec_flags
& VEC_SVE_DATA
)
11212 /* LDR supports a 9-bit range, but the move patterns for
11213 structure modes require all vectors to be in range of the
11214 same base. The simplest way of accomodating that while still
11215 promoting reuse of anchor points between different modes is
11216 to use an 8-bit range unconditionally. */
11217 vnum
= ((vnum
+ 128) & 255) - 128;
11219 /* Predicates are only handled singly, so we might as well use
11221 vnum
= ((vnum
+ 256) & 511) - 256;
11225 /* Convert the "mul vl" multiplier into a byte offset. */
11226 poly_int64 second_offset
= GET_MODE_SIZE (step_mode
) * vnum
;
11227 if (known_eq (second_offset
, orig_offset
))
11230 /* Split the offset into second_offset and the rest. */
11231 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
11232 *offset2
= gen_int_mode (second_offset
, Pmode
);
11237 /* Return the binary representation of floating point constant VALUE in INTVAL.
11238 If the value cannot be converted, return false without setting INTVAL.
11239 The conversion is done in the given MODE. */
11241 aarch64_reinterpret_float_as_int (rtx value
, unsigned HOST_WIDE_INT
*intval
)
11244 /* We make a general exception for 0. */
11245 if (aarch64_float_const_zero_rtx_p (value
))
11251 scalar_float_mode mode
;
11252 if (!CONST_DOUBLE_P (value
)
11253 || !is_a
<scalar_float_mode
> (GET_MODE (value
), &mode
)
11254 || GET_MODE_BITSIZE (mode
) > HOST_BITS_PER_WIDE_INT
11255 /* Only support up to DF mode. */
11256 || GET_MODE_BITSIZE (mode
) > GET_MODE_BITSIZE (DFmode
))
11259 unsigned HOST_WIDE_INT ival
= 0;
11262 real_to_target (res
,
11263 CONST_DOUBLE_REAL_VALUE (value
),
11264 REAL_MODE_FORMAT (mode
));
11266 if (mode
== DFmode
|| mode
== DDmode
)
11268 int order
= BYTES_BIG_ENDIAN
? 1 : 0;
11269 ival
= zext_hwi (res
[order
], 32);
11270 ival
|= (zext_hwi (res
[1 - order
], 32) << 32);
11273 ival
= zext_hwi (res
[0], 32);
11279 /* Return TRUE if rtx X is an immediate constant that can be moved using a
11280 single MOV(+MOVK) followed by an FMOV. */
11282 aarch64_float_const_rtx_p (rtx x
)
11284 machine_mode mode
= GET_MODE (x
);
11285 if (mode
== VOIDmode
)
11288 /* Determine whether it's cheaper to write float constants as
11289 mov/movk pairs over ldr/adrp pairs. */
11290 unsigned HOST_WIDE_INT ival
;
11292 if (CONST_DOUBLE_P (x
)
11293 && SCALAR_FLOAT_MODE_P (mode
)
11294 && aarch64_reinterpret_float_as_int (x
, &ival
))
11296 machine_mode imode
= known_eq (GET_MODE_SIZE (mode
), 8) ? DImode
: SImode
;
11297 int num_instr
= aarch64_internal_mov_immediate
11298 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
11299 return num_instr
< 3;
11305 /* Return TRUE if rtx X is immediate constant 0.0 (but not in Decimal
11306 Floating Point). */
11308 aarch64_float_const_zero_rtx_p (rtx x
)
11310 /* 0.0 in Decimal Floating Point cannot be represented by #0 or
11311 zr as our callers expect, so no need to check the actual
11312 value if X is of Decimal Floating Point type. */
11313 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_DECIMAL_FLOAT
)
11316 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x
)))
11317 return !HONOR_SIGNED_ZEROS (GET_MODE (x
));
11318 return real_equal (CONST_DOUBLE_REAL_VALUE (x
), &dconst0
);
11321 /* Return TRUE if rtx X is immediate constant that fits in a single
11322 MOVI immediate operation. */
11324 aarch64_can_const_movi_rtx_p (rtx x
, machine_mode mode
)
11329 machine_mode vmode
;
11330 scalar_int_mode imode
;
11331 unsigned HOST_WIDE_INT ival
;
11333 if (CONST_DOUBLE_P (x
)
11334 && SCALAR_FLOAT_MODE_P (mode
))
11336 if (!aarch64_reinterpret_float_as_int (x
, &ival
))
11339 /* We make a general exception for 0. */
11340 if (aarch64_float_const_zero_rtx_p (x
))
11343 imode
= int_mode_for_mode (mode
).require ();
11345 else if (CONST_INT_P (x
)
11346 && is_a
<scalar_int_mode
> (mode
, &imode
))
11351 /* use a 64 bit mode for everything except for DI/DF/DD mode, where we use
11352 a 128 bit vector mode. */
11353 int width
= GET_MODE_BITSIZE (imode
) == 64 ? 128 : 64;
11355 vmode
= aarch64_simd_container_mode (imode
, width
);
11356 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, ival
);
11358 return aarch64_simd_valid_immediate (v_op
, NULL
);
11362 /* Return the fixed registers used for condition codes. */
11365 aarch64_fixed_condition_code_regs (unsigned int *p1
, unsigned int *p2
)
11368 *p2
= INVALID_REGNUM
;
11372 /* This function is used by the call expanders of the machine description.
11373 RESULT is the register in which the result is returned. It's NULL for
11374 "call" and "sibcall".
11375 MEM is the location of the function call.
11376 CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
11377 SIBCALL indicates whether this function call is normal call or sibling call.
11378 It will generate different pattern accordingly. */
11381 aarch64_expand_call (rtx result
, rtx mem
, rtx callee_abi
, bool sibcall
)
11383 rtx call
, callee
, tmp
;
11387 gcc_assert (MEM_P (mem
));
11388 callee
= XEXP (mem
, 0);
11389 mode
= GET_MODE (callee
);
11390 gcc_assert (mode
== Pmode
);
11392 /* Decide if we should generate indirect calls by loading the
11393 address of the callee into a register before performing
11394 the branch-and-link. */
11395 if (SYMBOL_REF_P (callee
)
11396 ? (aarch64_is_long_call_p (callee
)
11397 || aarch64_is_noplt_call_p (callee
))
11399 XEXP (mem
, 0) = force_reg (mode
, callee
);
11401 call
= gen_rtx_CALL (VOIDmode
, mem
, const0_rtx
);
11403 if (result
!= NULL_RTX
)
11404 call
= gen_rtx_SET (result
, call
);
11409 tmp
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (Pmode
, LR_REGNUM
));
11411 gcc_assert (CONST_INT_P (callee_abi
));
11412 callee_abi
= gen_rtx_UNSPEC (DImode
, gen_rtvec (1, callee_abi
),
11413 UNSPEC_CALLEE_ABI
);
11415 vec
= gen_rtvec (3, call
, callee_abi
, tmp
);
11416 call
= gen_rtx_PARALLEL (VOIDmode
, vec
);
11418 aarch64_emit_call_insn (call
);
11421 /* Emit call insn with PAT and do aarch64-specific handling. */
11424 aarch64_emit_call_insn (rtx pat
)
11426 rtx insn
= emit_call_insn (pat
);
11428 rtx
*fusage
= &CALL_INSN_FUNCTION_USAGE (insn
);
11429 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP0_REGNUM
));
11430 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP1_REGNUM
));
11434 aarch64_select_cc_mode (RTX_CODE code
, rtx x
, rtx y
)
11436 machine_mode mode_x
= GET_MODE (x
);
11437 rtx_code code_x
= GET_CODE (x
);
11439 /* All floating point compares return CCFP if it is an equality
11440 comparison, and CCFPE otherwise. */
11441 if (GET_MODE_CLASS (mode_x
) == MODE_FLOAT
)
11464 gcc_unreachable ();
11468 /* Equality comparisons of short modes against zero can be performed
11469 using the TST instruction with the appropriate bitmask. */
11470 if (y
== const0_rtx
&& (REG_P (x
) || SUBREG_P (x
))
11471 && (code
== EQ
|| code
== NE
)
11472 && (mode_x
== HImode
|| mode_x
== QImode
))
11475 /* Similarly, comparisons of zero_extends from shorter modes can
11476 be performed using an ANDS with an immediate mask. */
11477 if (y
== const0_rtx
&& code_x
== ZERO_EXTEND
11478 && (mode_x
== SImode
|| mode_x
== DImode
)
11479 && (GET_MODE (XEXP (x
, 0)) == HImode
|| GET_MODE (XEXP (x
, 0)) == QImode
)
11480 && (code
== EQ
|| code
== NE
))
11483 /* Zero extracts support equality comparisons. */
11484 if ((mode_x
== SImode
|| mode_x
== DImode
)
11486 && (code_x
== ZERO_EXTRACT
&& CONST_INT_P (XEXP (x
, 1))
11487 && CONST_INT_P (XEXP (x
, 2)))
11488 && (code
== EQ
|| code
== NE
))
11491 /* ANDS/BICS/TST support equality and all signed comparisons. */
11492 if ((mode_x
== SImode
|| mode_x
== DImode
)
11495 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
11496 || code
== GT
|| code
== LE
))
11499 /* ADDS/SUBS correctly set N and Z flags. */
11500 if ((mode_x
== SImode
|| mode_x
== DImode
)
11502 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
)
11503 && (code_x
== PLUS
|| code_x
== MINUS
|| code_x
== NEG
))
11506 /* A compare with a shifted operand. Because of canonicalization,
11507 the comparison will have to be swapped when we emit the assembly
11509 if ((mode_x
== SImode
|| mode_x
== DImode
)
11510 && (REG_P (y
) || SUBREG_P (y
) || y
== const0_rtx
)
11511 && (code_x
== ASHIFT
|| code_x
== ASHIFTRT
11512 || code_x
== LSHIFTRT
11513 || code_x
== ZERO_EXTEND
|| code_x
== SIGN_EXTEND
))
11516 /* Similarly for a negated operand, but we can only do this for
11518 if ((mode_x
== SImode
|| mode_x
== DImode
)
11519 && (REG_P (y
) || SUBREG_P (y
))
11520 && (code
== EQ
|| code
== NE
)
11524 /* A test for unsigned overflow from an addition. */
11525 if ((mode_x
== DImode
|| mode_x
== TImode
)
11526 && (code
== LTU
|| code
== GEU
)
11528 && rtx_equal_p (XEXP (x
, 0), y
))
11531 /* A test for unsigned overflow from an add with carry. */
11532 if ((mode_x
== DImode
|| mode_x
== TImode
)
11533 && (code
== LTU
|| code
== GEU
)
11535 && CONST_SCALAR_INT_P (y
)
11536 && (rtx_mode_t (y
, mode_x
)
11537 == (wi::shwi (1, mode_x
)
11538 << (GET_MODE_BITSIZE (mode_x
).to_constant () / 2))))
11541 /* A test for signed overflow. */
11542 if ((mode_x
== DImode
|| mode_x
== TImode
)
11545 && GET_CODE (y
) == SIGN_EXTEND
)
11548 /* For everything else, return CCmode. */
11553 aarch64_get_condition_code_1 (machine_mode
, enum rtx_code
);
11556 aarch64_get_condition_code (rtx x
)
11558 machine_mode mode
= GET_MODE (XEXP (x
, 0));
11559 enum rtx_code comp_code
= GET_CODE (x
);
11561 if (GET_MODE_CLASS (mode
) != MODE_CC
)
11562 mode
= SELECT_CC_MODE (comp_code
, XEXP (x
, 0), XEXP (x
, 1));
11563 return aarch64_get_condition_code_1 (mode
, comp_code
);
11567 aarch64_get_condition_code_1 (machine_mode mode
, enum rtx_code comp_code
)
11575 case GE
: return AARCH64_GE
;
11576 case GT
: return AARCH64_GT
;
11577 case LE
: return AARCH64_LS
;
11578 case LT
: return AARCH64_MI
;
11579 case NE
: return AARCH64_NE
;
11580 case EQ
: return AARCH64_EQ
;
11581 case ORDERED
: return AARCH64_VC
;
11582 case UNORDERED
: return AARCH64_VS
;
11583 case UNLT
: return AARCH64_LT
;
11584 case UNLE
: return AARCH64_LE
;
11585 case UNGT
: return AARCH64_HI
;
11586 case UNGE
: return AARCH64_PL
;
11587 default: return -1;
11594 case NE
: return AARCH64_NE
;
11595 case EQ
: return AARCH64_EQ
;
11596 case GE
: return AARCH64_GE
;
11597 case GT
: return AARCH64_GT
;
11598 case LE
: return AARCH64_LE
;
11599 case LT
: return AARCH64_LT
;
11600 case GEU
: return AARCH64_CS
;
11601 case GTU
: return AARCH64_HI
;
11602 case LEU
: return AARCH64_LS
;
11603 case LTU
: return AARCH64_CC
;
11604 default: return -1;
11611 case NE
: return AARCH64_NE
;
11612 case EQ
: return AARCH64_EQ
;
11613 case GE
: return AARCH64_LE
;
11614 case GT
: return AARCH64_LT
;
11615 case LE
: return AARCH64_GE
;
11616 case LT
: return AARCH64_GT
;
11617 case GEU
: return AARCH64_LS
;
11618 case GTU
: return AARCH64_CC
;
11619 case LEU
: return AARCH64_CS
;
11620 case LTU
: return AARCH64_HI
;
11621 default: return -1;
11628 case NE
: return AARCH64_NE
; /* = any */
11629 case EQ
: return AARCH64_EQ
; /* = none */
11630 case GE
: return AARCH64_PL
; /* = nfrst */
11631 case LT
: return AARCH64_MI
; /* = first */
11632 case GEU
: return AARCH64_CS
; /* = nlast */
11633 case GTU
: return AARCH64_HI
; /* = pmore */
11634 case LEU
: return AARCH64_LS
; /* = plast */
11635 case LTU
: return AARCH64_CC
; /* = last */
11636 default: return -1;
11643 case NE
: return AARCH64_NE
;
11644 case EQ
: return AARCH64_EQ
;
11645 case GE
: return AARCH64_PL
;
11646 case LT
: return AARCH64_MI
;
11647 case GT
: return AARCH64_GT
;
11648 case LE
: return AARCH64_LE
;
11649 default: return -1;
11656 case NE
: return AARCH64_NE
;
11657 case EQ
: return AARCH64_EQ
;
11658 case GE
: return AARCH64_PL
;
11659 case LT
: return AARCH64_MI
;
11660 default: return -1;
11667 case NE
: return AARCH64_NE
;
11668 case EQ
: return AARCH64_EQ
;
11669 default: return -1;
11676 case LTU
: return AARCH64_CS
;
11677 case GEU
: return AARCH64_CC
;
11678 default: return -1;
11685 case GEU
: return AARCH64_CS
;
11686 case LTU
: return AARCH64_CC
;
11687 default: return -1;
11694 case NE
: return AARCH64_VS
;
11695 case EQ
: return AARCH64_VC
;
11696 default: return -1;
11708 aarch64_const_vec_all_same_in_range_p (rtx x
,
11709 HOST_WIDE_INT minval
,
11710 HOST_WIDE_INT maxval
)
11713 return (const_vec_duplicate_p (x
, &elt
)
11714 && CONST_INT_P (elt
)
11715 && IN_RANGE (INTVAL (elt
), minval
, maxval
));
11719 aarch64_const_vec_all_same_int_p (rtx x
, HOST_WIDE_INT val
)
11721 return aarch64_const_vec_all_same_in_range_p (x
, val
, val
);
11724 /* Return true if VEC is a constant in which every element is in the range
11725 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
11728 aarch64_const_vec_all_in_range_p (rtx vec
,
11729 HOST_WIDE_INT minval
,
11730 HOST_WIDE_INT maxval
)
11732 if (!CONST_VECTOR_P (vec
)
11733 || GET_MODE_CLASS (GET_MODE (vec
)) != MODE_VECTOR_INT
)
11737 if (!CONST_VECTOR_STEPPED_P (vec
))
11738 nunits
= const_vector_encoded_nelts (vec
);
11739 else if (!CONST_VECTOR_NUNITS (vec
).is_constant (&nunits
))
11742 for (int i
= 0; i
< nunits
; i
++)
11744 rtx vec_elem
= CONST_VECTOR_ELT (vec
, i
);
11745 if (!CONST_INT_P (vec_elem
)
11746 || !IN_RANGE (INTVAL (vec_elem
), minval
, maxval
))
11753 #define AARCH64_CC_V 1
11754 #define AARCH64_CC_C (1 << 1)
11755 #define AARCH64_CC_Z (1 << 2)
11756 #define AARCH64_CC_N (1 << 3)
11758 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
11759 static const int aarch64_nzcv_codes
[] =
11761 0, /* EQ, Z == 1. */
11762 AARCH64_CC_Z
, /* NE, Z == 0. */
11763 0, /* CS, C == 1. */
11764 AARCH64_CC_C
, /* CC, C == 0. */
11765 0, /* MI, N == 1. */
11766 AARCH64_CC_N
, /* PL, N == 0. */
11767 0, /* VS, V == 1. */
11768 AARCH64_CC_V
, /* VC, V == 0. */
11769 0, /* HI, C ==1 && Z == 0. */
11770 AARCH64_CC_C
, /* LS, !(C == 1 && Z == 0). */
11771 AARCH64_CC_V
, /* GE, N == V. */
11772 0, /* LT, N != V. */
11773 AARCH64_CC_Z
, /* GT, Z == 0 && N == V. */
11774 0, /* LE, !(Z == 0 && N == V). */
11779 /* Print floating-point vector immediate operand X to F, negating it
11780 first if NEGATE is true. Return true on success, false if it isn't
11781 a constant we can handle. */
11784 aarch64_print_vector_float_operand (FILE *f
, rtx x
, bool negate
)
11788 if (!const_vec_duplicate_p (x
, &elt
))
11791 REAL_VALUE_TYPE r
= *CONST_DOUBLE_REAL_VALUE (elt
);
11793 r
= real_value_negate (&r
);
11795 /* Handle the SVE single-bit immediates specially, since they have a
11796 fixed form in the assembly syntax. */
11797 if (real_equal (&r
, &dconst0
))
11798 asm_fprintf (f
, "0.0");
11799 else if (real_equal (&r
, &dconst2
))
11800 asm_fprintf (f
, "2.0");
11801 else if (real_equal (&r
, &dconst1
))
11802 asm_fprintf (f
, "1.0");
11803 else if (real_equal (&r
, &dconsthalf
))
11804 asm_fprintf (f
, "0.5");
11807 const int buf_size
= 20;
11808 char float_buf
[buf_size
] = {'\0'};
11809 real_to_decimal_for_mode (float_buf
, &r
, buf_size
, buf_size
,
11810 1, GET_MODE (elt
));
11811 asm_fprintf (f
, "%s", float_buf
);
11817 /* Return the equivalent letter for size. */
11819 sizetochar (int size
)
11823 case 64: return 'd';
11824 case 32: return 's';
11825 case 16: return 'h';
11826 case 8 : return 'b';
11827 default: gcc_unreachable ();
11831 /* Print operand X to file F in a target specific manner according to CODE.
11832 The acceptable formatting commands given by CODE are:
11833 'c': An integer or symbol address without a preceding #
11835 'C': Take the duplicated element in a vector constant
11836 and print it in hex.
11837 'D': Take the duplicated element in a vector constant
11838 and print it as an unsigned integer, in decimal.
11839 'e': Print the sign/zero-extend size as a character 8->b,
11840 16->h, 32->w. Can also be used for masks:
11841 0xff->b, 0xffff->h, 0xffffffff->w.
11842 'I': If the operand is a duplicated vector constant,
11843 replace it with the duplicated scalar. If the
11844 operand is then a floating-point constant, replace
11845 it with the integer bit representation. Print the
11846 transformed constant as a signed decimal number.
11847 'p': Prints N such that 2^N == X (X must be power of 2 and
11849 'P': Print the number of non-zero bits in X (a const_int).
11850 'H': Print the higher numbered register of a pair (TImode)
11852 'm': Print a condition (eq, ne, etc).
11853 'M': Same as 'm', but invert condition.
11854 'N': Take the duplicated element in a vector constant
11855 and print the negative of it in decimal.
11856 'b/h/s/d/q': Print a scalar FP/SIMD register name.
11857 'S/T/U/V': Print a FP/SIMD register name for a register list.
11858 The register printed is the FP/SIMD register name
11859 of X + 0/1/2/3 for S/T/U/V.
11860 'R': Print a scalar Integer/FP/SIMD register name + 1.
11861 'X': Print bottom 16 bits of integer constant in hex.
11862 'w/x': Print a general register name or the zero register
11863 (32-bit or 64-bit).
11864 '0': Print a normal operand, if it's a general register,
11865 then we assume DImode.
11866 'k': Print NZCV for conditional compare instructions.
11867 'A': Output address constant representing the first
11868 argument of X, specifying a relocation offset
11870 'L': Output constant address specified by X
11871 with a relocation offset if appropriate.
11872 'G': Prints address of X, specifying a PC relative
11873 relocation mode if appropriate.
11874 'y': Output address of LDP or STP - this is used for
11875 some LDP/STPs which don't use a PARALLEL in their
11876 pattern (so the mode needs to be adjusted).
11877 'z': Output address of a typical LDP or STP. */
11880 aarch64_print_operand (FILE *f
, rtx x
, int code
)
11886 if (CONST_INT_P (x
))
11887 fprintf (f
, HOST_WIDE_INT_PRINT_DEC
, INTVAL (x
));
11891 rtx base
= strip_offset_and_salt (x
, &offset
);
11892 if (SYMBOL_REF_P (base
))
11893 output_addr_const (f
, x
);
11895 output_operand_lossage ("unsupported operand for code '%c'", code
);
11901 x
= unwrap_const_vec_duplicate (x
);
11902 if (!CONST_INT_P (x
))
11904 output_operand_lossage ("invalid operand for '%%%c'", code
);
11908 HOST_WIDE_INT val
= INTVAL (x
);
11909 if ((val
& ~7) == 8 || val
== 0xff)
11911 else if ((val
& ~7) == 16 || val
== 0xffff)
11913 else if ((val
& ~7) == 32 || val
== 0xffffffff)
11917 output_operand_lossage ("invalid operand for '%%%c'", code
);
11927 if (!CONST_INT_P (x
) || (n
= exact_log2 (INTVAL (x
))) < 0)
11929 output_operand_lossage ("invalid operand for '%%%c'", code
);
11933 asm_fprintf (f
, "%d", n
);
11938 if (!CONST_INT_P (x
))
11940 output_operand_lossage ("invalid operand for '%%%c'", code
);
11944 asm_fprintf (f
, "%u", popcount_hwi (INTVAL (x
)));
11948 if (x
== const0_rtx
)
11950 asm_fprintf (f
, "xzr");
11954 if (!REG_P (x
) || !GP_REGNUM_P (REGNO (x
) + 1))
11956 output_operand_lossage ("invalid operand for '%%%c'", code
);
11960 asm_fprintf (f
, "%s", reg_names
[REGNO (x
) + 1]);
11965 x
= aarch64_bit_representation (unwrap_const_vec_duplicate (x
));
11966 if (CONST_INT_P (x
))
11967 asm_fprintf (f
, "%wd", INTVAL (x
));
11970 output_operand_lossage ("invalid operand for '%%%c'", code
);
11980 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
11981 if (x
== const_true_rtx
)
11988 if (!COMPARISON_P (x
))
11990 output_operand_lossage ("invalid operand for '%%%c'", code
);
11994 cond_code
= aarch64_get_condition_code (x
);
11995 gcc_assert (cond_code
>= 0);
11997 cond_code
= AARCH64_INVERSE_CONDITION_CODE (cond_code
);
11998 if (GET_MODE (XEXP (x
, 0)) == CC_NZCmode
)
11999 fputs (aarch64_sve_condition_codes
[cond_code
], f
);
12001 fputs (aarch64_condition_codes
[cond_code
], f
);
12006 if (!const_vec_duplicate_p (x
, &elt
))
12008 output_operand_lossage ("invalid vector constant");
12012 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
12013 asm_fprintf (f
, "%wd", (HOST_WIDE_INT
) -UINTVAL (elt
));
12014 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
12015 && aarch64_print_vector_float_operand (f
, x
, true))
12019 output_operand_lossage ("invalid vector constant");
12029 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
12031 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
12034 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - V0_REGNUM
);
12041 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
12043 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
12046 asm_fprintf (f
, "%c%d",
12047 aarch64_sve_data_mode_p (GET_MODE (x
)) ? 'z' : 'v',
12048 REGNO (x
) - V0_REGNUM
+ (code
- 'S'));
12052 if (REG_P (x
) && FP_REGNUM_P (REGNO (x
))
12053 && (aarch64_advsimd_partial_struct_mode_p (GET_MODE (x
))))
12054 asm_fprintf (f
, "d%d", REGNO (x
) - V0_REGNUM
+ 1);
12055 else if (REG_P (x
) && FP_REGNUM_P (REGNO (x
)))
12056 asm_fprintf (f
, "q%d", REGNO (x
) - V0_REGNUM
+ 1);
12057 else if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
12058 asm_fprintf (f
, "x%d", REGNO (x
) - R0_REGNUM
+ 1);
12060 output_operand_lossage ("incompatible register operand for '%%%c'",
12065 if (!CONST_INT_P (x
))
12067 output_operand_lossage ("invalid operand for '%%%c'", code
);
12070 asm_fprintf (f
, "0x%wx", UINTVAL (x
) & 0xffff);
12075 /* Print a replicated constant in hex. */
12076 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
12078 output_operand_lossage ("invalid operand for '%%%c'", code
);
12081 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
12082 asm_fprintf (f
, "0x%wx", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
12088 /* Print a replicated constant in decimal, treating it as
12090 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
12092 output_operand_lossage ("invalid operand for '%%%c'", code
);
12095 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
12096 asm_fprintf (f
, "%wd", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
12102 if (x
== const0_rtx
12103 || (CONST_DOUBLE_P (x
) && aarch64_float_const_zero_rtx_p (x
)))
12105 asm_fprintf (f
, "%czr", code
);
12109 if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
12111 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - R0_REGNUM
);
12115 if (REG_P (x
) && REGNO (x
) == SP_REGNUM
)
12117 asm_fprintf (f
, "%ssp", code
== 'w' ? "w" : "");
12126 output_operand_lossage ("missing operand");
12130 switch (GET_CODE (x
))
12133 if (aarch64_sve_data_mode_p (GET_MODE (x
)))
12135 if (REG_NREGS (x
) == 1)
12136 asm_fprintf (f
, "z%d", REGNO (x
) - V0_REGNUM
);
12140 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x
)));
12141 asm_fprintf (f
, "{z%d.%c - z%d.%c}",
12142 REGNO (x
) - V0_REGNUM
, suffix
,
12143 END_REGNO (x
) - V0_REGNUM
- 1, suffix
);
12147 asm_fprintf (f
, "%s", reg_names
[REGNO (x
)]);
12151 output_address (GET_MODE (x
), XEXP (x
, 0));
12156 output_addr_const (asm_out_file
, x
);
12160 asm_fprintf (f
, "%wd", INTVAL (x
));
12164 if (!VECTOR_MODE_P (GET_MODE (x
)))
12166 output_addr_const (asm_out_file
, x
);
12172 if (!const_vec_duplicate_p (x
, &elt
))
12174 output_operand_lossage ("invalid vector constant");
12178 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
12179 asm_fprintf (f
, "%wd", INTVAL (elt
));
12180 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
12181 && aarch64_print_vector_float_operand (f
, x
, false))
12185 output_operand_lossage ("invalid vector constant");
12191 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
12192 be getting CONST_DOUBLEs holding integers. */
12193 gcc_assert (GET_MODE (x
) != VOIDmode
);
12194 if (aarch64_float_const_zero_rtx_p (x
))
12199 else if (aarch64_float_const_representable_p (x
))
12201 #define buf_size 20
12202 char float_buf
[buf_size
] = {'\0'};
12203 real_to_decimal_for_mode (float_buf
,
12204 CONST_DOUBLE_REAL_VALUE (x
),
12205 buf_size
, buf_size
,
12207 asm_fprintf (asm_out_file
, "%s", float_buf
);
12211 output_operand_lossage ("invalid constant");
12214 output_operand_lossage ("invalid operand");
12220 if (GET_CODE (x
) == HIGH
)
12223 switch (aarch64_classify_symbolic_expression (x
))
12225 case SYMBOL_SMALL_GOT_4G
:
12226 asm_fprintf (asm_out_file
, ":got:");
12229 case SYMBOL_SMALL_TLSGD
:
12230 asm_fprintf (asm_out_file
, ":tlsgd:");
12233 case SYMBOL_SMALL_TLSDESC
:
12234 asm_fprintf (asm_out_file
, ":tlsdesc:");
12237 case SYMBOL_SMALL_TLSIE
:
12238 asm_fprintf (asm_out_file
, ":gottprel:");
12241 case SYMBOL_TLSLE24
:
12242 asm_fprintf (asm_out_file
, ":tprel:");
12245 case SYMBOL_TINY_GOT
:
12246 gcc_unreachable ();
12252 output_addr_const (asm_out_file
, x
);
12256 switch (aarch64_classify_symbolic_expression (x
))
12258 case SYMBOL_SMALL_GOT_4G
:
12259 asm_fprintf (asm_out_file
, ":got_lo12:");
12262 case SYMBOL_SMALL_TLSGD
:
12263 asm_fprintf (asm_out_file
, ":tlsgd_lo12:");
12266 case SYMBOL_SMALL_TLSDESC
:
12267 asm_fprintf (asm_out_file
, ":tlsdesc_lo12:");
12270 case SYMBOL_SMALL_TLSIE
:
12271 asm_fprintf (asm_out_file
, ":gottprel_lo12:");
12274 case SYMBOL_TLSLE12
:
12275 asm_fprintf (asm_out_file
, ":tprel_lo12:");
12278 case SYMBOL_TLSLE24
:
12279 asm_fprintf (asm_out_file
, ":tprel_lo12_nc:");
12282 case SYMBOL_TINY_GOT
:
12283 asm_fprintf (asm_out_file
, ":got:");
12286 case SYMBOL_TINY_TLSIE
:
12287 asm_fprintf (asm_out_file
, ":gottprel:");
12293 output_addr_const (asm_out_file
, x
);
12297 switch (aarch64_classify_symbolic_expression (x
))
12299 case SYMBOL_TLSLE24
:
12300 asm_fprintf (asm_out_file
, ":tprel_hi12:");
12305 output_addr_const (asm_out_file
, x
);
12310 HOST_WIDE_INT cond_code
;
12312 if (!CONST_INT_P (x
))
12314 output_operand_lossage ("invalid operand for '%%%c'", code
);
12318 cond_code
= INTVAL (x
);
12319 gcc_assert (cond_code
>= 0 && cond_code
<= AARCH64_NV
);
12320 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
]);
12327 machine_mode mode
= GET_MODE (x
);
12331 && maybe_ne (GET_MODE_SIZE (mode
), 8)
12332 && maybe_ne (GET_MODE_SIZE (mode
), 16)))
12334 output_operand_lossage ("invalid operand for '%%%c'", code
);
12338 if (!aarch64_print_address_internal (f
, mode
, XEXP (x
, 0),
12340 ? ADDR_QUERY_LDP_STP_N
12341 : ADDR_QUERY_LDP_STP
))
12342 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
12347 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
12352 /* Print address 'x' of a memory access with mode 'mode'.
12353 'op' is the context required by aarch64_classify_address. It can either be
12354 MEM for a normal memory access or PARALLEL for LDP/STP. */
12356 aarch64_print_address_internal (FILE *f
, machine_mode mode
, rtx x
,
12357 aarch64_addr_query_type type
)
12359 struct aarch64_address_info addr
;
12360 unsigned int size
, vec_flags
;
12362 /* Check all addresses are Pmode - including ILP32. */
12363 if (GET_MODE (x
) != Pmode
12364 && (!CONST_INT_P (x
)
12365 || trunc_int_for_mode (INTVAL (x
), Pmode
) != INTVAL (x
)))
12367 output_operand_lossage ("invalid address mode");
12371 if (aarch64_classify_address (&addr
, x
, mode
, true, type
))
12374 case ADDRESS_REG_IMM
:
12375 if (known_eq (addr
.const_offset
, 0))
12377 asm_fprintf (f
, "[%s]", reg_names
[REGNO (addr
.base
)]);
12381 vec_flags
= aarch64_classify_vector_mode (mode
);
12382 if (vec_flags
& VEC_ANY_SVE
)
12385 = exact_div (addr
.const_offset
,
12386 aarch64_vl_bytes (mode
, vec_flags
)).to_constant ();
12387 asm_fprintf (f
, "[%s, #%wd, mul vl]",
12388 reg_names
[REGNO (addr
.base
)], vnum
);
12392 asm_fprintf (f
, "[%s, %wd]", reg_names
[REGNO (addr
.base
)],
12393 INTVAL (addr
.offset
));
12396 case ADDRESS_REG_REG
:
12397 if (addr
.shift
== 0)
12398 asm_fprintf (f
, "[%s, %s]", reg_names
[REGNO (addr
.base
)],
12399 reg_names
[REGNO (addr
.offset
)]);
12401 asm_fprintf (f
, "[%s, %s, lsl %u]", reg_names
[REGNO (addr
.base
)],
12402 reg_names
[REGNO (addr
.offset
)], addr
.shift
);
12405 case ADDRESS_REG_UXTW
:
12406 if (addr
.shift
== 0)
12407 asm_fprintf (f
, "[%s, w%d, uxtw]", reg_names
[REGNO (addr
.base
)],
12408 REGNO (addr
.offset
) - R0_REGNUM
);
12410 asm_fprintf (f
, "[%s, w%d, uxtw %u]", reg_names
[REGNO (addr
.base
)],
12411 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
12414 case ADDRESS_REG_SXTW
:
12415 if (addr
.shift
== 0)
12416 asm_fprintf (f
, "[%s, w%d, sxtw]", reg_names
[REGNO (addr
.base
)],
12417 REGNO (addr
.offset
) - R0_REGNUM
);
12419 asm_fprintf (f
, "[%s, w%d, sxtw %u]", reg_names
[REGNO (addr
.base
)],
12420 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
12423 case ADDRESS_REG_WB
:
12424 /* Writeback is only supported for fixed-width modes. */
12425 size
= GET_MODE_SIZE (mode
).to_constant ();
12426 switch (GET_CODE (x
))
12429 asm_fprintf (f
, "[%s, %d]!", reg_names
[REGNO (addr
.base
)], size
);
12432 asm_fprintf (f
, "[%s], %d", reg_names
[REGNO (addr
.base
)], size
);
12435 asm_fprintf (f
, "[%s, -%d]!", reg_names
[REGNO (addr
.base
)], size
);
12438 asm_fprintf (f
, "[%s], -%d", reg_names
[REGNO (addr
.base
)], size
);
12441 asm_fprintf (f
, "[%s, %wd]!", reg_names
[REGNO (addr
.base
)],
12442 INTVAL (addr
.offset
));
12445 asm_fprintf (f
, "[%s], %wd", reg_names
[REGNO (addr
.base
)],
12446 INTVAL (addr
.offset
));
12453 case ADDRESS_LO_SUM
:
12454 asm_fprintf (f
, "[%s, #:lo12:", reg_names
[REGNO (addr
.base
)]);
12455 output_addr_const (f
, addr
.offset
);
12456 asm_fprintf (f
, "]");
12459 case ADDRESS_SYMBOLIC
:
12460 output_addr_const (f
, x
);
12467 /* Print address 'x' of a memory access with mode 'mode'. */
12469 aarch64_print_operand_address (FILE *f
, machine_mode mode
, rtx x
)
12471 if (!aarch64_print_address_internal (f
, mode
, x
, ADDR_QUERY_ANY
))
12472 output_addr_const (f
, x
);
12475 /* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
12478 aarch64_output_addr_const_extra (FILE *file
, rtx x
)
12480 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_SALT_ADDR
)
12482 output_addr_const (file
, XVECEXP (x
, 0, 0));
12489 aarch64_label_mentioned_p (rtx x
)
12494 if (LABEL_REF_P (x
))
12497 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
12498 referencing instruction, but they are constant offsets, not
12500 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
12503 fmt
= GET_RTX_FORMAT (GET_CODE (x
));
12504 for (i
= GET_RTX_LENGTH (GET_CODE (x
)) - 1; i
>= 0; i
--)
12510 for (j
= XVECLEN (x
, i
) - 1; j
>= 0; j
--)
12511 if (aarch64_label_mentioned_p (XVECEXP (x
, i
, j
)))
12514 else if (fmt
[i
] == 'e' && aarch64_label_mentioned_p (XEXP (x
, i
)))
12521 /* Implement REGNO_REG_CLASS. */
12524 aarch64_regno_regclass (unsigned regno
)
12526 if (STUB_REGNUM_P (regno
))
12529 if (GP_REGNUM_P (regno
))
12530 return GENERAL_REGS
;
12532 if (regno
== SP_REGNUM
)
12535 if (regno
== FRAME_POINTER_REGNUM
12536 || regno
== ARG_POINTER_REGNUM
)
12537 return POINTER_REGS
;
12539 if (FP_REGNUM_P (regno
))
12540 return (FP_LO8_REGNUM_P (regno
) ? FP_LO8_REGS
12541 : FP_LO_REGNUM_P (regno
) ? FP_LO_REGS
: FP_REGS
);
12543 if (PR_REGNUM_P (regno
))
12544 return PR_LO_REGNUM_P (regno
) ? PR_LO_REGS
: PR_HI_REGS
;
12546 if (regno
== FFR_REGNUM
|| regno
== FFRT_REGNUM
)
12552 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
12553 If OFFSET is out of range, return an offset of an anchor point
12554 that is in range. Return 0 otherwise. */
12556 static HOST_WIDE_INT
12557 aarch64_anchor_offset (HOST_WIDE_INT offset
, HOST_WIDE_INT size
,
12560 /* Does it look like we'll need a 16-byte load/store-pair operation? */
12562 return (offset
+ 0x400) & ~0x7f0;
12564 /* For offsets that aren't a multiple of the access size, the limit is
12566 if (offset
& (size
- 1))
12568 /* BLKmode typically uses LDP of X-registers. */
12569 if (mode
== BLKmode
)
12570 return (offset
+ 512) & ~0x3ff;
12571 return (offset
+ 0x100) & ~0x1ff;
12574 /* Small negative offsets are supported. */
12575 if (IN_RANGE (offset
, -256, 0))
12578 if (mode
== TImode
|| mode
== TFmode
|| mode
== TDmode
)
12579 return (offset
+ 0x100) & ~0x1ff;
12581 /* Use 12-bit offset by access size. */
12582 return offset
& (~0xfff * size
);
12586 aarch64_legitimize_address (rtx x
, rtx
/* orig_x */, machine_mode mode
)
12588 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
12589 where mask is selected by alignment and size of the offset.
12590 We try to pick as large a range for the offset as possible to
12591 maximize the chance of a CSE. However, for aligned addresses
12592 we limit the range to 4k so that structures with different sized
12593 elements are likely to use the same base. We need to be careful
12594 not to split a CONST for some forms of address expression, otherwise
12595 it will generate sub-optimal code. */
12597 if (GET_CODE (x
) == PLUS
&& CONST_INT_P (XEXP (x
, 1)))
12599 rtx base
= XEXP (x
, 0);
12600 rtx offset_rtx
= XEXP (x
, 1);
12601 HOST_WIDE_INT offset
= INTVAL (offset_rtx
);
12603 if (GET_CODE (base
) == PLUS
)
12605 rtx op0
= XEXP (base
, 0);
12606 rtx op1
= XEXP (base
, 1);
12608 /* Force any scaling into a temp for CSE. */
12609 op0
= force_reg (Pmode
, op0
);
12610 op1
= force_reg (Pmode
, op1
);
12612 /* Let the pointer register be in op0. */
12613 if (REG_POINTER (op1
))
12614 std::swap (op0
, op1
);
12616 /* If the pointer is virtual or frame related, then we know that
12617 virtual register instantiation or register elimination is going
12618 to apply a second constant. We want the two constants folded
12619 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
12620 if (virt_or_elim_regno_p (REGNO (op0
)))
12622 base
= expand_binop (Pmode
, add_optab
, op0
, offset_rtx
,
12623 NULL_RTX
, true, OPTAB_DIRECT
);
12624 return gen_rtx_PLUS (Pmode
, base
, op1
);
12627 /* Otherwise, in order to encourage CSE (and thence loop strength
12628 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
12629 base
= expand_binop (Pmode
, add_optab
, op0
, op1
,
12630 NULL_RTX
, true, OPTAB_DIRECT
);
12631 x
= gen_rtx_PLUS (Pmode
, base
, offset_rtx
);
12634 HOST_WIDE_INT size
;
12635 if (GET_MODE_SIZE (mode
).is_constant (&size
))
12637 HOST_WIDE_INT base_offset
= aarch64_anchor_offset (offset
, size
,
12639 if (base_offset
!= 0)
12641 base
= plus_constant (Pmode
, base
, base_offset
);
12642 base
= force_operand (base
, NULL_RTX
);
12643 return plus_constant (Pmode
, base
, offset
- base_offset
);
12652 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED
, rtx x
,
12653 reg_class_t rclass
,
12655 secondary_reload_info
*sri
)
12657 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
12658 LDR and STR. See the comment at the head of aarch64-sve.md for
12659 more details about the big-endian handling. */
12660 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
12661 if (reg_class_subset_p (rclass
, FP_REGS
)
12662 && !((REG_P (x
) && HARD_REGISTER_P (x
))
12663 || aarch64_simd_valid_immediate (x
, NULL
))
12664 && mode
!= VNx16QImode
12665 && (vec_flags
& VEC_SVE_DATA
)
12666 && ((vec_flags
& VEC_PARTIAL
) || BYTES_BIG_ENDIAN
))
12668 sri
->icode
= CODE_FOR_aarch64_sve_reload_mem
;
12672 /* If we have to disable direct literal pool loads and stores because the
12673 function is too big, then we need a scratch register. */
12674 if (MEM_P (x
) && SYMBOL_REF_P (x
) && CONSTANT_POOL_ADDRESS_P (x
)
12675 && (SCALAR_FLOAT_MODE_P (GET_MODE (x
))
12676 || targetm
.vector_mode_supported_p (GET_MODE (x
)))
12677 && !aarch64_pcrelative_literal_loads
)
12679 sri
->icode
= code_for_aarch64_reload_movcp (mode
, DImode
);
12683 /* Without the TARGET_SIMD instructions we cannot move a Q register
12684 to a Q register directly. We need a scratch. */
12689 || (vec_flags
== VEC_ADVSIMD
&& known_eq (GET_MODE_SIZE (mode
), 16)))
12690 && mode
== GET_MODE (x
)
12692 && FP_REGNUM_P (REGNO (x
))
12693 && reg_class_subset_p (rclass
, FP_REGS
))
12695 sri
->icode
= code_for_aarch64_reload_mov (mode
);
12699 /* A TFmode, TImode or TDmode memory access should be handled via an FP_REGS
12700 because AArch64 has richer addressing modes for LDR/STR instructions
12701 than LDP/STP instructions. */
12702 if (TARGET_FLOAT
&& rclass
== GENERAL_REGS
12703 && known_eq (GET_MODE_SIZE (mode
), 16) && MEM_P (x
))
12706 if (rclass
== FP_REGS
12707 && (mode
== TImode
|| mode
== TFmode
|| mode
== TDmode
)
12709 return GENERAL_REGS
;
12714 /* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
12717 aarch64_secondary_memory_needed (machine_mode mode
, reg_class_t class1
,
12718 reg_class_t class2
)
12721 && reg_classes_intersect_p (class1
, FP_REGS
)
12722 && reg_classes_intersect_p (class2
, FP_REGS
))
12724 /* We can't do a 128-bit FPR-to-FPR move without TARGET_SIMD,
12725 so we can't easily split a move involving tuples of 128-bit
12726 vectors. Force the copy through memory instead.
12728 (Tuples of 64-bit vectors are fine.) */
12729 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
12730 if (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
12737 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED
, const int to
)
12739 gcc_assert (from
== ARG_POINTER_REGNUM
|| from
== FRAME_POINTER_REGNUM
);
12741 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
12742 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
12743 if (frame_pointer_needed
)
12744 return to
== HARD_FRAME_POINTER_REGNUM
;
12749 aarch64_initial_elimination_offset (unsigned from
, unsigned to
)
12751 if (to
== HARD_FRAME_POINTER_REGNUM
)
12753 if (from
== ARG_POINTER_REGNUM
)
12754 return cfun
->machine
->frame
.hard_fp_offset
;
12756 if (from
== FRAME_POINTER_REGNUM
)
12757 return cfun
->machine
->frame
.hard_fp_offset
12758 - cfun
->machine
->frame
.locals_offset
;
12761 if (to
== STACK_POINTER_REGNUM
)
12763 if (from
== FRAME_POINTER_REGNUM
)
12764 return cfun
->machine
->frame
.frame_size
12765 - cfun
->machine
->frame
.locals_offset
;
12768 return cfun
->machine
->frame
.frame_size
;
12772 /* Get return address without mangling. */
12775 aarch64_return_addr_rtx (void)
12777 rtx val
= get_hard_reg_initial_val (Pmode
, LR_REGNUM
);
12778 /* Note: aarch64_return_address_signing_enabled only
12779 works after cfun->machine->frame.laid_out is set,
12780 so here we don't know if the return address will
12781 be signed or not. */
12782 rtx lr
= gen_rtx_REG (Pmode
, LR_REGNUM
);
12783 emit_move_insn (lr
, val
);
12784 emit_insn (GEN_FCN (CODE_FOR_xpaclri
) ());
12789 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
12793 aarch64_return_addr (int count
, rtx frame ATTRIBUTE_UNUSED
)
12797 return aarch64_return_addr_rtx ();
12801 aarch64_asm_trampoline_template (FILE *f
)
12803 /* Even if the current function doesn't have branch protection, some
12804 later function might, so since this template is only generated once
12805 we have to add a BTI just in case. */
12806 asm_fprintf (f
, "\thint\t34 // bti c\n");
12810 asm_fprintf (f
, "\tldr\tw%d, .+20\n", IP1_REGNUM
- R0_REGNUM
);
12811 asm_fprintf (f
, "\tldr\tw%d, .+20\n", STATIC_CHAIN_REGNUM
- R0_REGNUM
);
12815 asm_fprintf (f
, "\tldr\t%s, .+20\n", reg_names
[IP1_REGNUM
]);
12816 asm_fprintf (f
, "\tldr\t%s, .+24\n", reg_names
[STATIC_CHAIN_REGNUM
]);
12818 asm_fprintf (f
, "\tbr\t%s\n", reg_names
[IP1_REGNUM
]);
12820 /* We always emit a speculation barrier.
12821 This is because the same trampoline template is used for every nested
12822 function. Since nested functions are not particularly common or
12823 performant we don't worry too much about the extra instructions to copy
12825 This is not yet a problem, since we have not yet implemented function
12826 specific attributes to choose between hardening against straight line
12827 speculation or not, but such function specific attributes are likely to
12828 happen in the future. */
12829 asm_fprintf (f
, "\tdsb\tsy\n\tisb\n");
12831 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
12832 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
12836 aarch64_trampoline_init (rtx m_tramp
, tree fndecl
, rtx chain_value
)
12838 rtx fnaddr
, mem
, a_tramp
;
12839 const int tramp_code_sz
= 24;
12841 /* Don't need to copy the trailing D-words, we fill those in below. */
12842 /* We create our own memory address in Pmode so that `emit_block_move` can
12843 use parts of the backend which expect Pmode addresses. */
12844 rtx temp
= convert_memory_address (Pmode
, XEXP (m_tramp
, 0));
12845 emit_block_move (gen_rtx_MEM (BLKmode
, temp
),
12846 assemble_trampoline_template (),
12847 GEN_INT (tramp_code_sz
), BLOCK_OP_NORMAL
);
12848 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
);
12849 fnaddr
= XEXP (DECL_RTL (fndecl
), 0);
12850 if (GET_MODE (fnaddr
) != ptr_mode
)
12851 fnaddr
= convert_memory_address (ptr_mode
, fnaddr
);
12852 emit_move_insn (mem
, fnaddr
);
12854 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
+ POINTER_BYTES
);
12855 emit_move_insn (mem
, chain_value
);
12857 /* XXX We should really define a "clear_cache" pattern and use
12858 gen_clear_cache(). */
12859 a_tramp
= XEXP (m_tramp
, 0);
12860 maybe_emit_call_builtin___clear_cache (a_tramp
,
12861 plus_constant (ptr_mode
,
12866 static unsigned char
12867 aarch64_class_max_nregs (reg_class_t regclass
, machine_mode mode
)
12869 /* ??? Logically we should only need to provide a value when
12870 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
12871 can hold MODE, but at the moment we need to handle all modes.
12872 Just ignore any runtime parts for registers that can't store them. */
12873 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
12874 unsigned int nregs
, vec_flags
;
12878 case TAILCALL_ADDR_REGS
:
12882 case POINTER_AND_FP_REGS
:
12886 vec_flags
= aarch64_classify_vector_mode (mode
);
12887 if ((vec_flags
& VEC_SVE_DATA
)
12888 && constant_multiple_p (GET_MODE_SIZE (mode
),
12889 aarch64_vl_bytes (mode
, vec_flags
), &nregs
))
12891 return (vec_flags
& VEC_ADVSIMD
12892 ? CEIL (lowest_size
, UNITS_PER_VREG
)
12893 : CEIL (lowest_size
, UNITS_PER_WORD
));
12899 case PR_AND_FFR_REGS
:
12908 gcc_unreachable ();
12912 aarch64_preferred_reload_class (rtx x
, reg_class_t regclass
)
12914 if (regclass
== POINTER_REGS
)
12915 return GENERAL_REGS
;
12917 if (regclass
== STACK_REG
)
12920 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x
)), POINTER_REGS
))
12926 /* Register eliminiation can result in a request for
12927 SP+constant->FP_REGS. We cannot support such operations which
12928 use SP as source and an FP_REG as destination, so reject out
12930 if (! reg_class_subset_p (regclass
, GENERAL_REGS
) && GET_CODE (x
) == PLUS
)
12932 rtx lhs
= XEXP (x
, 0);
12934 /* Look through a possible SUBREG introduced by ILP32. */
12935 if (SUBREG_P (lhs
))
12936 lhs
= SUBREG_REG (lhs
);
12938 gcc_assert (REG_P (lhs
));
12939 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs
)),
12948 aarch64_asm_output_labelref (FILE* f
, const char *name
)
12950 asm_fprintf (f
, "%U%s", name
);
12954 aarch64_elf_asm_constructor (rtx symbol
, int priority
)
12956 if (priority
== DEFAULT_INIT_PRIORITY
)
12957 default_ctor_section_asm_out_constructor (symbol
, priority
);
12961 /* While priority is known to be in range [0, 65535], so 18 bytes
12962 would be enough, the compiler might not know that. To avoid
12963 -Wformat-truncation false positive, use a larger size. */
12965 snprintf (buf
, sizeof (buf
), ".init_array.%.5u", priority
);
12966 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
12967 switch_to_section (s
);
12968 assemble_align (POINTER_SIZE
);
12969 assemble_aligned_integer (POINTER_BYTES
, symbol
);
12974 aarch64_elf_asm_destructor (rtx symbol
, int priority
)
12976 if (priority
== DEFAULT_INIT_PRIORITY
)
12977 default_dtor_section_asm_out_destructor (symbol
, priority
);
12981 /* While priority is known to be in range [0, 65535], so 18 bytes
12982 would be enough, the compiler might not know that. To avoid
12983 -Wformat-truncation false positive, use a larger size. */
12985 snprintf (buf
, sizeof (buf
), ".fini_array.%.5u", priority
);
12986 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
12987 switch_to_section (s
);
12988 assemble_align (POINTER_SIZE
);
12989 assemble_aligned_integer (POINTER_BYTES
, symbol
);
12994 aarch64_output_casesi (rtx
*operands
)
12998 rtx diff_vec
= PATTERN (NEXT_INSN (as_a
<rtx_insn
*> (operands
[2])));
13000 static const char *const patterns
[4][2] =
13003 "ldrb\t%w3, [%0,%w1,uxtw]",
13004 "add\t%3, %4, %w3, sxtb #2"
13007 "ldrh\t%w3, [%0,%w1,uxtw #1]",
13008 "add\t%3, %4, %w3, sxth #2"
13011 "ldr\t%w3, [%0,%w1,uxtw #2]",
13012 "add\t%3, %4, %w3, sxtw #2"
13014 /* We assume that DImode is only generated when not optimizing and
13015 that we don't really need 64-bit address offsets. That would
13016 imply an object file with 8GB of code in a single function! */
13018 "ldr\t%w3, [%0,%w1,uxtw #2]",
13019 "add\t%3, %4, %w3, sxtw #2"
13023 gcc_assert (GET_CODE (diff_vec
) == ADDR_DIFF_VEC
);
13025 scalar_int_mode mode
= as_a
<scalar_int_mode
> (GET_MODE (diff_vec
));
13026 index
= exact_log2 (GET_MODE_SIZE (mode
));
13028 gcc_assert (index
>= 0 && index
<= 3);
13030 /* Need to implement table size reduction, by chaning the code below. */
13031 output_asm_insn (patterns
[index
][0], operands
);
13032 ASM_GENERATE_INTERNAL_LABEL (label
, "Lrtx", CODE_LABEL_NUMBER (operands
[2]));
13033 snprintf (buf
, sizeof (buf
),
13034 "adr\t%%4, %s", targetm
.strip_name_encoding (label
));
13035 output_asm_insn (buf
, operands
);
13036 output_asm_insn (patterns
[index
][1], operands
);
13037 output_asm_insn ("br\t%3", operands
);
13038 output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
13040 assemble_label (asm_out_file
, label
);
13045 /* Return size in bits of an arithmetic operand which is shifted/scaled and
13046 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
13050 aarch64_uxt_size (int shift
, HOST_WIDE_INT mask
)
13052 if (shift
>= 0 && shift
<= 3)
13055 for (size
= 8; size
<= 32; size
*= 2)
13057 HOST_WIDE_INT bits
= ((HOST_WIDE_INT
)1U << size
) - 1;
13058 if (mask
== bits
<< shift
)
13065 /* Constant pools are per function only when PC relative
13066 literal loads are true or we are in the large memory
13070 aarch64_can_use_per_function_literal_pools_p (void)
13072 return (aarch64_pcrelative_literal_loads
13073 || aarch64_cmodel
== AARCH64_CMODEL_LARGE
);
13077 aarch64_use_blocks_for_constant_p (machine_mode
, const_rtx
)
13079 /* We can't use blocks for constants when we're using a per-function
13081 return !aarch64_can_use_per_function_literal_pools_p ();
13084 /* Select appropriate section for constants depending
13085 on where we place literal pools. */
13088 aarch64_select_rtx_section (machine_mode mode
,
13090 unsigned HOST_WIDE_INT align
)
13092 if (aarch64_can_use_per_function_literal_pools_p ())
13093 return function_section (current_function_decl
);
13095 return default_elf_select_rtx_section (mode
, x
, align
);
13098 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
13100 aarch64_asm_output_pool_epilogue (FILE *f
, const char *, tree
,
13101 HOST_WIDE_INT offset
)
13103 /* When using per-function literal pools, we must ensure that any code
13104 section is aligned to the minimal instruction length, lest we get
13105 errors from the assembler re "unaligned instructions". */
13106 if ((offset
& 3) && aarch64_can_use_per_function_literal_pools_p ())
13107 ASM_OUTPUT_ALIGN (f
, 2);
13112 /* Helper function for rtx cost calculation. Strip a shift expression
13113 from X. Returns the inner operand if successful, or the original
13114 expression on failure. */
13116 aarch64_strip_shift (rtx x
)
13120 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
13121 we can convert both to ROR during final output. */
13122 if ((GET_CODE (op
) == ASHIFT
13123 || GET_CODE (op
) == ASHIFTRT
13124 || GET_CODE (op
) == LSHIFTRT
13125 || GET_CODE (op
) == ROTATERT
13126 || GET_CODE (op
) == ROTATE
)
13127 && CONST_INT_P (XEXP (op
, 1)))
13128 return XEXP (op
, 0);
13130 if (GET_CODE (op
) == MULT
13131 && CONST_INT_P (XEXP (op
, 1))
13132 && ((unsigned) exact_log2 (INTVAL (XEXP (op
, 1)))) < 64)
13133 return XEXP (op
, 0);
13138 /* Helper function for rtx cost calculation. Strip an extend
13139 expression from X. Returns the inner operand if successful, or the
13140 original expression on failure. We deal with a number of possible
13141 canonicalization variations here. If STRIP_SHIFT is true, then
13142 we can strip off a shift also. */
13144 aarch64_strip_extend (rtx x
, bool strip_shift
)
13146 scalar_int_mode mode
;
13149 if (!is_a
<scalar_int_mode
> (GET_MODE (op
), &mode
))
13152 if (GET_CODE (op
) == AND
13153 && GET_CODE (XEXP (op
, 0)) == MULT
13154 && CONST_INT_P (XEXP (XEXP (op
, 0), 1))
13155 && CONST_INT_P (XEXP (op
, 1))
13156 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op
, 0), 1))),
13157 INTVAL (XEXP (op
, 1))) != 0)
13158 return XEXP (XEXP (op
, 0), 0);
13160 /* Now handle extended register, as this may also have an optional
13161 left shift by 1..4. */
13163 && GET_CODE (op
) == ASHIFT
13164 && CONST_INT_P (XEXP (op
, 1))
13165 && ((unsigned HOST_WIDE_INT
) INTVAL (XEXP (op
, 1))) <= 4)
13168 if (GET_CODE (op
) == ZERO_EXTEND
13169 || GET_CODE (op
) == SIGN_EXTEND
)
13178 /* Helper function for rtx cost calculation. Strip extension as well as any
13179 inner VEC_SELECT high-half from X. Returns the inner vector operand if
13180 successful, or the original expression on failure. */
13182 aarch64_strip_extend_vec_half (rtx x
)
13184 if (GET_CODE (x
) == ZERO_EXTEND
|| GET_CODE (x
) == SIGN_EXTEND
)
13187 if (GET_CODE (x
) == VEC_SELECT
13188 && vec_series_highpart_p (GET_MODE (x
), GET_MODE (XEXP (x
, 0)),
13195 /* Helper function for rtx cost calculation. Strip VEC_DUPLICATE as well as
13196 any subsequent extend and VEC_SELECT from X. Returns the inner scalar
13197 operand if successful, or the original expression on failure. */
13199 aarch64_strip_duplicate_vec_elt (rtx x
)
13201 if (GET_CODE (x
) == VEC_DUPLICATE
13202 && is_a
<scalar_mode
> (GET_MODE (XEXP (x
, 0))))
13205 if (GET_CODE (x
) == VEC_SELECT
)
13207 else if ((GET_CODE (x
) == ZERO_EXTEND
|| GET_CODE (x
) == SIGN_EXTEND
)
13208 && GET_CODE (XEXP (x
, 0)) == VEC_SELECT
)
13209 x
= XEXP (XEXP (x
, 0), 0);
13214 /* Return true iff CODE is a shift supported in combination
13215 with arithmetic instructions. */
13218 aarch64_shift_p (enum rtx_code code
)
13220 return code
== ASHIFT
|| code
== ASHIFTRT
|| code
== LSHIFTRT
;
13224 /* Return true iff X is a cheap shift without a sign extend. */
13227 aarch64_cheap_mult_shift_p (rtx x
)
13234 if (!(aarch64_tune_params
.extra_tuning_flags
13235 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
))
13238 if (GET_CODE (op0
) == SIGN_EXTEND
)
13241 if (GET_CODE (x
) == ASHIFT
&& CONST_INT_P (op1
)
13242 && UINTVAL (op1
) <= 4)
13245 if (GET_CODE (x
) != MULT
|| !CONST_INT_P (op1
))
13248 HOST_WIDE_INT l2
= exact_log2 (INTVAL (op1
));
13250 if (l2
> 0 && l2
<= 4)
13256 /* Helper function for rtx cost calculation. Calculate the cost of
13257 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
13258 Return the calculated cost of the expression, recursing manually in to
13259 operands where needed. */
13262 aarch64_rtx_mult_cost (rtx x
, enum rtx_code code
, int outer
, bool speed
)
13265 const struct cpu_cost_table
*extra_cost
13266 = aarch64_tune_params
.insn_extra_cost
;
13268 bool compound_p
= (outer
== PLUS
|| outer
== MINUS
);
13269 machine_mode mode
= GET_MODE (x
);
13271 gcc_checking_assert (code
== MULT
);
13276 if (VECTOR_MODE_P (mode
))
13278 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
13279 if (TARGET_SIMD
&& (vec_flags
& VEC_ADVSIMD
))
13281 /* The select-operand-high-half versions of the instruction have the
13282 same cost as the three vector version - don't add the costs of the
13283 extension or selection into the costs of the multiply. */
13284 op0
= aarch64_strip_extend_vec_half (op0
);
13285 op1
= aarch64_strip_extend_vec_half (op1
);
13286 /* The by-element versions of the instruction have the same costs as
13287 the normal 3-vector version. We make an assumption that the input
13288 to the VEC_DUPLICATE is already on the FP & SIMD side. This means
13289 costing of a MUL by element pre RA is a bit optimistic. */
13290 op0
= aarch64_strip_duplicate_vec_elt (op0
);
13291 op1
= aarch64_strip_duplicate_vec_elt (op1
);
13293 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
13294 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
13297 if (GET_CODE (x
) == MULT
)
13298 cost
+= extra_cost
->vect
.mult
;
13299 /* This is to catch the SSRA costing currently flowing here. */
13301 cost
+= extra_cost
->vect
.alu
;
13306 /* Integer multiply/fma. */
13307 if (GET_MODE_CLASS (mode
) == MODE_INT
)
13309 /* The multiply will be canonicalized as a shift, cost it as such. */
13310 if (aarch64_shift_p (GET_CODE (x
))
13311 || (CONST_INT_P (op1
)
13312 && exact_log2 (INTVAL (op1
)) > 0))
13314 bool is_extend
= GET_CODE (op0
) == ZERO_EXTEND
13315 || GET_CODE (op0
) == SIGN_EXTEND
;
13320 /* If the shift is considered cheap,
13321 then don't add any cost. */
13322 if (aarch64_cheap_mult_shift_p (x
))
13324 else if (REG_P (op1
))
13325 /* ARITH + shift-by-register. */
13326 cost
+= extra_cost
->alu
.arith_shift_reg
;
13327 else if (is_extend
)
13328 /* ARITH + extended register. We don't have a cost field
13329 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
13330 cost
+= extra_cost
->alu
.extend_arith
;
13332 /* ARITH + shift-by-immediate. */
13333 cost
+= extra_cost
->alu
.arith_shift
;
13336 /* LSL (immediate). */
13337 cost
+= extra_cost
->alu
.shift
;
13340 /* Strip extends as we will have costed them in the case above. */
13342 op0
= aarch64_strip_extend (op0
, true);
13344 cost
+= rtx_cost (op0
, VOIDmode
, code
, 0, speed
);
13349 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
13350 compound and let the below cases handle it. After all, MNEG is a
13351 special-case alias of MSUB. */
13352 if (GET_CODE (op0
) == NEG
)
13354 op0
= XEXP (op0
, 0);
13358 /* Integer multiplies or FMAs have zero/sign extending variants. */
13359 if ((GET_CODE (op0
) == ZERO_EXTEND
13360 && GET_CODE (op1
) == ZERO_EXTEND
)
13361 || (GET_CODE (op0
) == SIGN_EXTEND
13362 && GET_CODE (op1
) == SIGN_EXTEND
))
13364 cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, MULT
, 0, speed
);
13365 cost
+= rtx_cost (XEXP (op1
, 0), VOIDmode
, MULT
, 1, speed
);
13370 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
13371 cost
+= extra_cost
->mult
[0].extend_add
;
13373 /* MUL/SMULL/UMULL. */
13374 cost
+= extra_cost
->mult
[0].extend
;
13380 /* This is either an integer multiply or a MADD. In both cases
13381 we want to recurse and cost the operands. */
13382 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
13383 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
13389 cost
+= extra_cost
->mult
[mode
== DImode
].add
;
13392 cost
+= extra_cost
->mult
[mode
== DImode
].simple
;
13401 /* Floating-point FMA/FMUL can also support negations of the
13402 operands, unless the rounding mode is upward or downward in
13403 which case FNMUL is different than FMUL with operand negation. */
13404 bool neg0
= GET_CODE (op0
) == NEG
;
13405 bool neg1
= GET_CODE (op1
) == NEG
;
13406 if (compound_p
|| !flag_rounding_math
|| (neg0
&& neg1
))
13409 op0
= XEXP (op0
, 0);
13411 op1
= XEXP (op1
, 0);
13415 /* FMADD/FNMADD/FNMSUB/FMSUB. */
13416 cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
13419 cost
+= extra_cost
->fp
[mode
== DFmode
].mult
;
13422 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
13423 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
13429 aarch64_address_cost (rtx x
,
13431 addr_space_t as ATTRIBUTE_UNUSED
,
13434 enum rtx_code c
= GET_CODE (x
);
13435 const struct cpu_addrcost_table
*addr_cost
= aarch64_tune_params
.addr_cost
;
13436 struct aarch64_address_info info
;
13440 if (!aarch64_classify_address (&info
, x
, mode
, false))
13442 if (GET_CODE (x
) == CONST
|| SYMBOL_REF_P (x
))
13444 /* This is a CONST or SYMBOL ref which will be split
13445 in a different way depending on the code model in use.
13446 Cost it through the generic infrastructure. */
13447 int cost_symbol_ref
= rtx_cost (x
, Pmode
, MEM
, 1, speed
);
13448 /* Divide through by the cost of one instruction to
13449 bring it to the same units as the address costs. */
13450 cost_symbol_ref
/= COSTS_N_INSNS (1);
13451 /* The cost is then the cost of preparing the address,
13452 followed by an immediate (possibly 0) offset. */
13453 return cost_symbol_ref
+ addr_cost
->imm_offset
;
13457 /* This is most likely a jump table from a case
13459 return addr_cost
->register_offset
;
13465 case ADDRESS_LO_SUM
:
13466 case ADDRESS_SYMBOLIC
:
13467 case ADDRESS_REG_IMM
:
13468 cost
+= addr_cost
->imm_offset
;
13471 case ADDRESS_REG_WB
:
13472 if (c
== PRE_INC
|| c
== PRE_DEC
|| c
== PRE_MODIFY
)
13473 cost
+= addr_cost
->pre_modify
;
13474 else if (c
== POST_INC
|| c
== POST_DEC
|| c
== POST_MODIFY
)
13476 unsigned int nvectors
= aarch64_ldn_stn_vectors (mode
);
13478 cost
+= addr_cost
->post_modify_ld3_st3
;
13479 else if (nvectors
== 4)
13480 cost
+= addr_cost
->post_modify_ld4_st4
;
13482 cost
+= addr_cost
->post_modify
;
13485 gcc_unreachable ();
13489 case ADDRESS_REG_REG
:
13490 cost
+= addr_cost
->register_offset
;
13493 case ADDRESS_REG_SXTW
:
13494 cost
+= addr_cost
->register_sextend
;
13497 case ADDRESS_REG_UXTW
:
13498 cost
+= addr_cost
->register_zextend
;
13502 gcc_unreachable ();
13506 if (info
.shift
> 0)
13508 /* For the sake of calculating the cost of the shifted register
13509 component, we can treat same sized modes in the same way. */
13510 if (known_eq (GET_MODE_BITSIZE (mode
), 16))
13511 cost
+= addr_cost
->addr_scale_costs
.hi
;
13512 else if (known_eq (GET_MODE_BITSIZE (mode
), 32))
13513 cost
+= addr_cost
->addr_scale_costs
.si
;
13514 else if (known_eq (GET_MODE_BITSIZE (mode
), 64))
13515 cost
+= addr_cost
->addr_scale_costs
.di
;
13517 /* We can't tell, or this is a 128-bit vector. */
13518 cost
+= addr_cost
->addr_scale_costs
.ti
;
13524 /* Return the cost of a branch. If SPEED_P is true then the compiler is
13525 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
13529 aarch64_branch_cost (bool speed_p
, bool predictable_p
)
13531 /* When optimizing for speed, use the cost of unpredictable branches. */
13532 const struct cpu_branch_cost
*branch_costs
=
13533 aarch64_tune_params
.branch_costs
;
13535 if (!speed_p
|| predictable_p
)
13536 return branch_costs
->predictable
;
13538 return branch_costs
->unpredictable
;
13541 /* Return true if X is a zero or sign extract
13542 usable in an ADD or SUB (extended register) instruction. */
13544 aarch64_rtx_arith_op_extract_p (rtx x
)
13546 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
13548 if (GET_CODE (x
) == SIGN_EXTEND
13549 || GET_CODE (x
) == ZERO_EXTEND
)
13550 return REG_P (XEXP (x
, 0));
13556 aarch64_frint_unspec_p (unsigned int u
)
13560 case UNSPEC_FRINTZ
:
13561 case UNSPEC_FRINTP
:
13562 case UNSPEC_FRINTM
:
13563 case UNSPEC_FRINTA
:
13564 case UNSPEC_FRINTN
:
13565 case UNSPEC_FRINTX
:
13566 case UNSPEC_FRINTI
:
13574 /* Return true iff X is an rtx that will match an extr instruction
13575 i.e. as described in the *extr<mode>5_insn family of patterns.
13576 OP0 and OP1 will be set to the operands of the shifts involved
13577 on success and will be NULL_RTX otherwise. */
13580 aarch64_extr_rtx_p (rtx x
, rtx
*res_op0
, rtx
*res_op1
)
13583 scalar_int_mode mode
;
13584 if (!is_a
<scalar_int_mode
> (GET_MODE (x
), &mode
))
13587 *res_op0
= NULL_RTX
;
13588 *res_op1
= NULL_RTX
;
13590 if (GET_CODE (x
) != IOR
)
13596 if ((GET_CODE (op0
) == ASHIFT
&& GET_CODE (op1
) == LSHIFTRT
)
13597 || (GET_CODE (op1
) == ASHIFT
&& GET_CODE (op0
) == LSHIFTRT
))
13599 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
13600 if (GET_CODE (op1
) == ASHIFT
)
13601 std::swap (op0
, op1
);
13603 if (!CONST_INT_P (XEXP (op0
, 1)) || !CONST_INT_P (XEXP (op1
, 1)))
13606 unsigned HOST_WIDE_INT shft_amnt_0
= UINTVAL (XEXP (op0
, 1));
13607 unsigned HOST_WIDE_INT shft_amnt_1
= UINTVAL (XEXP (op1
, 1));
13609 if (shft_amnt_0
< GET_MODE_BITSIZE (mode
)
13610 && shft_amnt_0
+ shft_amnt_1
== GET_MODE_BITSIZE (mode
))
13612 *res_op0
= XEXP (op0
, 0);
13613 *res_op1
= XEXP (op1
, 0);
/* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
   storing it in *COST.  Result is true if the total cost of the operation
   has now been calculated.  */
static bool
aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
{
  rtx inner;
  rtx comparator;
  enum rtx_code cmpcode;
  const struct cpu_cost_table *extra_cost
    = aarch64_tune_params.insn_extra_cost;

  if (COMPARISON_P (op0))
    {
      inner = XEXP (op0, 0);
      comparator = XEXP (op0, 1);
      cmpcode = GET_CODE (op0);
    }
  else
    {
      inner = op0;
      comparator = const0_rtx;
      cmpcode = NE;
    }

  if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
    {
      /* Conditional branch.  */
      if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
        return true;
      else
        {
          if (cmpcode == NE || cmpcode == EQ)
            {
              if (comparator == const0_rtx)
                {
                  /* TBZ/TBNZ/CBZ/CBNZ.  */
                  if (GET_CODE (inner) == ZERO_EXTRACT)
                    /* TBZ/TBNZ.  */
                    *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
                                       ZERO_EXTRACT, 0, speed);
                  else
                    /* CBZ/CBNZ.  */
                    *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);

                  return true;
                }
              if (register_operand (inner, VOIDmode)
                  && aarch64_imm24 (comparator, VOIDmode))
                {
                  /* SUB and SUBS.  */
                  *cost += COSTS_N_INSNS (2);
                  if (speed)
                    *cost += extra_cost->alu.arith * 2;
                  return true;
                }
            }
          else if (cmpcode == LT || cmpcode == GE)
            {
              /* TBZ/TBNZ.  */
              if (comparator == const0_rtx)
                return true;
            }
        }
    }
  else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
    {
      /* CCMP.  */
      if (GET_CODE (op1) == COMPARE)
        {
          /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0.  */
          if (XEXP (op1, 1) == const0_rtx)
            *cost += 1;
          if (speed)
            {
              machine_mode mode = GET_MODE (XEXP (op1, 0));

              if (GET_MODE_CLASS (mode) == MODE_INT)
                *cost += extra_cost->alu.arith;
              else
                *cost += extra_cost->fp[mode == DFmode].compare;
            }
          return true;
        }

      /* It's a conditional operation based on the status flags,
         so it must be some flavor of CSEL.  */

      /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
      if (GET_CODE (op1) == NEG
          || GET_CODE (op1) == NOT
          || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
        op1 = XEXP (op1, 0);
      else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
        {
          /* CSEL with zero-extension (*cmovdi_insn_uxtw).  */
          op1 = XEXP (op1, 0);
          op2 = XEXP (op2, 0);
        }
      else if (GET_CODE (op1) == ZERO_EXTEND && op2 == const0_rtx)
        {
          inner = XEXP (op1, 0);
          if (GET_CODE (inner) == NEG || GET_CODE (inner) == NOT)
            /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3).  */
            op1 = XEXP (inner, 0);
        }

      *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
      *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
      return true;
    }

  /* We don't know what this is, cost all operands.  */
  return false;
}
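
/* For example, (if_then_else (eq (reg:CC cc) (const_int 0))
                              (neg:DI x) (reg:DI y))
   corresponds to a single CSNEG, so the NEG above is stripped and only
   its operand is costed.  */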
/* Check whether X is a bitfield operation of the form shift + extend that
   maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
   operand to which the bitfield operation is applied.  Otherwise return
   NULL_RTX.  */

static rtx
aarch64_extend_bitfield_pattern_p (rtx x)
{
  rtx_code outer_code = GET_CODE (x);
  machine_mode outer_mode = GET_MODE (x);

  if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
      && outer_mode != SImode && outer_mode != DImode)
    return NULL_RTX;

  rtx inner = XEXP (x, 0);
  rtx_code inner_code = GET_CODE (inner);
  machine_mode inner_mode = GET_MODE (inner);
  rtx op = NULL_RTX;

  switch (inner_code)
    {
      case ASHIFT:
        if (CONST_INT_P (XEXP (inner, 1))
            && (inner_mode == QImode || inner_mode == HImode))
          op = XEXP (inner, 0);
        break;
      case LSHIFTRT:
        if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
            && (inner_mode == QImode || inner_mode == HImode))
          op = XEXP (inner, 0);
        break;
      case ASHIFTRT:
        if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
            && (inner_mode == QImode || inner_mode == HImode))
          op = XEXP (inner, 0);
        break;
      default:
        break;
    }

  return op;
}
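
/* For example, (zero_extend:SI (ashift:HI x (const_int 4))) corresponds
   to a UBFIZ, while (sign_extend:SI (ashiftrt:HI x (const_int 4)))
   corresponds to an SBFX; in both cases the function returns X.  */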
/* Return true if the mask and a shift amount from an RTX of the form
   (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
   mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */

bool
aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
                                    rtx shft_amnt)
{
  return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
         && INTVAL (mask) > 0
         && UINTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
         && exact_log2 ((UINTVAL (mask) >> UINTVAL (shft_amnt)) + 1) >= 0
         && (UINTVAL (mask)
             & ((HOST_WIDE_INT_1U << UINTVAL (shft_amnt)) - 1)) == 0;
}
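
/* For example, with MODE == SImode, SHFT_AMNT == 8 and MASK == 0xff00
   the predicate holds: 0xff00 >> 8 == 0xff == 2^8 - 1 and the low
   8 bits of the mask are clear, so (x << 8) & 0xff00 is a UBFIZ of an
   8-bit field placed at bit 8.  */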
/* Return true if the masks and a shift amount from an RTX of the form
   ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
   a BFI instruction of mode MODE.  See the *aarch64_bfi patterns.  */

bool
aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
                                   unsigned HOST_WIDE_INT mask1,
                                   unsigned HOST_WIDE_INT shft_amnt,
                                   unsigned HOST_WIDE_INT mask2)
{
  unsigned HOST_WIDE_INT t;

  /* Verify that there is no overlap in what bits are set in the two masks.  */
  if (mask1 != ~mask2)
    return false;

  /* Verify that mask2 is not all zeros or ones.  */
  if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
    return false;

  /* The shift amount should always be less than the mode size.  */
  gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));

  /* Verify that the mask being shifted is contiguous and would be in the
     least significant bits after shifting by shft_amnt.  */
  t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
  return (t == (t & -t));
}
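
/* For example, MASK2 == 0xff00, SHFT_AMNT == 8 and MASK1 == ~0xff00
   satisfy the checks: the masks do not overlap and
   0xff00 + (1 << 8) == 0x10000 is a power of two, so the shifted mask
   is contiguous and sits at the bottom of the inserted field.  */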
/* Calculate the cost of calculating X, storing it in *COST.  Result
   is true if the total cost of the operation has now been calculated.  */
static bool
aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
                   int param ATTRIBUTE_UNUSED, int *cost, bool speed)
{
  rtx op0, op1, op2;
  const struct cpu_cost_table *extra_cost
    = aarch64_tune_params.insn_extra_cost;
  rtx_code code = GET_CODE (x);
  scalar_int_mode int_mode;

  /* By default, assume that everything has equivalent cost to the
     cheapest instruction.  Any additional costs are applied as a delta
     above this default.  */
  *cost = COSTS_N_INSNS (1);

  switch (code)
    {
    case SET:
      /* The cost depends entirely on the operands to SET.  */
      *cost = 0;
      op0 = SET_DEST (x);
      op1 = SET_SRC (x);

      switch (GET_CODE (op0))
        {
        case MEM:
          if (speed)
            {
              rtx address = XEXP (op0, 0);
              if (VECTOR_MODE_P (mode))
                *cost += extra_cost->ldst.storev;
              else if (GET_MODE_CLASS (mode) == MODE_INT)
                *cost += extra_cost->ldst.store;
              else if (mode == SFmode || mode == SDmode)
                *cost += extra_cost->ldst.storef;
              else if (mode == DFmode || mode == DDmode)
                *cost += extra_cost->ldst.stored;

              *cost +=
                COSTS_N_INSNS (aarch64_address_cost (address, mode,
                                                     0, speed));
            }

          *cost += rtx_cost (op1, mode, SET, 1, speed);
          return true;

        case SUBREG:
          if (! REG_P (SUBREG_REG (op0)))
            *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);

          /* Fall through.  */
        case REG:
          /* The cost is one per vector-register copied.  */
          if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
            {
              int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
              *cost = COSTS_N_INSNS (nregs);
            }
          /* const0_rtx is in general free, but we will use an
             instruction to set a register to 0.  */
          else if (REG_P (op1) || op1 == const0_rtx)
            {
              /* The cost is 1 per register copied.  */
              int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
              *cost = COSTS_N_INSNS (nregs);
            }
          else
            /* Cost is just the cost of the RHS of the set.  */
            *cost += rtx_cost (op1, mode, SET, 1, speed);
          return true;
        case ZERO_EXTRACT:
        case SIGN_EXTRACT:
          /* Bit-field insertion.  Strip any redundant widening of
             the RHS to meet the width of the target.  */
          if (SUBREG_P (op1))
            op1 = SUBREG_REG (op1);
          if ((GET_CODE (op1) == ZERO_EXTEND
               || GET_CODE (op1) == SIGN_EXTEND)
              && CONST_INT_P (XEXP (op0, 1))
              && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
              && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
            op1 = XEXP (op1, 0);

          if (CONST_INT_P (op1))
            {
              /* MOV immediate is assumed to always be cheap.  */
              *cost = COSTS_N_INSNS (1);
            }
          else
            {
              /* BFM.  */
              if (speed)
                *cost += extra_cost->alu.bfi;
              *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
            }

          return true;

        default:
          /* We can't make sense of this, assume default cost.  */
          *cost = COSTS_N_INSNS (1);
          return false;
        }
      return false;
    case CONST_INT:
      /* If an instruction can incorporate a constant within the
         instruction, the instruction's expression avoids calling
         rtx_cost() on the constant.  If rtx_cost() is called on a
         constant, then it is usually because the constant must be
         moved into a register by one or more instructions.

         The exception is constant 0, which can be expressed
         as XZR/WZR and is therefore free.  The exception to this is
         if we have (set (reg) (const0_rtx)) in which case we must cost
         the move.  However, we can catch that when we cost the SET, so
         we don't need to consider that here.  */
      if (x == const0_rtx)
        *cost = 0;
      else
        {
          /* To an approximation, building any other constant is
             proportionally expensive to the number of instructions
             required to build that constant.  This is true whether we
             are compiling for SPEED or otherwise.  */
          machine_mode imode = known_le (GET_MODE_SIZE (mode), 4)
                                ? SImode : DImode;
          *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
                                 (NULL_RTX, x, false, imode));
        }
      return true;
    case CONST_DOUBLE:
      /* First determine number of instructions to do the move
         as an integer constant.  */
      if (!aarch64_float_const_representable_p (x)
          && !aarch64_can_const_movi_rtx_p (x, mode)
          && aarch64_float_const_rtx_p (x))
        {
          unsigned HOST_WIDE_INT ival;
          bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
          gcc_assert (succeed);

          machine_mode imode = known_eq (GET_MODE_SIZE (mode), 8)
                                ? DImode : SImode;
          int ncost = aarch64_internal_mov_immediate
                (NULL_RTX, gen_int_mode (ival, imode), false, imode);
          *cost += COSTS_N_INSNS (ncost);
          return true;
        }

      if (speed)
        {
          /* mov[df,sf]_aarch64.  */
          if (aarch64_float_const_representable_p (x))
            /* FMOV (scalar immediate).  */
            *cost += extra_cost->fp[mode == DFmode || mode == DDmode].fpconst;
          else if (!aarch64_float_const_zero_rtx_p (x))
            {
              /* This will be a load from memory.  */
              if (mode == DFmode || mode == DDmode)
                *cost += extra_cost->ldst.loadd;
              else
                *cost += extra_cost->ldst.loadf;
            }
          else
            /* Otherwise this is +0.0.  We get this using MOVI d0, #0
               or MOV v0.s[0], wzr - neither of which are modeled by the
               cost tables.  Just use the default cost.  */
            ;
        }
      return true;
    case MEM:
      if (speed)
        {
          /* For loads we want the base cost of a load, plus an
             approximation for the additional cost of the addressing
             mode.  */
          rtx address = XEXP (x, 0);
          if (VECTOR_MODE_P (mode))
            *cost += extra_cost->ldst.loadv;
          else if (GET_MODE_CLASS (mode) == MODE_INT)
            *cost += extra_cost->ldst.load;
          else if (mode == SFmode || mode == SDmode)
            *cost += extra_cost->ldst.loadf;
          else if (mode == DFmode || mode == DDmode)
            *cost += extra_cost->ldst.loadd;

          *cost +=
            COSTS_N_INSNS (aarch64_address_cost (address, mode,
                                                 0, speed));
        }

      return true;
    case NEG:
      op0 = XEXP (x, 0);

      if (VECTOR_MODE_P (mode))
        {
          if (speed)
            /* FNEG.  */
            *cost += extra_cost->vect.alu;
          return false;
        }

      if (GET_MODE_CLASS (mode) == MODE_INT)
        {
          if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
              || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
            {
              /* CSETM.  */
              *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
              return true;
            }

          /* Cost this as SUB wzr, X.  */
          op0 = CONST0_RTX (mode);
          op1 = XEXP (x, 0);
          goto cost_minus;
        }

      if (GET_MODE_CLASS (mode) == MODE_FLOAT)
        {
          /* Support (neg(fma...)) as a single instruction only if
             sign of zeros is unimportant.  This matches the decision
             making in aarch64.md.  */
          if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
            {
              /* FNMADD.  */
              *cost = rtx_cost (op0, mode, NEG, 0, speed);
              return true;
            }
          if (GET_CODE (op0) == MULT)
            {
              /* FNMUL.  */
              *cost = rtx_cost (op0, mode, NEG, 0, speed);
              return true;
            }
          if (speed)
            /* FNEG.  */
            *cost += extra_cost->fp[mode == DFmode].neg;
          return false;
        }

      return false;

    case CLRSB:
    case CLZ:
      if (speed)
        {
          if (VECTOR_MODE_P (mode))
            *cost += extra_cost->vect.alu;
          else
            *cost += extra_cost->alu.clz;
        }

      return false;

    case CTZ:
      *cost = COSTS_N_INSNS (2);

      if (speed)
        *cost += extra_cost->alu.clz + extra_cost->alu.rev;
      return false;
    case COMPARE:
      op0 = XEXP (x, 0);
      op1 = XEXP (x, 1);

      if (op1 == const0_rtx
          && GET_CODE (op0) == AND)
        {
          x = op0;
          mode = GET_MODE (op0);
          goto cost_logic;
        }

      if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
        {
          /* TODO: A write to the CC flags possibly costs extra, this
             needs encoding in the cost tables.  */

          mode = GET_MODE (op0);
          /* ANDS.  */
          if (GET_CODE (op0) == AND)
            {
              x = op0;
              goto cost_logic;
            }

          if (GET_CODE (op0) == PLUS)
            {
              /* ADDS (and CMN alias).  */
              x = op0;
              goto cost_plus;
            }

          if (GET_CODE (op0) == MINUS)
            {
              /* SUBS.  */
              x = op0;
              goto cost_minus;
            }

          if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
              && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
              && CONST_INT_P (XEXP (op0, 2)))
            {
              /* COMPARE of ZERO_EXTRACT form of TST-immediate.
                 Handle it here directly rather than going to cost_logic
                 since we know the immediate generated for the TST is valid
                 so we can avoid creating an intermediate rtx for it only
                 for costing purposes.  */
              if (speed)
                *cost += extra_cost->alu.logical;

              *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
                                 ZERO_EXTRACT, 0, speed);
              return true;
            }

          if (GET_CODE (op1) == NEG)
            {
              /* CMN.  */
              if (speed)
                *cost += extra_cost->alu.arith;

              *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
              *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
              return true;
            }

          /* CMP.

             Compare can freely swap the order of operands, and
             canonicalization puts the more complex operation first.
             But the integer MINUS logic expects the shift/extend
             operation in op1.  */
          if (! (REG_P (op0)
                 || (SUBREG_P (op0) && REG_P (SUBREG_REG (op0)))))
            {
              op0 = XEXP (x, 0);
              op1 = XEXP (x, 1);
              goto cost_minus;
            }

          return false;
        }

      if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
        {
          /* FCMP.  */
          if (speed)
            *cost += extra_cost->fp[mode == DFmode].compare;

          if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
            {
              *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
              /* FCMP supports constant 0.0 for no extra cost. */
              return true;
            }
          return false;
        }

      if (VECTOR_MODE_P (mode))
        {
          /* Vector compare.  */
          if (speed)
            *cost += extra_cost->vect.alu;

          if (aarch64_float_const_zero_rtx_p (op1))
            {
              /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
                 cost.  */
              return true;
            }
          return false;
        }
      return false;
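
    /* For example, (compare (zero_extract:SI x (const_int 3) (const_int 8))
                             (const_int 0))
       in CC_NZmode is costed directly above as a TST immediate, while
       (compare (and:SI x (const_int 7)) (const_int 0)) is routed through
       the cost_logic handling for AND.  */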
    case MINUS:
      {
        op0 = XEXP (x, 0);
        op1 = XEXP (x, 1);

cost_minus:
        if (VECTOR_MODE_P (mode))
          {
            /* SUBL2 and SUBW2.  */
            unsigned int vec_flags = aarch64_classify_vector_mode (mode);
            if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
              {
                /* The select-operand-high-half versions of the sub instruction
                   have the same cost as the regular three vector version -
                   don't add the costs of the select into the costs of the sub.
                   */
                op0 = aarch64_strip_extend_vec_half (op0);
                op1 = aarch64_strip_extend_vec_half (op1);
              }
          }

        *cost += rtx_cost (op0, mode, MINUS, 0, speed);

        /* Detect valid immediates.  */
        if ((GET_MODE_CLASS (mode) == MODE_INT
             || (GET_MODE_CLASS (mode) == MODE_CC
                 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
            && CONST_INT_P (op1)
            && aarch64_uimm12_shift (INTVAL (op1)))
          {
            if (speed)
              /* SUB(S) (immediate).  */
              *cost += extra_cost->alu.arith;
            return true;
          }

        /* Look for SUB (extended register).  */
        if (is_a <scalar_int_mode> (mode)
            && aarch64_rtx_arith_op_extract_p (op1))
          {
            if (speed)
              *cost += extra_cost->alu.extend_arith;

            op1 = aarch64_strip_extend (op1, true);
            *cost += rtx_cost (op1, VOIDmode,
                               (enum rtx_code) GET_CODE (op1), 0, speed);
            return true;
          }

        rtx new_op1 = aarch64_strip_extend (op1, false);

        /* Cost this as an FMA-alike operation.  */
        if ((GET_CODE (new_op1) == MULT
             || aarch64_shift_p (GET_CODE (new_op1)))
            && code != COMPARE)
          {
            *cost += aarch64_rtx_mult_cost (new_op1, MULT,
                                            (enum rtx_code) code,
                                            speed);
            return true;
          }

        *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);

        if (speed)
          {
            if (VECTOR_MODE_P (mode))
              /* Vector SUB.  */
              *cost += extra_cost->vect.alu;
            else if (GET_MODE_CLASS (mode) == MODE_INT)
              /* SUB(S).  */
              *cost += extra_cost->alu.arith;
            else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
              /* FSUB.  */
              *cost += extra_cost->fp[mode == DFmode].addsub;
          }
        return true;
      }
    case PLUS:
      {
        rtx new_op0;

        op0 = XEXP (x, 0);
        op1 = XEXP (x, 1);

cost_plus:
        if (VECTOR_MODE_P (mode))
          {
            /* ADDL2 and ADDW2.  */
            unsigned int vec_flags = aarch64_classify_vector_mode (mode);
            if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
              {
                /* The select-operand-high-half versions of the add instruction
                   have the same cost as the regular three vector version -
                   don't add the costs of the select into the costs of the add.
                   */
                op0 = aarch64_strip_extend_vec_half (op0);
                op1 = aarch64_strip_extend_vec_half (op1);
              }
          }

        if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
            || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
          {
            /* CSINC.  */
            *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
            *cost += rtx_cost (op1, mode, PLUS, 1, speed);
            return true;
          }

        if (GET_MODE_CLASS (mode) == MODE_INT
            && (aarch64_plus_immediate (op1, mode)
                || aarch64_sve_addvl_addpl_immediate (op1, mode)))
          {
            *cost += rtx_cost (op0, mode, PLUS, 0, speed);

            if (speed)
              {
                /* ADD (immediate).  */
                *cost += extra_cost->alu.arith;

                /* Some tunings prefer to not use the VL-based scalar ops.
                   Increase the cost of the poly immediate to prevent their
                   formation.  */
                if (GET_CODE (op1) == CONST_POLY_INT
                    && (aarch64_tune_params.extra_tuning_flags
                        & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS))
                  *cost += COSTS_N_INSNS (1);
              }
            return true;
          }

        if (aarch64_pluslong_immediate (op1, mode))
          {
            /* 24-bit add in 2 instructions or 12-bit shifted add.  */
            if ((INTVAL (op1) & 0xfff) != 0)
              *cost += COSTS_N_INSNS (1);

            *cost += rtx_cost (op0, mode, PLUS, 0, speed);
            return true;
          }

        *cost += rtx_cost (op1, mode, PLUS, 1, speed);

        /* Look for ADD (extended register).  */
        if (is_a <scalar_int_mode> (mode)
            && aarch64_rtx_arith_op_extract_p (op0))
          {
            if (speed)
              *cost += extra_cost->alu.extend_arith;

            op0 = aarch64_strip_extend (op0, true);
            *cost += rtx_cost (op0, VOIDmode,
                               (enum rtx_code) GET_CODE (op0), 0, speed);
            return true;
          }

        /* Strip any extend, leave shifts behind as we will
           cost them through mult_cost.  */
        new_op0 = aarch64_strip_extend (op0, false);

        if (GET_CODE (new_op0) == MULT
            || aarch64_shift_p (GET_CODE (new_op0)))
          {
            *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
                                            speed);
            return true;
          }

        *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);

        if (speed)
          {
            if (VECTOR_MODE_P (mode))
              /* Vector ADD.  */
              *cost += extra_cost->vect.alu;
            else if (GET_MODE_CLASS (mode) == MODE_INT)
              /* ADD.  */
              *cost += extra_cost->alu.arith;
            else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
              /* FADD.  */
              *cost += extra_cost->fp[mode == DFmode].addsub;
          }
        return true;
      }
    case BSWAP:
      *cost = COSTS_N_INSNS (1);

      if (speed)
        {
          if (VECTOR_MODE_P (mode))
            *cost += extra_cost->vect.alu;
          else
            *cost += extra_cost->alu.rev;
        }
      return false;

    case IOR:
      if (aarch_rev16_p (x))
        {
          *cost = COSTS_N_INSNS (1);

          if (speed)
            {
              if (VECTOR_MODE_P (mode))
                *cost += extra_cost->vect.alu;
              else
                *cost += extra_cost->alu.rev;
            }
          return true;
        }

      if (aarch64_extr_rtx_p (x, &op0, &op1))
        {
          *cost += rtx_cost (op0, mode, IOR, 0, speed);
          *cost += rtx_cost (op1, mode, IOR, 1, speed);
          if (speed)
            *cost += extra_cost->alu.shift;

          return true;
        }
    /* Fall through.  */
    case XOR:
    case AND:
    cost_logic:
      op0 = XEXP (x, 0);
      op1 = XEXP (x, 1);

      if (VECTOR_MODE_P (mode))
        {
          if (speed)
            *cost += extra_cost->vect.alu;
          return true;
        }

      if (code == AND
          && GET_CODE (op0) == MULT
          && CONST_INT_P (XEXP (op0, 1))
          && CONST_INT_P (op1)
          && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
                               INTVAL (op1)) != 0)
        {
          /* This is a UBFM/SBFM.  */
          *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
          if (speed)
            *cost += extra_cost->alu.bfx;
          return true;
        }

      if (is_int_mode (mode, &int_mode))
        {
          if (CONST_INT_P (op1))
            {
              /* We have a mask + shift version of a UBFIZ
                 i.e. the *andim_ashift<mode>_bfiz pattern.  */
              if (GET_CODE (op0) == ASHIFT
                  && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
                                                         XEXP (op0, 1)))
                {
                  *cost += rtx_cost (XEXP (op0, 0), int_mode,
                                     (enum rtx_code) code, 0, speed);
                  if (speed)
                    *cost += extra_cost->alu.bfx;

                  return true;
                }
              else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
                {
                  /* We possibly get the immediate for free, this is not
                     modelled.  */
                  *cost += rtx_cost (op0, int_mode,
                                     (enum rtx_code) code, 0, speed);
                  if (speed)
                    *cost += extra_cost->alu.logical;

                  return true;
                }
            }
          else
            {
              rtx new_op0 = op0;

              /* Handle ORN, EON, or BIC.  */
              if (GET_CODE (op0) == NOT)
                op0 = XEXP (op0, 0);

              new_op0 = aarch64_strip_shift (op0);

              /* If we had a shift on op0 then this is a logical-shift-
                 by-register/immediate operation.  Otherwise, this is just
                 a logical operation.  */
              if (speed)
                {
                  if (new_op0 != op0)
                    {
                      /* Shift by immediate.  */
                      if (CONST_INT_P (XEXP (op0, 1)))
                        *cost += extra_cost->alu.log_shift;
                      else
                        *cost += extra_cost->alu.log_shift_reg;
                    }
                  else
                    *cost += extra_cost->alu.logical;
                }

              /* In both cases we want to cost both operands.  */
              *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
                                 0, speed);
              *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
                                 1, speed);

              return true;
            }
        }
      return false;
    case NOT:
      x = XEXP (x, 0);
      op0 = aarch64_strip_shift (x);

      if (VECTOR_MODE_P (mode))
        {
          /* Vector NOT.  */
          *cost += extra_cost->vect.alu;
          return false;
        }

      /* MVN-shifted-reg.  */
      if (op0 != x)
        {
          *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);

          if (speed)
            *cost += extra_cost->alu.log_shift;

          return true;
        }
      /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
         Handle the second form here taking care that 'a' in the above can
         be a shift.  */
      else if (GET_CODE (op0) == XOR)
        {
          rtx newop0 = XEXP (op0, 0);
          rtx newop1 = XEXP (op0, 1);
          rtx op0_stripped = aarch64_strip_shift (newop0);

          *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
          *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);

          if (speed)
            {
              if (op0_stripped != newop0)
                *cost += extra_cost->alu.log_shift;
              else
                *cost += extra_cost->alu.logical;
            }

          return true;
        }
      /* MVN.  */
      if (speed)
        *cost += extra_cost->alu.logical;

      return false;
    case ZERO_EXTEND:
      op0 = XEXP (x, 0);
      /* If a value is written in SI mode, then zero extended to DI
         mode, the operation will in general be free as a write to
         a 'w' register implicitly zeroes the upper bits of an 'x'
         register.  However, if this is

           (set (reg) (zero_extend (reg)))

         we must cost the explicit register move.  */
      if (mode == DImode
          && GET_MODE (op0) == SImode)
        {
          int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);

          /* If OP_COST is non-zero, then the cost of the zero extend
             is effectively the cost of the inner operation.  Otherwise
             we have a MOV instruction and we take the cost from the MOV
             itself.  This is true independently of whether we are
             optimizing for space or time.  */
          if (op_cost)
            *cost = op_cost;

          return true;
        }
      else if (MEM_P (op0))
        {
          /* All loads can zero extend to any size for free.  */
          *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
          return true;
        }

      op0 = aarch64_extend_bitfield_pattern_p (x);
      if (op0)
        {
          *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
          if (speed)
            *cost += extra_cost->alu.bfx;
          return true;
        }

      if (speed)
        {
          if (VECTOR_MODE_P (mode))
            /* UMOV.  */
            *cost += extra_cost->vect.alu;
          else
            /* We generate an AND instead of UXTB/UXTH.  */
            *cost += extra_cost->alu.logical;
        }
      return false;

    case SIGN_EXTEND:
      if (MEM_P (XEXP (x, 0)))
        {
          /* LDRSH.  */
          if (speed)
            {
              rtx address = XEXP (XEXP (x, 0), 0);
              *cost += extra_cost->ldst.load_sign_extend;

              *cost +=
                COSTS_N_INSNS (aarch64_address_cost (address, mode,
                                                     0, speed));
            }
          return true;
        }

      op0 = aarch64_extend_bitfield_pattern_p (x);
      if (op0)
        {
          *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
          if (speed)
            *cost += extra_cost->alu.bfx;
          return true;
        }

      if (speed)
        {
          if (VECTOR_MODE_P (mode))
            *cost += extra_cost->vect.alu;
          else
            *cost += extra_cost->alu.extend;
        }
      return false;
    case ASHIFT:
      op0 = XEXP (x, 0);
      op1 = XEXP (x, 1);

      if (CONST_INT_P (op1))
        {
          if (speed)
            {
              if (VECTOR_MODE_P (mode))
                /* Vector shift (immediate).  */
                *cost += extra_cost->vect.alu;
              else
                /* LSL (immediate), UBFM, UBFIZ and friends.  These are all
                   aliases.  */
                *cost += extra_cost->alu.shift;
            }

          /* We can incorporate zero/sign extend for free.  */
          if (GET_CODE (op0) == ZERO_EXTEND
              || GET_CODE (op0) == SIGN_EXTEND)
            op0 = XEXP (op0, 0);

          *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
          return true;
        }
      else
        {
          if (VECTOR_MODE_P (mode))
            {
              if (speed)
                /* Vector shift (register).  */
                *cost += extra_cost->vect.alu;
            }
          else
            {
              if (speed)
                /* LSLV.  */
                *cost += extra_cost->alu.shift_reg;

              if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
                  && CONST_INT_P (XEXP (op1, 1))
                  && known_eq (INTVAL (XEXP (op1, 1)),
                               GET_MODE_BITSIZE (mode) - 1))
                {
                  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
                  /* We already demanded XEXP (op1, 0) to be REG_P, so
                     don't recurse into it.  */
                  return true;
                }
            }
          return false;  /* All arguments need to be in registers.  */
        }

    case ROTATE:
    case ROTATERT:
    case LSHIFTRT:
    case ASHIFTRT:
      op0 = XEXP (x, 0);
      op1 = XEXP (x, 1);

      if (CONST_INT_P (op1))
        {
          /* ASR (immediate) and friends.  */
          if (speed)
            {
              if (VECTOR_MODE_P (mode))
                *cost += extra_cost->vect.alu;
              else
                *cost += extra_cost->alu.shift;
            }

          *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
          return true;
        }
      else
        {
          if (VECTOR_MODE_P (mode))
            {
              if (speed)
                /* Vector shift (register).  */
                *cost += extra_cost->vect.alu;
            }
          else
            {
              if (speed)
                /* ASR (register) and friends.  */
                *cost += extra_cost->alu.shift_reg;

              if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
                  && CONST_INT_P (XEXP (op1, 1))
                  && known_eq (INTVAL (XEXP (op1, 1)),
                               GET_MODE_BITSIZE (mode) - 1))
                {
                  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
                  /* We already demanded XEXP (op1, 0) to be REG_P, so
                     don't recurse into it.  */
                  return true;
                }
            }
          return false;  /* All arguments need to be in registers.  */
        }
    case SYMBOL_REF:
      if (aarch64_cmodel == AARCH64_CMODEL_LARGE
          || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
        {
          /* LDR.  */
          if (speed)
            *cost += extra_cost->ldst.load;
        }
      else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
               || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
        {
          /* ADRP, followed by ADD.  */
          *cost += COSTS_N_INSNS (1);
          if (speed)
            *cost += 2 * extra_cost->alu.arith;
        }
      else if (aarch64_cmodel == AARCH64_CMODEL_TINY
               || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
        {
          /* ADR.  */
          if (speed)
            *cost += extra_cost->alu.arith;
        }

      if (flag_pic)
        {
          /* One extra load instruction, after accessing the GOT.  */
          *cost += COSTS_N_INSNS (1);
          if (speed)
            *cost += extra_cost->ldst.load;
        }
      return true;

    case HIGH:
    case LO_SUM:
      /* ADRP/ADD (immediate).  */
      if (speed)
        *cost += extra_cost->alu.arith;
      return true;

    case ZERO_EXTRACT:
    case SIGN_EXTRACT:
      /* UBFX/SBFX.  */
      if (speed)
        {
          if (VECTOR_MODE_P (mode))
            *cost += extra_cost->vect.alu;
          else
            *cost += extra_cost->alu.bfx;
        }

      /* We can trust that the immediates used will be correct (there
         are no by-register forms), so we need only cost op0.  */
      *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
      return true;
    case MULT:
      *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
      /* aarch64_rtx_mult_cost always handles recursion to its
         operands.  */
      return true;

    case MOD:
      /* We can expand signed mod by power of 2 using a NEGS, two parallel
         ANDs and a CSNEG.  Assume here that CSNEG is the same as the cost of
         an unconditional negate.  This case should only ever be reached through
         the set_smod_pow2_cheap check in expmed.cc.  */
      if (CONST_INT_P (XEXP (x, 1))
          && exact_log2 (INTVAL (XEXP (x, 1))) > 0
          && (mode == SImode || mode == DImode))
        {
          /* We expand to 4 instructions.  Reset the baseline.  */
          *cost = COSTS_N_INSNS (4);

          if (speed)
            *cost += 2 * extra_cost->alu.logical
                     + 2 * extra_cost->alu.arith;

          return true;
        }

    /* Fall-through.  */
    case UMOD:
      if (speed)
        {
          /* Slightly prefer UMOD over SMOD.  */
          if (VECTOR_MODE_P (mode))
            *cost += extra_cost->vect.alu;
          else if (GET_MODE_CLASS (mode) == MODE_INT)
            *cost += (extra_cost->mult[mode == DImode].add
                      + extra_cost->mult[mode == DImode].idiv
                      + (code == MOD ? 1 : 0));
        }
      return false;  /* All arguments need to be in registers.  */

    case DIV:
    case UDIV:
    case SQRT:
      if (speed)
        {
          if (VECTOR_MODE_P (mode))
            *cost += extra_cost->vect.alu;
          else if (GET_MODE_CLASS (mode) == MODE_INT)
            /* There is no integer SQRT, so only DIV and UDIV can get
               here.  */
            *cost += (extra_cost->mult[mode == DImode].idiv
                      /* Slightly prefer UDIV over SDIV.  */
                      + (code == DIV ? 1 : 0));
          else
            *cost += extra_cost->fp[mode == DFmode].div;
        }
      return false;  /* All arguments need to be in registers.  */

    case IF_THEN_ELSE:
      return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
                                         XEXP (x, 2), cost, speed);
    case EQ:
    case NE:
    case GT:
    case GTU:
    case LT:
    case LTU:
    case GE:
    case GEU:
    case LE:
    case LEU:
      return false; /* All arguments must be in registers.  */

    case FMA:
      op0 = XEXP (x, 0);
      op1 = XEXP (x, 1);
      op2 = XEXP (x, 2);

      if (speed)
        {
          if (VECTOR_MODE_P (mode))
            *cost += extra_cost->vect.alu;
          else
            *cost += extra_cost->fp[mode == DFmode].fma;
        }

      /* FMSUB, FNMADD, and FNMSUB are free.  */
      if (GET_CODE (op0) == NEG)
        op0 = XEXP (op0, 0);

      if (GET_CODE (op2) == NEG)
        op2 = XEXP (op2, 0);

      /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
         and the by-element operand as operand 0.  */
      if (GET_CODE (op1) == NEG)
        op1 = XEXP (op1, 0);

      /* Catch vector-by-element operations.  The by-element operand can
         either be (vec_duplicate (vec_select (x))) or just
         (vec_select (x)), depending on whether we are multiplying by
         a vector or a scalar.

         Canonicalization is not very good in these cases, FMA4 will put the
         by-element operand as operand 0, FNMA4 will have it as operand 1.  */
      if (GET_CODE (op0) == VEC_DUPLICATE)
        op0 = XEXP (op0, 0);
      else if (GET_CODE (op1) == VEC_DUPLICATE)
        op1 = XEXP (op1, 0);

      if (GET_CODE (op0) == VEC_SELECT)
        op0 = XEXP (op0, 0);
      else if (GET_CODE (op1) == VEC_SELECT)
        op1 = XEXP (op1, 0);

      /* If the remaining parameters are not registers,
         get the cost to put them into registers.  */
      *cost += rtx_cost (op0, mode, FMA, 0, speed);
      *cost += rtx_cost (op1, mode, FMA, 1, speed);
      *cost += rtx_cost (op2, mode, FMA, 2, speed);
      return true;
    case FLOAT:
    case UNSIGNED_FLOAT:
      if (speed)
        *cost += extra_cost->fp[mode == DFmode].fromint;
      return false;

    case FLOAT_EXTEND:
      if (speed)
        {
          if (VECTOR_MODE_P (mode))
            /* Vector truncate.  */
            *cost += extra_cost->vect.alu;
          else
            *cost += extra_cost->fp[mode == DFmode].widen;
        }
      return false;

    case FLOAT_TRUNCATE:
      if (speed)
        {
          if (VECTOR_MODE_P (mode))
            /* Vector conversion.  */
            *cost += extra_cost->vect.alu;
          else
            *cost += extra_cost->fp[mode == DFmode].narrow;
        }
      return false;

    case FIX:
    case UNSIGNED_FIX:
      x = XEXP (x, 0);
      /* Strip the rounding part.  They will all be implemented
         by the fcvt* family of instructions anyway.  */
      if (GET_CODE (x) == UNSPEC)
        {
          unsigned int uns_code = XINT (x, 1);

          if (uns_code == UNSPEC_FRINTA
              || uns_code == UNSPEC_FRINTM
              || uns_code == UNSPEC_FRINTN
              || uns_code == UNSPEC_FRINTP
              || uns_code == UNSPEC_FRINTZ)
            x = XVECEXP (x, 0, 0);
        }

      if (speed)
        {
          if (VECTOR_MODE_P (mode))
            *cost += extra_cost->vect.alu;
          else
            *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
        }

      /* We can combine fmul by a power of 2 followed by a fcvt into a single
         fixed-point fcvt.  */
      if (GET_CODE (x) == MULT
          && ((VECTOR_MODE_P (mode)
               && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
              || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
        {
          *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
                             0, speed);
          return true;
        }

      *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
      return true;
    case ABS:
      if (VECTOR_MODE_P (mode))
        {
          /* ABS (vector).  */
          if (speed)
            *cost += extra_cost->vect.alu;
        }
      else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
        {
          op0 = XEXP (x, 0);

          /* FABD, which is analogous to FADD.  */
          if (GET_CODE (op0) == MINUS)
            {
              *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
              *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
              if (speed)
                *cost += extra_cost->fp[mode == DFmode].addsub;

              return true;
            }
          /* Simple FABS is analogous to FNEG.  */
          if (speed)
            *cost += extra_cost->fp[mode == DFmode].neg;
        }
      else
        {
          /* Integer ABS will either be split to
             two arithmetic instructions, or will be an ABS
             (scalar), which we don't model.  */
          *cost = COSTS_N_INSNS (2);
          if (speed)
            *cost += 2 * extra_cost->alu.arith;
        }
      return false;

    case SMAX:
    case SMIN:
      if (speed)
        {
          if (VECTOR_MODE_P (mode))
            *cost += extra_cost->vect.alu;
          else
            {
              /* FMAXNM/FMINNM/FMAX/FMIN.
                 TODO: This may not be accurate for all implementations, but
                 we do not model this in the cost tables.  */
              *cost += extra_cost->fp[mode == DFmode].addsub;
            }
        }
      return false;

    case UNSPEC:
      /* The floating point round to integer frint* instructions.  */
      if (aarch64_frint_unspec_p (XINT (x, 1)))
        {
          if (speed)
            *cost += extra_cost->fp[mode == DFmode].roundint;

          return false;
        }

      if (XINT (x, 1) == UNSPEC_RBIT)
        {
          if (speed)
            *cost += extra_cost->alu.rev;

          return false;
        }
      break;
    case TRUNCATE:
      /* Decompose <su>muldi3_highpart.  */
      if (/* (truncate:DI  */
          mode == DImode
          /*   (lshiftrt:TI  */
          && GET_MODE (XEXP (x, 0)) == TImode
          && GET_CODE (XEXP (x, 0)) == LSHIFTRT
          /*      (mult:TI  */
          && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
          /*        (ANY_EXTEND:TI (reg:DI))
                    (ANY_EXTEND:TI (reg:DI)))  */
          && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
               && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
              || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
                  && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
          && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
          && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
          /* (const_int 64)  */
          && CONST_INT_P (XEXP (XEXP (x, 0), 1))
          && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
        {
          /* UMULH/SMULH.  */
          if (speed)
            *cost += extra_cost->mult[mode == DImode].extend;
          *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
                             mode, MULT, 0, speed);
          *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
                             mode, MULT, 1, speed);
          return true;
        }
      break;
    case CONST_VECTOR:
      {
        /* Load using MOVI/MVNI.  */
        if (aarch64_simd_valid_immediate (x, NULL))
          *cost = extra_cost->vect.movi;
        else /* Load using constant pool.  */
          *cost = extra_cost->ldst.load;
        break;
      }
    case VEC_CONCAT:
      /* depending on the operation, either DUP or INS.
         For now, keep default costing.  */
      break;
    case VEC_DUPLICATE:
      /* Load using a DUP.  */
      *cost = extra_cost->vect.dup;
      return false;
    case VEC_SELECT:
      {
        rtx op0 = XEXP (x, 0);
        *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);

        /* cost subreg of 0 as free, otherwise as DUP */
        rtx op1 = XEXP (x, 1);
        if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
          ;
        else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
          *cost = extra_cost->vect.dup;
        else
          *cost = extra_cost->vect.extract;
        break;
      }
    default:
      break;
    }

  if (dump_file
      && flag_aarch64_verbose_cost)
    fprintf (dump_file,
             "\nFailed to cost RTX. Assuming default cost.\n");

  return true;
}
/* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
   calculated for X.  This cost is stored in *COST.  Returns true
   if the total cost of X was calculated.  */
static bool
aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
                           int param, int *cost, bool speed)
{
  bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);

  if (dump_file
      && flag_aarch64_verbose_cost)
    {
      print_rtl_single (dump_file, x);
      fprintf (dump_file, "\n%s cost: %d (%s)\n",
               speed ? "Hot" : "Cold",
               *cost, result ? "final" : "partial");
    }

  return result;
}
/* Implement TARGET_REGISTER_MOVE_COST.  */
static int
aarch64_register_move_cost (machine_mode mode,
                            reg_class_t from_i, reg_class_t to_i)
{
  enum reg_class from = (enum reg_class) from_i;
  enum reg_class to = (enum reg_class) to_i;
  const struct cpu_regmove_cost *regmove_cost
    = aarch64_tune_params.regmove_cost;

  /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
  if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS
      || to == STUB_REGS)
    to = GENERAL_REGS;

  if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS
      || from == STUB_REGS)
    from = GENERAL_REGS;

  /* Make RDFFR very expensive.  In particular, if we know that the FFR
     contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
     as a way of obtaining a PTRUE.  */
  if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
      && hard_reg_set_subset_p (reg_class_contents[from_i],
                                reg_class_contents[FFR_REGS]))
    return 80;

  /* Moving between GPR and stack cost is the same as GP2GP.  */
  if ((from == GENERAL_REGS && to == STACK_REG)
      || (to == GENERAL_REGS && from == STACK_REG))
    return regmove_cost->GP2GP;

  /* To/From the stack register, we move via the gprs.  */
  if (to == STACK_REG || from == STACK_REG)
    return aarch64_register_move_cost (mode, from, GENERAL_REGS)
           + aarch64_register_move_cost (mode, GENERAL_REGS, to);

  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if (vec_flags != (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL)
      && known_eq (GET_MODE_SIZE (mode), 16))
    {
      /* 128-bit operations on general registers require 2 instructions.  */
      if (from == GENERAL_REGS && to == GENERAL_REGS)
        return regmove_cost->GP2GP * 2;
      else if (from == GENERAL_REGS)
        return regmove_cost->GP2FP * 2;
      else if (to == GENERAL_REGS)
        return regmove_cost->FP2GP * 2;

      /* When AdvSIMD instructions are disabled it is not possible to move
         a 128-bit value directly between Q registers.  This is handled in
         secondary reload.  A general register is used as a scratch to move
         the upper DI value and the lower DI value is moved directly,
         hence the cost is the sum of three moves.  */
      if (!TARGET_SIMD)
        return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;

      return regmove_cost->FP2FP;
    }

  if (from == GENERAL_REGS && to == GENERAL_REGS)
    return regmove_cost->GP2GP;
  else if (from == GENERAL_REGS)
    return regmove_cost->GP2FP;
  else if (to == GENERAL_REGS)
    return regmove_cost->FP2GP;

  if (!TARGET_SIMD && vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
    {
      /* Needs a round-trip through memory, which can use LDP/STP for pairs.
         The cost must be greater than 2 units to indicate that direct
         moves aren't possible.  */
      auto per_vector = (aarch64_tune_params.memmov_cost.load_fp
                         + aarch64_tune_params.memmov_cost.store_fp);
      return MIN (CEIL (per_vector, 2), 4);
    }

  return regmove_cost->FP2FP;
}
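
/* For example, moving a 128-bit TImode value from GENERAL_REGS to FP_REGS
   is costed as 2 * GP2FP above, since it takes two 64-bit transfers to
   move the full 128 bits.  */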
/* Implements TARGET_MEMORY_MOVE_COST.  */
static int
aarch64_memory_move_cost (machine_mode mode, reg_class_t rclass_i, bool in)
{
  enum reg_class rclass = (enum reg_class) rclass_i;
  if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
      ? reg_classes_intersect_p (rclass, PR_REGS)
      : reg_class_subset_p (rclass, PR_REGS))
    return (in
            ? aarch64_tune_params.memmov_cost.load_pred
            : aarch64_tune_params.memmov_cost.store_pred);

  if (VECTOR_MODE_P (mode) || FLOAT_MODE_P (mode)
      ? reg_classes_intersect_p (rclass, FP_REGS)
      : reg_class_subset_p (rclass, FP_REGS))
    return (in
            ? aarch64_tune_params.memmov_cost.load_fp
            : aarch64_tune_params.memmov_cost.store_fp);

  return (in
          ? aarch64_tune_params.memmov_cost.load_int
          : aarch64_tune_params.memmov_cost.store_int);
}
/* Implement TARGET_INIT_BUILTINS.  */
static void
aarch64_init_builtins ()
{
  aarch64_general_init_builtins ();
  aarch64_sve::init_builtins ();
#ifdef SUBTARGET_INIT_BUILTINS
  SUBTARGET_INIT_BUILTINS;
#endif
}
/* Implement TARGET_FOLD_BUILTIN.  */
static tree
aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
{
  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  tree type = TREE_TYPE (TREE_TYPE (fndecl));
  switch (code & AARCH64_BUILTIN_CLASS)
    {
    case AARCH64_BUILTIN_GENERAL:
      return aarch64_general_fold_builtin (subcode, type, nargs, args);

    case AARCH64_BUILTIN_SVE:
      return NULL_TREE;
    }
  gcc_unreachable ();
}
/* Implement TARGET_GIMPLE_FOLD_BUILTIN.  */
static bool
aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
{
  gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
  tree fndecl = gimple_call_fndecl (stmt);
  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  gimple *new_stmt = NULL;
  switch (code & AARCH64_BUILTIN_CLASS)
    {
    case AARCH64_BUILTIN_GENERAL:
      new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt, gsi);
      break;

    case AARCH64_BUILTIN_SVE:
      new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
      break;
    }

  if (!new_stmt)
    return false;

  gsi_replace (gsi, new_stmt, false);
  return true;
}
/* Implement TARGET_EXPAND_BUILTIN.  */
static rtx
aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
{
  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  switch (code & AARCH64_BUILTIN_CLASS)
    {
    case AARCH64_BUILTIN_GENERAL:
      return aarch64_general_expand_builtin (subcode, exp, target, ignore);

    case AARCH64_BUILTIN_SVE:
      return aarch64_sve::expand_builtin (subcode, exp, target);
    }
  gcc_unreachable ();
}
/* Implement TARGET_BUILTIN_DECL.  */
static tree
aarch64_builtin_decl (unsigned int code, bool initialize_p)
{
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  switch (code & AARCH64_BUILTIN_CLASS)
    {
    case AARCH64_BUILTIN_GENERAL:
      return aarch64_general_builtin_decl (subcode, initialize_p);

    case AARCH64_BUILTIN_SVE:
      return aarch64_sve::builtin_decl (subcode, initialize_p);
    }
  gcc_unreachable ();
}
/* Return true if it is safe and beneficial to use the approximate rsqrt optabs
   to optimize 1.0/sqrt.  */

static bool
use_rsqrt_p (machine_mode mode)
{
  return (!flag_trapping_math
          && flag_unsafe_math_optimizations
          && ((aarch64_tune_params.approx_modes->recip_sqrt
               & AARCH64_APPROX_MODE (mode))
              || flag_mrecip_low_precision_sqrt));
}
/* Function to decide when to use the approximate reciprocal square root
   builtin.  */

static tree
aarch64_builtin_reciprocal (tree fndecl)
{
  machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));

  if (!use_rsqrt_p (mode))
    return NULL_TREE;
  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  switch (code & AARCH64_BUILTIN_CLASS)
    {
    case AARCH64_BUILTIN_GENERAL:
      return aarch64_general_builtin_rsqrt (subcode);

    case AARCH64_BUILTIN_SVE:
      return NULL_TREE;
    }
  gcc_unreachable ();
}
/* Emit code to perform the floating-point operation:

     DST = SRC1 * SRC2

   where all three operands are already known to be registers.
   If the operation is an SVE one, PTRUE is a suitable all-true
   predicate.  */

static void
aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
{
  if (ptrue)
    emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
                                 dst, ptrue, src1, src2,
                                 gen_int_mode (SVE_RELAXED_GP, SImode)));
  else
    emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
}
/* Emit instruction sequence to compute either the approximate square root
   or its approximate reciprocal, depending on the flag RECP, and return
   whether the sequence was emitted or not.  */

bool
aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
{
  machine_mode mode = GET_MODE (dst);

  if (GET_MODE_INNER (mode) == HFmode)
    {
      gcc_assert (!recp);
      return false;
    }

  if (!recp)
    {
      if (!(flag_mlow_precision_sqrt
            || (aarch64_tune_params.approx_modes->sqrt
                & AARCH64_APPROX_MODE (mode))))
        return false;

      if (!flag_finite_math_only
          || flag_trapping_math
          || !flag_unsafe_math_optimizations
          || optimize_function_for_size_p (cfun))
        return false;
    }
  else
    /* Caller assumes we cannot fail.  */
    gcc_assert (use_rsqrt_p (mode));

  rtx pg = NULL_RTX;
  if (aarch64_sve_mode_p (mode))
    pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
  machine_mode mmsk = (VECTOR_MODE_P (mode)
                       ? related_int_vector_mode (mode).require ()
                       : int_mode_for_mode (mode).require ());
  rtx xmsk = NULL_RTX;
  if (!recp)
    {
      /* When calculating the approximate square root, compare the
         argument with 0.0 and create a mask.  */
      rtx zero = CONST0_RTX (mode);
      if (pg)
        {
          xmsk = gen_reg_rtx (GET_MODE (pg));
          rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
          emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
                                           xmsk, pg, hint, src, zero));
        }
      else
        {
          xmsk = gen_reg_rtx (mmsk);
          emit_insn (gen_rtx_SET (xmsk,
                                  gen_rtx_NEG (mmsk,
                                               gen_rtx_EQ (mmsk, src, zero))));
        }
    }

  /* Estimate the approximate reciprocal square root.  */
  rtx xdst = gen_reg_rtx (mode);
  emit_insn (gen_aarch64_rsqrte (mode, xdst, src));

  /* Iterate over the series twice for SF and thrice for DF.  */
  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;

  /* Optionally iterate over the series once less for faster performance
     while sacrificing the accuracy.  */
  if ((recp && flag_mrecip_low_precision_sqrt)
      || (!recp && flag_mlow_precision_sqrt))
    iterations--;

  /* Iterate over the series to calculate the approximate reciprocal square
     root.  */
  rtx x1 = gen_reg_rtx (mode);
  while (iterations--)
    {
      rtx x2 = gen_reg_rtx (mode);
      aarch64_emit_mult (x2, pg, xdst, xdst);

      emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));

      if (iterations > 0)
        aarch64_emit_mult (xdst, pg, xdst, x1);
    }

  if (!recp)
    {
      if (pg)
        /* Multiply nonzero source values by the corresponding intermediate
           result elements, so that the final calculation is the approximate
           square root rather than its reciprocal.  Select a zero result for
           zero source values, to avoid the Inf * 0 -> NaN that we'd get
           otherwise.  */
        emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
                             xdst, xmsk, xdst, src, CONST0_RTX (mode)));
      else
        {
          /* Qualify the approximate reciprocal square root when the
             argument is 0.0 by squashing the intermediary result to 0.0.  */
          rtx xtmp = gen_reg_rtx (mmsk);
          emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
                                            gen_rtx_SUBREG (mmsk, xdst, 0)));
          emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));

          /* Calculate the approximate square root.  */
          aarch64_emit_mult (xdst, pg, xdst, src);
        }
    }

  /* Finalize the approximation.  */
  aarch64_emit_mult (dst, pg, xdst, x1);

  return true;
}
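
/* Each FRSQRTS step above performs one Newton-Raphson iteration for
   1/sqrt(d): x_{n+1} = x_n * (3 - d * x_n^2) / 2, where the d * x_n^2
   term comes from the preceding multiply of the current estimate with
   itself.  */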
/* Emit the instruction sequence to compute the approximation for the division
   of NUM by DEN in QUO and return whether the sequence was emitted or not.  */

bool
aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
{
  machine_mode mode = GET_MODE (quo);

  if (GET_MODE_INNER (mode) == HFmode)
    return false;

  bool use_approx_division_p = (flag_mlow_precision_div
                                || (aarch64_tune_params.approx_modes->division
                                    & AARCH64_APPROX_MODE (mode)));

  if (!flag_finite_math_only
      || flag_trapping_math
      || !flag_unsafe_math_optimizations
      || optimize_function_for_size_p (cfun)
      || !use_approx_division_p)
    return false;

  if (!TARGET_SIMD && VECTOR_MODE_P (mode))
    return false;

  rtx pg = NULL_RTX;
  if (aarch64_sve_mode_p (mode))
    pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));

  /* Estimate the approximate reciprocal.  */
  rtx xrcp = gen_reg_rtx (mode);
  emit_insn (gen_aarch64_frecpe (mode, xrcp, den));

  /* Iterate over the series twice for SF and thrice for DF.  */
  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;

  /* Optionally iterate over the series less for faster performance,
     while sacrificing the accuracy.  The default is 2 for DF and 1 for SF.  */
  if (flag_mlow_precision_div)
    iterations = (GET_MODE_INNER (mode) == DFmode
                  ? aarch64_double_recp_precision
                  : aarch64_float_recp_precision);

  /* Iterate over the series to calculate the approximate reciprocal.  */
  rtx xtmp = gen_reg_rtx (mode);
  while (iterations--)
    {
      emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));

      if (iterations > 0)
        aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
    }

  if (num != CONST1_RTX (mode))
    {
      /* As the approximate reciprocal of DEN is already calculated, only
         calculate the approximate division when NUM is not 1.0.  */
      rtx xnum = force_reg (mode, num);
      aarch64_emit_mult (xrcp, pg, xrcp, xnum);
    }

  /* Finalize the approximation.  */
  aarch64_emit_mult (quo, pg, xrcp, xtmp);
  return true;
}
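
/* Each FRECPS step above performs one Newton-Raphson iteration for the
   reciprocal 1/d: x_{n+1} = x_n * (2 - d * x_n).  */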
/* Return the number of instructions that can be issued per cycle.  */
static int
aarch64_sched_issue_rate (void)
{
  return aarch64_tune_params.issue_rate;
}

/* Implement TARGET_SCHED_VARIABLE_ISSUE.  */
static int
aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
{
  if (DEBUG_INSN_P (insn))
    return more;

  rtx_code code = GET_CODE (PATTERN (insn));
  if (code == USE || code == CLOBBER)
    return more;

  if (get_attr_type (insn) == TYPE_NO_INSN)
    return more;

  return more - 1;
}

static int
aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
{
  int issue_rate = aarch64_sched_issue_rate ();

  return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
}

/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
   autopref_multipass_dfa_lookahead_guard from haifa-sched.cc.  It only
   has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */
static int
aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
                                                   int ready_index)
{
  return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
}
/* Vectorizer cost model target hooks.  */

/* Information about how the CPU would issue the scalar, Advanced SIMD
   or SVE version of a vector loop, using the scheme defined by the
   aarch64_base_vec_issue_info hierarchy of structures.  */
class aarch64_vec_op_count
{
public:
  aarch64_vec_op_count () = default;
  aarch64_vec_op_count (const aarch64_vec_issue_info *, unsigned int,
                        unsigned int = 1);

  unsigned int vec_flags () const { return m_vec_flags; }
  unsigned int vf_factor () const { return m_vf_factor; }

  const aarch64_base_vec_issue_info *base_issue_info () const;
  const aarch64_simd_vec_issue_info *simd_issue_info () const;
  const aarch64_sve_vec_issue_info *sve_issue_info () const;

  fractional_cost rename_cycles_per_iter () const;
  fractional_cost min_nonpred_cycles_per_iter () const;
  fractional_cost min_pred_cycles_per_iter () const;
  fractional_cost min_cycles_per_iter () const;

  void dump () const;

  /* The number of individual "general" operations.  See the comments
     in aarch64_base_vec_issue_info for details.  */
  unsigned int general_ops = 0;

  /* The number of load and store operations, under the same scheme
     as above.  */
  unsigned int loads = 0;
  unsigned int stores = 0;

  /* The minimum number of cycles needed to execute all loop-carried
     operations, which in the vector code become associated with
     reductions.  */
  unsigned int reduction_latency = 0;

  /* The number of individual predicate operations.  See the comments
     in aarch64_sve_vec_issue_info for details.  */
  unsigned int pred_ops = 0;

private:
  /* The issue information for the core.  */
  const aarch64_vec_issue_info *m_issue_info = nullptr;

  /* - If M_VEC_FLAGS is zero then this structure describes scalar code
     - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then this structure describes
       Advanced SIMD code.
     - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then this structure describes
       SVE code.  */
  unsigned int m_vec_flags = 0;

  /* Assume that, when the code is executing on the core described
     by M_ISSUE_INFO, one iteration of the loop will handle M_VF_FACTOR
     times more data than the vectorizer anticipates.

     This is only ever different from 1 for SVE.  It allows us to consider
     what would happen on a 256-bit SVE target even when the -mtune
     parameters say that the “likely” SVE length is 128 bits.  */
  unsigned int m_vf_factor = 1;
};
aarch64_vec_op_count::
aarch64_vec_op_count (const aarch64_vec_issue_info *issue_info,
                      unsigned int vec_flags, unsigned int vf_factor)
  : m_issue_info (issue_info),
    m_vec_flags (vec_flags),
    m_vf_factor (vf_factor)
{
}

/* Return the base issue information (i.e. the parts that make sense
   for both scalar and vector code).  Return null if we have no issue
   information.  */
const aarch64_base_vec_issue_info *
aarch64_vec_op_count::base_issue_info () const
{
  if (auto *ret = simd_issue_info ())
    return ret;
  return m_issue_info->scalar;
}

/* If the structure describes vector code and we have associated issue
   information, return that issue information, otherwise return null.  */
const aarch64_simd_vec_issue_info *
aarch64_vec_op_count::simd_issue_info () const
{
  if (auto *ret = sve_issue_info ())
    return ret;
  if (m_vec_flags)
    return m_issue_info->advsimd;
  return nullptr;
}

/* If the structure describes SVE code and we have associated issue
   information, return that issue information, otherwise return null.  */
const aarch64_sve_vec_issue_info *
aarch64_vec_op_count::sve_issue_info () const
{
  if (m_vec_flags & VEC_ANY_SVE)
    return m_issue_info->sve;
  return nullptr;
}
/* Estimate the minimum number of cycles per iteration needed to rename
   the instructions.

   ??? For now this is done inline rather than via cost tables, since it
   isn't clear how it should be parameterized for the general case.  */
fractional_cost
aarch64_vec_op_count::rename_cycles_per_iter () const
{
  if (sve_issue_info () == &neoverse512tvb_sve_issue_info
      || sve_issue_info () == &neoversen2_sve_issue_info
      || sve_issue_info () == &neoversev2_sve_issue_info)
    /* + 1 for an addition.  We've already counted a general op for each
       store, so we don't need to account for stores separately.  The branch
       reads no registers and so does not need to be counted either.

       ??? This value is very much on the pessimistic side, but seems to work
       pretty well in practice.  */
    return { general_ops + loads + pred_ops + 1, 5 };

  return 0;
}
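
/* For example, an iteration with 6 general ops, 2 loads and 2 predicate
   ops gives (6 + 2 + 2 + 1) / 5 = 2.2 cycles of rename bandwidth.  */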
/* Like min_cycles_per_iter, but excluding predicate operations.  */
fractional_cost
aarch64_vec_op_count::min_nonpred_cycles_per_iter () const
{
  auto *issue_info = base_issue_info ();

  fractional_cost cycles = MAX (reduction_latency, 1);
  cycles = std::max (cycles, { stores, issue_info->stores_per_cycle });
  cycles = std::max (cycles, { loads + stores,
                               issue_info->loads_stores_per_cycle });
  cycles = std::max (cycles, { general_ops,
                               issue_info->general_ops_per_cycle });
  cycles = std::max (cycles, rename_cycles_per_iter ());
  return cycles;
}

/* Like min_cycles_per_iter, but including only the predicate operations.  */
fractional_cost
aarch64_vec_op_count::min_pred_cycles_per_iter () const
{
  if (auto *issue_info = sve_issue_info ())
    return { pred_ops, issue_info->pred_ops_per_cycle };
  return 0;
}

/* Estimate the minimum number of cycles needed to issue the operations.
   This is a very simplistic model!  */
fractional_cost
aarch64_vec_op_count::min_cycles_per_iter () const
{
  return std::max (min_nonpred_cycles_per_iter (),
                   min_pred_cycles_per_iter ());
}
/* Dump information about the structure.  */
void
aarch64_vec_op_count::dump () const
{
  dump_printf_loc (MSG_NOTE, vect_location,
                   "  load operations = %d\n", loads);
  dump_printf_loc (MSG_NOTE, vect_location,
                   "  store operations = %d\n", stores);
  dump_printf_loc (MSG_NOTE, vect_location,
                   "  general operations = %d\n", general_ops);
  if (sve_issue_info ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "  predicate operations = %d\n", pred_ops);
  dump_printf_loc (MSG_NOTE, vect_location,
                   "  reduction latency = %d\n", reduction_latency);
  if (auto rcpi = rename_cycles_per_iter ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "  estimated cycles per iteration to rename = %f\n",
                     rcpi.as_double ());
  if (auto pred_cpi = min_pred_cycles_per_iter ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "  estimated min cycles per iteration"
                       " without predication = %f\n",
                       min_nonpred_cycles_per_iter ().as_double ());
      dump_printf_loc (MSG_NOTE, vect_location,
                       "  estimated min cycles per iteration"
                       " for predication = %f\n", pred_cpi.as_double ());
    }
  if (auto cpi = min_cycles_per_iter ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "  estimated min cycles per iteration = %f\n",
                     cpi.as_double ());
}
/* Information about vector code that we're in the process of costing.  */
class aarch64_vector_costs : public vector_costs
{
public:
  aarch64_vector_costs (vec_info *, bool);

  unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
                              stmt_vec_info stmt_info, slp_tree, tree vectype,
                              int misalign,
                              vect_cost_model_location where) override;
  void finish_cost (const vector_costs *) override;
  bool better_main_loop_than_p (const vector_costs *other) const override;

private:
  void record_potential_advsimd_unrolling (loop_vec_info);
  void analyze_loop_vinfo (loop_vec_info);
  void count_ops (unsigned int, vect_cost_for_stmt, stmt_vec_info,
                  aarch64_vec_op_count *);
  fractional_cost adjust_body_cost_sve (const aarch64_vec_op_count *,
                                        fractional_cost, unsigned int,
                                        unsigned int *, bool *);
  unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
                                 unsigned int);
  bool prefer_unrolled_loop () const;
  unsigned int determine_suggested_unroll_factor ();

  /* True if we have performed one-time initialization based on the
     vec_info.  */
  bool m_analyzed_vinfo = false;

  /* This loop uses an average operation that is not supported by SVE, but is
     supported by Advanced SIMD and SVE2.  */
  bool m_has_avg = false;

  /* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
     - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
       SIMD code.
     - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code.  */
  unsigned int m_vec_flags = 0;

  /* At the moment, we do not model LDP and STP in the vector and scalar costs.
     This means that code such as a store of two adjacent scalar constants

     will be costed as two scalar instructions and two vector instructions
     (a scalar_to_vec and an unaligned_store).  For SLP, the vector form
     wins if the costs are equal, because of the fact that the vector costs
     include constant initializations whereas the scalar costs don't.
     We would therefore tend to vectorize the code above, even though
     the scalar version can use a single STP.

     We should eventually fix this and model LDP and STP in the main costs;
     see the comment in aarch64_sve_adjust_stmt_cost for some of the problems.
     Until then, we look specifically for code that does nothing more than
     STP-like operations.  We cost them on that basis in addition to the
     normal latency-based costs.

     If the scalar or vector code could be a sequence of STPs +
     initialization, this variable counts the cost of the sequence,
     with 2 units per instruction.  The variable is ~0U for other
     kinds of code.  */
  unsigned int m_stp_sequence_cost = 0;

  /* On some CPUs, SVE and Advanced SIMD provide the same theoretical vector
     throughput, such as 4x128 Advanced SIMD vs. 2x256 SVE.  In those
     situations, we try to predict whether an Advanced SIMD implementation
     of the loop could be completely unrolled and become straight-line code.
     If so, it is generally better to use the Advanced SIMD version rather
     than length-agnostic SVE, since the SVE loop would execute an unknown
     number of times and so could not be completely unrolled in the same way.

     If we're applying this heuristic, M_UNROLLED_ADVSIMD_NITERS is the
     number of Advanced SIMD loop iterations that would be unrolled and
     M_UNROLLED_ADVSIMD_STMTS estimates the total number of statements
     in the unrolled loop.  Both values are zero if we're not applying
     the heuristic.  */
  unsigned HOST_WIDE_INT m_unrolled_advsimd_niters = 0;
  unsigned HOST_WIDE_INT m_unrolled_advsimd_stmts = 0;

  /* If we're vectorizing a loop that executes a constant number of times,
     this variable gives the number of times that the vector loop would
     iterate, otherwise it is zero.  */
  uint64_t m_num_vector_iterations = 0;

  /* Used only when vectorizing loops.  Estimates the number and kind of
     operations that would be needed by one iteration of the scalar
     or vector loop.  There is one entry for each tuning option of
     interest.  */
  auto_vec<aarch64_vec_op_count, 2> m_ops;
};
aarch64_vector_costs::aarch64_vector_costs (vec_info *vinfo,
					    bool costing_for_scalar)
  : vector_costs (vinfo, costing_for_scalar),
    m_vec_flags (costing_for_scalar ? 0
		 : aarch64_classify_vector_mode (vinfo->vector_mode))
{
  if (auto *issue_info = aarch64_tune_params.vec_costs->issue_info)
    {
      m_ops.quick_push ({ issue_info, m_vec_flags });
      if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
	{
	  unsigned int vf_factor = (m_vec_flags & VEC_ANY_SVE) ? 2 : 1;
	  m_ops.quick_push ({ &neoversev1_vec_issue_info, m_vec_flags,
			      vf_factor });
	}
    }
}

/* Implement TARGET_VECTORIZE_CREATE_COSTS.  */
static vector_costs *
aarch64_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
{
  return new aarch64_vector_costs (vinfo, costing_for_scalar);
}

/* Return true if the current CPU should use the new costs defined
   in GCC 11.  This should be removed for GCC 12 and above, with the
   costs applying to all CPUs instead.  */
static bool
aarch64_use_new_vector_costs_p ()
{
  return (aarch64_tune_params.extra_tuning_flags
	  & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
}
/* Return the appropriate SIMD costs for vectors of type VECTYPE.  */
static const simd_vec_cost *
aarch64_simd_vec_costs (tree vectype)
{
  const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
  if (vectype != NULL
      && aarch64_sve_mode_p (TYPE_MODE (vectype))
      && costs->sve != NULL)
    return costs->sve;
  return costs->advsimd;
}

/* Return the appropriate SIMD costs for vectors with VEC_* flags FLAGS.  */
static const simd_vec_cost *
aarch64_simd_vec_costs_for_flags (unsigned int flags)
{
  const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
  if ((flags & VEC_ANY_SVE) && costs->sve)
    return costs->sve;
  return costs->advsimd;
}

/* If STMT_INFO is a memory reference, return the scalar memory type,
   otherwise return null.  */
static tree
aarch64_dr_type (stmt_vec_info stmt_info)
{
  if (auto dr = STMT_VINFO_DATA_REF (stmt_info))
    return TREE_TYPE (DR_REF (dr));
  return NULL_TREE;
}
/* Decide whether to use the unrolling heuristic described above
   m_unrolled_advsimd_niters, updating that field if so.  LOOP_VINFO
   describes the loop that we're vectorizing.  */
void
aarch64_vector_costs::
record_potential_advsimd_unrolling (loop_vec_info loop_vinfo)
{
  /* The heuristic only makes sense on targets that have the same
     vector throughput for SVE and Advanced SIMD.  */
  if (!(aarch64_tune_params.extra_tuning_flags
	& AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT))
    return;

  /* We only want to apply the heuristic if LOOP_VINFO is being
     vectorized for SVE.  */
  if (!(m_vec_flags & VEC_ANY_SVE))
    return;

  /* Check whether it is possible in principle to use Advanced SIMD
     instead.  */
  if (aarch64_autovec_preference == 2)
    return;

  /* We don't want to apply the heuristic to outer loops, since it's
     harder to track two levels of unrolling.  */
  if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
    return;

  /* Only handle cases in which the number of Advanced SIMD iterations
     would be known at compile time but the number of SVE iterations
     would not.  */
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || aarch64_sve_vg.is_constant ())
    return;

  /* Guess how many times the Advanced SIMD loop would iterate and make
     sure that it is within the complete unrolling limit.  Even if the
     number of iterations is small enough, the number of statements might
     not be, which is why we need to estimate the number of statements too.  */
  unsigned int estimated_vq = aarch64_estimated_sve_vq ();
  unsigned int advsimd_vf = CEIL (vect_vf_for_cost (loop_vinfo), estimated_vq);
  unsigned HOST_WIDE_INT unrolled_advsimd_niters
    = LOOP_VINFO_INT_NITERS (loop_vinfo) / advsimd_vf;
  if (unrolled_advsimd_niters > (unsigned int) param_max_completely_peel_times)
    return;

  /* Record that we're applying the heuristic and should try to estimate
     the number of statements in the Advanced SIMD loop.  */
  m_unrolled_advsimd_niters = unrolled_advsimd_niters;
}
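
/* Editorial illustration (not part of the original sources): with an
   estimated SVE vector length of 256 bits, aarch64_estimated_sve_vq ()
   returns 2, so an SVE VF estimate of 8 maps to an Advanced SIMD VF of
   CEIL (8, 2) = 4.  A loop with a compile-time trip count of 64 would
   then need 64 / 4 = 16 fully-unrolled Advanced SIMD iterations, which
   is checked against param_max_completely_peel_times before the
   heuristic is recorded.  */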
/* Do one-time initialization of the aarch64_vector_costs given that we're
   costing the loop vectorization described by LOOP_VINFO.  */
void
aarch64_vector_costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
{
  /* Record the number of times that the vector loop would execute,
     if known.  */
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  auto scalar_niters = max_stmt_executions_int (loop);
  if (scalar_niters >= 0)
    {
      unsigned int vf = vect_vf_for_cost (loop_vinfo);
      if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
	m_num_vector_iterations = scalar_niters / vf;
      else
	m_num_vector_iterations = CEIL (scalar_niters, vf);
    }

  /* Detect whether we're vectorizing for SVE and should apply the unrolling
     heuristic described above m_unrolled_advsimd_niters.  */
  record_potential_advsimd_unrolling (loop_vinfo);

  /* Record the issue information for any SVE WHILE instructions that the
     loop needs.  */
  if (!m_ops.is_empty () && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
    {
      unsigned int num_masks = 0;
      rgroup_controls *rgm;
      unsigned int num_vectors_m1;
      FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
	if (rgm->type)
	  num_masks += num_vectors_m1 + 1;
      for (auto &ops : m_ops)
	if (auto *issue = ops.sve_issue_info ())
	  ops.pred_ops += num_masks * issue->while_pred_ops;
    }
}
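
/* Editorial illustration (not part of the original sources): for a
   predicated SVE loop whose rgroup_controls contain, say, one single-mask
   group and one two-mask group, the code above computes
   num_masks = 1 + 2 = 3, and each entry in m_ops is charged
   3 * while_pred_ops extra predicate operations to model the WHILELO
   instructions that set up those masks.  */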
/* Implement targetm.vectorize.builtin_vectorization_cost.  */
static int
aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
				    tree vectype,
				    int misalign ATTRIBUTE_UNUSED)
{
  unsigned elements;
  const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
  bool fp = false;

  if (vectype != NULL)
    fp = FLOAT_TYPE_P (vectype);

  const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);

  switch (type_of_cost)
    {
    case scalar_stmt:
      return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;

    case scalar_load:
      return costs->scalar_load_cost;

    case scalar_store:
      return costs->scalar_store_cost;

    case vector_stmt:
      return fp ? simd_costs->fp_stmt_cost
		: simd_costs->int_stmt_cost;

    case vector_load:
      return simd_costs->align_load_cost;

    case vector_store:
      return simd_costs->store_cost;

    case vec_to_scalar:
      return simd_costs->vec_to_scalar_cost;

    case scalar_to_vec:
      return simd_costs->scalar_to_vec_cost;

    case unaligned_load:
    case vector_gather_load:
      return simd_costs->unalign_load_cost;

    case unaligned_store:
    case vector_scatter_store:
      return simd_costs->unalign_store_cost;

    case cond_branch_taken:
      return costs->cond_taken_branch_cost;

    case cond_branch_not_taken:
      return costs->cond_not_taken_branch_cost;

    case vec_perm:
      return simd_costs->permute_cost;

    case vec_promote_demote:
      return fp ? simd_costs->fp_stmt_cost
		: simd_costs->int_stmt_cost;

    case vec_construct:
      elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
      return elements / 2 + 1;

    default:
      gcc_unreachable ();
    }
}
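
/* Editorial illustration (not part of the original sources): for the
   vec_construct case above, a V4SI constructor has TYPE_VECTOR_SUBPARTS
   equal to 4, so its cost is 4 / 2 + 1 = 3, roughly modelling one
   insertion per pair of elements plus an initial move.  */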
/* Return true if an access of kind KIND for STMT_INFO represents one
   vector of an LD[234] or ST[234] operation.  Return the total number of
   vectors (2, 3 or 4) if so, otherwise return a value outside that range.  */
static int
aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info)
{
  if ((kind == vector_load
       || kind == unaligned_load
       || kind == vector_store
       || kind == unaligned_store)
      && STMT_VINFO_DATA_REF (stmt_info))
    {
      stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
      if (stmt_info
	  && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
	return DR_GROUP_SIZE (stmt_info);
    }
  return 0;
}

/* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
   vectors would produce a series of LDP or STP operations.  KIND is the
   kind of statement that STMT_INFO represents.  */
static bool
aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
			   stmt_vec_info stmt_info)
{
  switch (kind)
    {
    case vector_load:
    case vector_store:
    case unaligned_load:
    case unaligned_store:
      break;

    default:
      return false;
    }

  if (aarch64_tune_params.extra_tuning_flags
      & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
    return false;

  return is_gimple_assign (stmt_info->stmt);
}
/* Return true if STMT_INFO is the second part of a two-statement multiply-add
   or multiply-subtract sequence that might be suitable for fusing into a
   single instruction.  If VEC_FLAGS is zero, analyze the operation as
   a scalar one, otherwise analyze it as an operation on vectors with those
   VEC_* flags.  */
static bool
aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
			unsigned int vec_flags)
{
  gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
  if (!assign)
    return false;
  tree_code code = gimple_assign_rhs_code (assign);
  if (code != PLUS_EXPR && code != MINUS_EXPR)
    return false;

  if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
      || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign)))
    return false;

  for (int i = 1; i < 3; ++i)
    {
      tree rhs = gimple_op (assign, i);
      /* ??? Should we try to check for a single use as well? */
      if (TREE_CODE (rhs) != SSA_NAME)
	continue;

      stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
      if (!def_stmt_info
	  || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
	continue;
      gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
      if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
	continue;

      if (vec_flags & VEC_ADVSIMD)
	{
	  /* Scalar and SVE code can tie the result to any FMLA input (or none,
	     although that requires a MOVPRFX for SVE).  However, Advanced SIMD
	     only supports MLA forms, so will require a move if the result
	     cannot be tied to the accumulator.  The most important case in
	     which this is true is when the accumulator input is invariant.  */
	  rhs = gimple_op (assign, 3 - i);
	  if (TREE_CODE (rhs) != SSA_NAME)
	    return false;
	  def_stmt_info = vinfo->lookup_def (rhs);
	  if (!def_stmt_info
	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def)
	    return false;
	}

      return true;
    }
  return false;
}
/* We are considering implementing STMT_INFO using SVE.  If STMT_INFO is an
   in-loop reduction that SVE supports directly, return its latency in cycles,
   otherwise return zero.  SVE_COSTS specifies the latencies of the relevant
   instructions.  */
static unsigned int
aarch64_sve_in_loop_reduction_latency (vec_info *vinfo,
				       stmt_vec_info stmt_info,
				       const sve_vec_cost *sve_costs)
{
  switch (vect_reduc_type (vinfo, stmt_info))
    {
    case EXTRACT_LAST_REDUCTION:
      return sve_costs->clast_cost;

    case FOLD_LEFT_REDUCTION:
      switch (TYPE_MODE (TREE_TYPE (gimple_get_lhs (stmt_info->stmt))))
	{
	case E_HFmode:
	case E_BFmode:
	  return sve_costs->fadda_f16_cost;

	case E_SFmode:
	  return sve_costs->fadda_f32_cost;

	case E_DFmode:
	  return sve_costs->fadda_f64_cost;

	default:
	  break;
	}
      break;
    }

  return 0;
}
/* STMT_INFO describes a loop-carried operation in the original scalar code
   that we are considering implementing as a reduction.  Return one of the
   following values, depending on VEC_FLAGS:

   - If VEC_FLAGS is zero, return the loop carry latency of the original
     scalar operation.

   - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the
     Advanced SIMD implementation.

   - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the
     SVE implementation.  */
static unsigned int
aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info,
				   unsigned int vec_flags)
{
  const cpu_vector_cost *vec_costs = aarch64_tune_params.vec_costs;
  const sve_vec_cost *sve_costs = nullptr;
  if (vec_flags & VEC_ANY_SVE)
    sve_costs = aarch64_tune_params.vec_costs->sve;

  /* If the caller is asking for the SVE latency, check for forms of reduction
     that only SVE can handle directly.  */
  if (sve_costs)
    {
      unsigned int latency
	= aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
      if (latency)
	return latency;
    }

  /* Handle scalar costs.  */
  bool is_float = FLOAT_TYPE_P (TREE_TYPE (gimple_get_lhs (stmt_info->stmt)));
  if (vec_flags == 0)
    {
      if (is_float)
	return vec_costs->scalar_fp_stmt_cost;
      return vec_costs->scalar_int_stmt_cost;
    }

  /* Otherwise, the loop body just contains normal integer or FP operations,
     with a vector reduction outside the loop.  */
  const simd_vec_cost *simd_costs
    = aarch64_simd_vec_costs_for_flags (vec_flags);
  if (is_float)
    return simd_costs->fp_stmt_cost;
  return simd_costs->int_stmt_cost;
}
/* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
   for STMT_INFO, which has cost kind KIND.  If this is a scalar operation,
   try to subdivide the target-independent categorization provided by KIND
   to get a more accurate cost.  */
static fractional_cost
aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
				    stmt_vec_info stmt_info,
				    fractional_cost stmt_cost)
{
  /* Detect an extension of a loaded value.  In general, we'll be able to fuse
     the extension with the load.  */
  if (kind == scalar_stmt && vect_is_extending_load (vinfo, stmt_info))
    return 0;

  return stmt_cost;
}
/* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
   for the vectorized form of STMT_INFO, which has cost kind KIND and which
   when vectorized would operate on vector type VECTYPE.  Try to subdivide
   the target-independent categorization provided by KIND to get a more
   accurate cost.  WHERE specifies where the cost associated with KIND
   occurs.  */
static fractional_cost
aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
				    stmt_vec_info stmt_info, tree vectype,
				    enum vect_cost_model_location where,
				    fractional_cost stmt_cost)
{
  const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
  const sve_vec_cost *sve_costs = nullptr;
  if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
    sve_costs = aarch64_tune_params.vec_costs->sve;

  /* It's generally better to avoid costing inductions, since the induction
     will usually be hidden by other operations.  This is particularly true
     for things like COND_REDUCTIONS.  */
  if (is_a<gphi *> (stmt_info->stmt))
    return 0;

  /* Detect cases in which vec_to_scalar is describing the extraction of a
     vector element in preparation for a scalar store.  The store itself is
     costed separately.  */
  if (vect_is_store_elt_extraction (kind, stmt_info))
    return simd_costs->store_elt_extra_cost;

  /* Detect SVE gather loads, which are costed as a single scalar_load
     for each element.  We therefore need to divide the full-instruction
     cost by the number of elements in the vector.  */
  if (kind == scalar_load
      && sve_costs
      && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
    {
      unsigned int nunits = vect_nunits_for_cost (vectype);
      if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64)
	return { sve_costs->gather_load_x64_cost, nunits };
      return { sve_costs->gather_load_x32_cost, nunits };
    }

  /* Detect cases in which a scalar_store is really storing one element
     in a scatter operation.  */
  if (kind == scalar_store
      && sve_costs
      && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
    return sve_costs->scatter_store_elt_cost;

  /* Detect cases in which vec_to_scalar represents an in-loop reduction.  */
  if (kind == vec_to_scalar
      && where == vect_body
      && sve_costs)
    {
      unsigned int latency
	= aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
      if (latency)
	return latency;
    }

  /* Detect cases in which vec_to_scalar represents a single reduction
     instruction like FADDP or MAXV.  */
  if (kind == vec_to_scalar
      && where == vect_epilogue
      && vect_is_reduction (stmt_info))
    switch (GET_MODE_INNER (TYPE_MODE (vectype)))
      {
      case E_QImode:
	return simd_costs->reduc_i8_cost;

      case E_HImode:
	return simd_costs->reduc_i16_cost;

      case E_SImode:
	return simd_costs->reduc_i32_cost;

      case E_DImode:
	return simd_costs->reduc_i64_cost;

      case E_HFmode:
      case E_BFmode:
	return simd_costs->reduc_f16_cost;

      case E_SFmode:
	return simd_costs->reduc_f32_cost;

      case E_DFmode:
	return simd_costs->reduc_f64_cost;

      default:
	break;
      }

  /* Otherwise stick with the original categorization.  */
  return stmt_cost;
}
/* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
   for STMT_INFO, which has cost kind KIND and which when vectorized would
   operate on vector type VECTYPE.  Adjust the cost as necessary for SVE
   targets.  */
static fractional_cost
aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
			      stmt_vec_info stmt_info, tree vectype,
			      fractional_cost stmt_cost)
{
  /* Unlike vec_promote_demote, vector_stmt conversions do not change the
     vector register size or number of units.  Integer promotions of this
     type therefore map to SXT[BHW] or UXT[BHW].

     Most loads have extending forms that can do the sign or zero extension
     on the fly.  Optimistically assume that a load followed by an extension
     will fold to this form during combine, and that the extension therefore
     comes for free.  */
  if (kind == vector_stmt && vect_is_extending_load (vinfo, stmt_info))
    stmt_cost = 0;

  /* For similar reasons, vector_stmt integer truncations are a no-op,
     because we can just ignore the unused upper bits of the source.  */
  if (kind == vector_stmt && vect_is_integer_truncation (stmt_info))
    stmt_cost = 0;

  /* Advanced SIMD can load and store pairs of registers using LDP and STP,
     but there are no equivalent instructions for SVE.  This means that
     (all other things being equal) 128-bit SVE needs twice as many load
     and store instructions as Advanced SIMD in order to process vector pairs.

     Also, scalar code can often use LDP and STP to access pairs of values,
     so it is too simplistic to say that one SVE load or store replaces
     VF scalar loads and stores.

     Ideally we would account for this in the scalar and Advanced SIMD
     costs by making suitable load/store pairs as cheap as a single
     load/store.  However, that would be a very invasive change and in
     practice it tends to stress other parts of the cost model too much.
     E.g. stores of scalar constants currently count just a store,
     whereas stores of vector constants count a store and a vec_init.
     This is an artificial distinction for AArch64, where stores of
     nonzero scalar constants need the same kind of register invariant
     as vector stores.

     An alternative would be to double the cost of any SVE loads and stores
     that could be paired in Advanced SIMD (and possibly also paired in
     scalar code).  But this tends to stress other parts of the cost model
     in the same way.  It also means that we can fall back to Advanced SIMD
     even if full-loop predication would have been useful.

     Here we go for a more conservative version: double the costs of SVE
     loads and stores if one iteration of the scalar loop processes enough
     elements for it to use a whole number of Advanced SIMD LDP or STP
     instructions.  This makes it very likely that the VF would be 1 for
     Advanced SIMD, and so no epilogue should be needed.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info);
      unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first);
      unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
      if (multiple_p (count * elt_bits, 256)
	  && aarch64_advsimd_ldp_stp_p (kind, stmt_info))
	stmt_cost *= 2;
    }

  return stmt_cost;
}
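
/* Editorial illustration (not part of the original sources): in the
   doubling condition above, a grouped access of four 64-bit elements gives
   count * elt_bits = 4 * 64 = 256 bits, i.e. a whole number of 128-bit
   Advanced SIMD LDP/STP pairs, so the SVE load/store cost is doubled;
   a group of three 32-bit elements (96 bits) would leave the cost
   unchanged.  */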
/* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND
   and which when vectorized would operate on vector type VECTYPE.  Add the
   cost of any embedded operations.  */
static fractional_cost
aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
			  tree vectype, fractional_cost stmt_cost)
{
  if (vectype)
    {
      const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);

      /* Detect cases in which a vector load or store represents an
	 LD[234] or ST[234] instruction.  */
      switch (aarch64_ld234_st234_vectors (kind, stmt_info))
	{
	case 2:
	  stmt_cost += simd_costs->ld2_st2_permute_cost;
	  break;

	case 3:
	  stmt_cost += simd_costs->ld3_st3_permute_cost;
	  break;

	case 4:
	  stmt_cost += simd_costs->ld4_st4_permute_cost;
	  break;
	}

      if (kind == vector_stmt || kind == vec_to_scalar)
	if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
	  {
	    if (FLOAT_TYPE_P (cmp_type))
	      stmt_cost += simd_costs->fp_stmt_cost;
	    else
	      stmt_cost += simd_costs->int_stmt_cost;
	  }
    }

  if (kind == scalar_stmt)
    if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
      {
	if (FLOAT_TYPE_P (cmp_type))
	  stmt_cost += aarch64_tune_params.vec_costs->scalar_fp_stmt_cost;
	else
	  stmt_cost += aarch64_tune_params.vec_costs->scalar_int_stmt_cost;
      }

  return stmt_cost;
}
16624 /* COUNT, KIND and STMT_INFO are the same as for vector_costs::add_stmt_cost
16625 and they describe an operation in the body of a vector loop. Record issue
16626 information relating to the vector operation in OPS. */
16628 aarch64_vector_costs::count_ops (unsigned int count
, vect_cost_for_stmt kind
,
16629 stmt_vec_info stmt_info
,
16630 aarch64_vec_op_count
*ops
)
16632 const aarch64_base_vec_issue_info
*base_issue
= ops
->base_issue_info ();
16635 const aarch64_simd_vec_issue_info
*simd_issue
= ops
->simd_issue_info ();
16636 const aarch64_sve_vec_issue_info
*sve_issue
= ops
->sve_issue_info ();
16638 /* Calculate the minimum cycles per iteration imposed by a reduction
16640 if ((kind
== scalar_stmt
|| kind
== vector_stmt
|| kind
== vec_to_scalar
)
16641 && vect_is_reduction (stmt_info
))
16644 = aarch64_in_loop_reduction_latency (m_vinfo
, stmt_info
, m_vec_flags
);
16646 /* ??? Ideally we'd do COUNT reductions in parallel, but unfortunately
16647 that's not yet the case. */
16648 ops
->reduction_latency
= MAX (ops
->reduction_latency
, base
* count
);
16651 /* Assume that multiply-adds will become a single operation. */
16652 if (stmt_info
&& aarch64_multiply_add_p (m_vinfo
, stmt_info
, m_vec_flags
))
16655 /* Count the basic operation cost associated with KIND. */
16658 case cond_branch_taken
:
16659 case cond_branch_not_taken
:
16660 case vector_gather_load
:
16661 case vector_scatter_store
:
16662 /* We currently don't expect these to be used in a loop body. */
16666 case vec_promote_demote
:
16667 case vec_construct
:
16668 case vec_to_scalar
:
16669 case scalar_to_vec
:
16672 ops
->general_ops
+= count
;
16677 case unaligned_load
:
16678 ops
->loads
+= count
;
16679 if (m_vec_flags
|| FLOAT_TYPE_P (aarch64_dr_type (stmt_info
)))
16680 ops
->general_ops
+= base_issue
->fp_simd_load_general_ops
* count
;
16684 case unaligned_store
:
16686 ops
->stores
+= count
;
16687 if (m_vec_flags
|| FLOAT_TYPE_P (aarch64_dr_type (stmt_info
)))
16688 ops
->general_ops
+= base_issue
->fp_simd_store_general_ops
* count
;
16692 /* Add any embedded comparison operations. */
16693 if ((kind
== scalar_stmt
|| kind
== vector_stmt
|| kind
== vec_to_scalar
)
16694 && vect_embedded_comparison_type (stmt_info
))
16695 ops
->general_ops
+= count
;
16697 /* COND_REDUCTIONS need two sets of VEC_COND_EXPRs, whereas so far we
16698 have only accounted for one. */
16699 if ((kind
== vector_stmt
|| kind
== vec_to_scalar
)
16700 && vect_reduc_type (m_vinfo
, stmt_info
) == COND_REDUCTION
)
16701 ops
->general_ops
+= count
;
16703 /* Count the predicate operations needed by an SVE comparison. */
16704 if (sve_issue
&& (kind
== vector_stmt
|| kind
== vec_to_scalar
))
16705 if (tree type
= vect_comparison_type (stmt_info
))
16707 unsigned int base
= (FLOAT_TYPE_P (type
)
16708 ? sve_issue
->fp_cmp_pred_ops
16709 : sve_issue
->int_cmp_pred_ops
);
16710 ops
->pred_ops
+= base
* count
;
16713 /* Add any extra overhead associated with LD[234] and ST[234] operations. */
16715 switch (aarch64_ld234_st234_vectors (kind
, stmt_info
))
16718 ops
->general_ops
+= simd_issue
->ld2_st2_general_ops
* count
;
16722 ops
->general_ops
+= simd_issue
->ld3_st3_general_ops
* count
;
16726 ops
->general_ops
+= simd_issue
->ld4_st4_general_ops
* count
;
16730 /* Add any overhead associated with gather loads and scatter stores. */
16732 && (kind
== scalar_load
|| kind
== scalar_store
)
16733 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info
) == VMAT_GATHER_SCATTER
)
16735 unsigned int pairs
= CEIL (count
, 2);
16736 ops
->pred_ops
+= sve_issue
->gather_scatter_pair_pred_ops
* pairs
;
16737 ops
->general_ops
+= sve_issue
->gather_scatter_pair_general_ops
* pairs
;
/* Return true if STMT_INFO contains a memory access and if the constant
   component of the memory address is aligned to SIZE bytes.  */
static bool
aarch64_aligned_constant_offset_p (stmt_vec_info stmt_info,
				   poly_uint64 size)
{
  if (!STMT_VINFO_DATA_REF (stmt_info))
    return false;

  if (auto first_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info))
    stmt_info = first_stmt;
  tree constant_offset = DR_INIT (STMT_VINFO_DATA_REF (stmt_info));
  /* Needed for gathers & scatters, for example.  */
  if (!constant_offset)
    return false;

  return multiple_p (wi::to_poly_offset (constant_offset), size);
}
16760 /* Check if a scalar or vector stmt could be part of a region of code
16761 that does nothing more than store values to memory, in the scalar
16762 case using STP. Return the cost of the stmt if so, counting 2 for
16763 one instruction. Return ~0U otherwise.
16765 The arguments are a subset of those passed to add_stmt_cost. */
16767 aarch64_stp_sequence_cost (unsigned int count
, vect_cost_for_stmt kind
,
16768 stmt_vec_info stmt_info
, tree vectype
)
16770 /* Code that stores vector constants uses a vector_load to create
16771 the constant. We don't apply the heuristic to that case for two
16774 - At the moment, STPs are only formed via peephole2, and the
16775 constant scalar moves would often come between STRs and so
16776 prevent STP formation.
16778 - The scalar code also has to load the constant somehow, and that
16782 case scalar_to_vec
:
16783 /* Count 2 insns for a GPR->SIMD dup and 1 insn for a FPR->SIMD dup. */
16784 return (FLOAT_TYPE_P (vectype
) ? 2 : 4) * count
;
16786 case vec_construct
:
16787 if (FLOAT_TYPE_P (vectype
))
16788 /* Count 1 insn for the maximum number of FP->SIMD INS
16790 return (vect_nunits_for_cost (vectype
) - 1) * 2 * count
;
16792 /* Count 2 insns for a GPR->SIMD move and 2 insns for the
16793 maximum number of GPR->SIMD INS instructions. */
16794 return vect_nunits_for_cost (vectype
) * 4 * count
;
16797 case unaligned_store
:
16798 /* Count 1 insn per vector if we can't form STP Q pairs. */
16799 if (aarch64_sve_mode_p (TYPE_MODE (vectype
)))
16801 if (aarch64_tune_params
.extra_tuning_flags
16802 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
)
16807 /* Assume we won't be able to use STP if the constant offset
16808 component of the address is misaligned. ??? This could be
16809 removed if we formed STP pairs earlier, rather than relying
16811 auto size
= GET_MODE_SIZE (TYPE_MODE (vectype
));
16812 if (!aarch64_aligned_constant_offset_p (stmt_info
, size
))
16815 return CEIL (count
, 2) * 2;
16818 if (stmt_info
&& STMT_VINFO_DATA_REF (stmt_info
))
16820 /* Check for a mode in which STP pairs can be formed. */
16821 auto size
= GET_MODE_SIZE (TYPE_MODE (aarch64_dr_type (stmt_info
)));
16822 if (maybe_ne (size
, 4) && maybe_ne (size
, 8))
16825 /* Assume we won't be able to use STP if the constant offset
16826 component of the address is misaligned. ??? This could be
16827 removed if we formed STP pairs earlier, rather than relying
16829 if (!aarch64_aligned_constant_offset_p (stmt_info
, size
))
16840 aarch64_vector_costs::add_stmt_cost (int count
, vect_cost_for_stmt kind
,
16841 stmt_vec_info stmt_info
, slp_tree
,
16842 tree vectype
, int misalign
,
16843 vect_cost_model_location where
)
16845 fractional_cost stmt_cost
16846 = aarch64_builtin_vectorization_cost (kind
, vectype
, misalign
);
16848 bool in_inner_loop_p
= (where
== vect_body
16850 && stmt_in_inner_loop_p (m_vinfo
, stmt_info
));
16852 /* Do one-time initialization based on the vinfo. */
16853 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (m_vinfo
);
16854 if (!m_analyzed_vinfo
&& aarch64_use_new_vector_costs_p ())
16857 analyze_loop_vinfo (loop_vinfo
);
16859 m_analyzed_vinfo
= true;
16862 /* Apply the heuristic described above m_stp_sequence_cost. */
16863 if (m_stp_sequence_cost
!= ~0U)
16865 uint64_t cost
= aarch64_stp_sequence_cost (count
, kind
,
16866 stmt_info
, vectype
);
16867 m_stp_sequence_cost
= MIN (m_stp_sequence_cost
+ cost
, ~0U);
16870 /* Try to get a more accurate cost by looking at STMT_INFO instead
16871 of just looking at KIND. */
16872 if (stmt_info
&& aarch64_use_new_vector_costs_p ())
16874 /* If we scalarize a strided store, the vectorizer costs one
16875 vec_to_scalar for each element. However, we can store the first
16876 element using an FP store without a separate extract step. */
16877 if (vect_is_store_elt_extraction (kind
, stmt_info
))
16880 stmt_cost
= aarch64_detect_scalar_stmt_subtype (m_vinfo
, kind
,
16881 stmt_info
, stmt_cost
);
16883 if (vectype
&& m_vec_flags
)
16884 stmt_cost
= aarch64_detect_vector_stmt_subtype (m_vinfo
, kind
,
16885 stmt_info
, vectype
,
16889 /* Do any SVE-specific adjustments to the cost. */
16890 if (stmt_info
&& vectype
&& aarch64_sve_mode_p (TYPE_MODE (vectype
)))
16891 stmt_cost
= aarch64_sve_adjust_stmt_cost (m_vinfo
, kind
, stmt_info
,
16892 vectype
, stmt_cost
);
16894 if (stmt_info
&& aarch64_use_new_vector_costs_p ())
16896 /* Account for any extra "embedded" costs that apply additively
16897 to the base cost calculated above. */
16898 stmt_cost
= aarch64_adjust_stmt_cost (kind
, stmt_info
, vectype
,
16901 /* If we're recording a nonzero vector loop body cost for the
16902 innermost loop, also estimate the operations that would need
16903 to be issued by all relevant implementations of the loop. */
16905 && (m_costing_for_scalar
|| where
== vect_body
)
16906 && (!LOOP_VINFO_LOOP (loop_vinfo
)->inner
|| in_inner_loop_p
)
16908 for (auto &ops
: m_ops
)
16909 count_ops (count
, kind
, stmt_info
, &ops
);
16911 /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic,
16912 estimate the number of statements in the unrolled Advanced SIMD
16913 loop.  For simplicity, we assume that one iteration of the
16914 Advanced SIMD loop would need the same number of statements
16915 as one iteration of the SVE loop. */
16916 if (where
== vect_body
&& m_unrolled_advsimd_niters
)
16917 m_unrolled_advsimd_stmts
+= count
* m_unrolled_advsimd_niters
;
16919 /* Detect the use of an averaging operation. */
16920 gimple
*stmt
= stmt_info
->stmt
;
16921 if (is_gimple_call (stmt
)
16922 && gimple_call_internal_p (stmt
))
16924 switch (gimple_call_internal_fn (stmt
))
16926 case IFN_AVG_FLOOR
:
16934 return record_stmt_cost (stmt_info
, where
, (count
* stmt_cost
).ceil ());
/* Return true if (a) we're applying the Advanced SIMD vs. SVE unrolling
   heuristic described above m_unrolled_advsimd_niters and (b) the heuristic
   says that we should prefer the Advanced SIMD loop.  */
bool
aarch64_vector_costs::prefer_unrolled_loop () const
{
  if (!m_unrolled_advsimd_stmts)
    return false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Number of insns in"
		     " unrolled Advanced SIMD loop = "
		     HOST_WIDE_INT_PRINT_UNSIGNED "\n",
		     m_unrolled_advsimd_stmts);

  /* The balance here is tricky.  On the one hand, we can't be sure whether
     the code is vectorizable with Advanced SIMD or not.  However, even if
     it isn't vectorizable with Advanced SIMD, there's a possibility that
     the scalar code could also be unrolled.  Some of the code might then
     benefit from SLP, or from using LDP and STP.  We therefore apply
     the heuristic regardless of can_use_advsimd_p.  */
  return (m_unrolled_advsimd_stmts
	  && (m_unrolled_advsimd_stmts
	      <= (unsigned int) param_max_completely_peeled_insns));
}
16963 /* Subroutine of adjust_body_cost for handling SVE. Use ISSUE_INFO to work out
16964 how fast the SVE code can be issued and compare it to the equivalent value
16965 for scalar code (SCALAR_CYCLES_PER_ITER). If COULD_USE_ADVSIMD is true,
16966 also compare it to the issue rate of Advanced SIMD code
16967 (ADVSIMD_CYCLES_PER_ITER).
16969 ORIG_BODY_COST is the cost originally passed to adjust_body_cost and
16970 *BODY_COST is the current value of the adjusted cost. *SHOULD_DISPARAGE
16971 is true if we think the loop body is too expensive. */
16974 aarch64_vector_costs::
16975 adjust_body_cost_sve (const aarch64_vec_op_count
*ops
,
16976 fractional_cost scalar_cycles_per_iter
,
16977 unsigned int orig_body_cost
, unsigned int *body_cost
,
16978 bool *should_disparage
)
16980 if (dump_enabled_p ())
16983 fractional_cost sve_pred_cycles_per_iter
= ops
->min_pred_cycles_per_iter ();
16984 fractional_cost sve_cycles_per_iter
= ops
->min_cycles_per_iter ();
16986 /* If the scalar version of the loop could issue at least as
16987 quickly as the predicate parts of the SVE loop, make the SVE loop
16988 prohibitively expensive. In this case vectorization is adding an
16989 overhead that the original scalar code didn't have.
16991 This is mostly intended to detect cases in which WHILELOs dominate
16992 for very tight loops, which is something that normal latency-based
16993 costs would not model. Adding this kind of cliffedge would be
16994 too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
16995 code in the caller handles that case in a more conservative way. */
16996 fractional_cost sve_estimate
= sve_pred_cycles_per_iter
+ 1;
16997 if (scalar_cycles_per_iter
< sve_estimate
)
16999 unsigned int min_cost
17000 = orig_body_cost
* estimated_poly_value (BYTES_PER_SVE_VECTOR
);
17001 if (*body_cost
< min_cost
)
17003 if (dump_enabled_p ())
17004 dump_printf_loc (MSG_NOTE
, vect_location
,
17005 "Increasing body cost to %d because the"
17006 " scalar code could issue within the limit"
17007 " imposed by predicate operations\n",
17009 *body_cost
= min_cost
;
17010 *should_disparage
= true;
17014 return sve_cycles_per_iter
;
unsigned int
aarch64_vector_costs::determine_suggested_unroll_factor ()
{
  bool sve = m_vec_flags & VEC_ANY_SVE;
  /* If we are trying to unroll an Advanced SIMD main loop that contains
     an averaging operation that we do not support with SVE and we might use a
     predicated epilogue, we need to be conservative and block unrolling as
     this might lead to a less optimal loop for the first and only epilogue
     using the original loop's vectorization factor.
     TODO: Remove this constraint when we add support for multiple epilogue
     vectorization.  */
  if (!sve && !TARGET_SVE2 && m_has_avg)
    return 1;

  unsigned int max_unroll_factor = 1;
  for (auto vec_ops : m_ops)
    {
      aarch64_simd_vec_issue_info const *vec_issue
	= vec_ops.simd_issue_info ();
      if (!vec_issue)
	return 1;
      /* Limit unroll factor to a value adjustable by the user, the default
	 value is 4. */
      unsigned int unroll_factor = aarch64_vect_unroll_limit;
      unsigned int factor
	= vec_ops.reduction_latency > 1 ? vec_ops.reduction_latency : 1;
      unsigned int temp;

      /* Sanity check, this should never happen.  */
      if ((vec_ops.stores + vec_ops.loads + vec_ops.general_ops) == 0)
	return 1;

      /* Check stores.  */
      if (vec_ops.stores > 0)
	{
	  temp = CEIL (factor * vec_issue->stores_per_cycle,
		       vec_ops.stores);
	  unroll_factor = MIN (unroll_factor, temp);
	}

      /* Check loads + stores.  */
      if (vec_ops.loads > 0)
	{
	  temp = CEIL (factor * vec_issue->loads_stores_per_cycle,
		       vec_ops.loads + vec_ops.stores);
	  unroll_factor = MIN (unroll_factor, temp);
	}

      /* Check general ops.  */
      if (vec_ops.general_ops > 0)
	{
	  temp = CEIL (factor * vec_issue->general_ops_per_cycle,
		       vec_ops.general_ops);
	  unroll_factor = MIN (unroll_factor, temp);
	}
      max_unroll_factor = MAX (max_unroll_factor, unroll_factor);
    }

  /* Make sure unroll factor is power of 2.  */
  return 1 << ceil_log2 (max_unroll_factor);
}
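
/* Editorial illustration (not part of the original sources): with a
   reduction latency of 4, two vector stores per iteration and an issue
   rate of two stores per cycle, the store check above gives
   CEIL (4 * 2, 2) = 4, so unrolling by up to 4 keeps the stores within
   the latency of the loop-carried reduction; the final result is then
   rounded up to a power of 2 by the ceil_log2 step.  */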
17079 /* BODY_COST is the cost of a vector loop body. Adjust the cost as necessary
17080 and return the new cost. */
17082 aarch64_vector_costs::
17083 adjust_body_cost (loop_vec_info loop_vinfo
,
17084 const aarch64_vector_costs
*scalar_costs
,
17085 unsigned int body_cost
)
17087 if (scalar_costs
->m_ops
.is_empty () || m_ops
.is_empty ())
17090 const auto &scalar_ops
= scalar_costs
->m_ops
[0];
17091 const auto &vector_ops
= m_ops
[0];
17092 unsigned int estimated_vf
= vect_vf_for_cost (loop_vinfo
);
17093 unsigned int orig_body_cost
= body_cost
;
17094 bool should_disparage
= false;
17096 if (dump_enabled_p ())
17097 dump_printf_loc (MSG_NOTE
, vect_location
,
17098 "Original vector body cost = %d\n", body_cost
);
17100 fractional_cost scalar_cycles_per_iter
17101 = scalar_ops
.min_cycles_per_iter () * estimated_vf
;
17103 fractional_cost vector_cycles_per_iter
= vector_ops
.min_cycles_per_iter ();
17105 if (dump_enabled_p ())
17107 if (IN_RANGE (m_num_vector_iterations
, 0, 65536))
17108 dump_printf_loc (MSG_NOTE
, vect_location
,
17109 "Vector loop iterates at most %wd times\n",
17110 m_num_vector_iterations
);
17111 dump_printf_loc (MSG_NOTE
, vect_location
, "Scalar issue estimate:\n");
17112 scalar_ops
.dump ();
17113 dump_printf_loc (MSG_NOTE
, vect_location
,
17114 " estimated cycles per vector iteration"
17115 " (for VF %d) = %f\n",
17116 estimated_vf
, scalar_cycles_per_iter
.as_double ());
17119 if (vector_ops
.sve_issue_info ())
17121 if (dump_enabled_p ())
17122 dump_printf_loc (MSG_NOTE
, vect_location
, "SVE issue estimate:\n");
17123 vector_cycles_per_iter
17124 = adjust_body_cost_sve (&vector_ops
, scalar_cycles_per_iter
,
17125 orig_body_cost
, &body_cost
, &should_disparage
);
17127 if (aarch64_tune_params
.vec_costs
== &neoverse512tvb_vector_cost
)
17129 /* Also take Neoverse V1 tuning into account, doubling the
17130 scalar and Advanced SIMD estimates to account for the
17131 doubling in SVE vector length. */
17132 if (dump_enabled_p ())
17133 dump_printf_loc (MSG_NOTE
, vect_location
,
17134 "Neoverse V1 estimate:\n");
17135 auto vf_factor
= m_ops
[1].vf_factor ();
17136 adjust_body_cost_sve (&m_ops
[1], scalar_cycles_per_iter
* vf_factor
,
17137 orig_body_cost
, &body_cost
, &should_disparage
);
17142 if (dump_enabled_p ())
17144 dump_printf_loc (MSG_NOTE
, vect_location
,
17145 "Vector issue estimate:\n");
17146 vector_ops
.dump ();
17150 /* Decide whether to stick to latency-based costs or whether to try to
17151 take issue rates into account. */
17152 unsigned int threshold
= aarch64_loop_vect_issue_rate_niters
;
17153 if (m_vec_flags
& VEC_ANY_SVE
)
17154 threshold
= CEIL (threshold
, aarch64_estimated_sve_vq ());
17156 if (m_num_vector_iterations
>= 1
17157 && m_num_vector_iterations
< threshold
)
17159 if (dump_enabled_p ())
17160 dump_printf_loc (MSG_NOTE
, vect_location
,
17161 "Low iteration count, so using pure latency"
17164 /* Increase the cost of the vector code if it looks like the scalar code
17165 could issue more quickly. These values are only rough estimates,
17166 so minor differences should only result in minor changes. */
17167 else if (scalar_cycles_per_iter
< vector_cycles_per_iter
)
17169 body_cost
= fractional_cost::scale (body_cost
, vector_cycles_per_iter
,
17170 scalar_cycles_per_iter
);
17171 if (dump_enabled_p ())
17172 dump_printf_loc (MSG_NOTE
, vect_location
,
17173 "Increasing body cost to %d because scalar code"
17174 " would issue more quickly\n", body_cost
);
17176 /* In general, it's expected that the proposed vector code would be able
17177 to issue more quickly than the original scalar code. This should
17178 already be reflected to some extent in the latency-based costs.
17180 However, the latency-based costs effectively assume that the scalar
17181 code and the vector code execute serially, which tends to underplay
17182 one important case: if the real (non-serialized) execution time of
17183 a scalar iteration is dominated by loop-carried dependencies,
17184 and if the vector code is able to reduce both the length of
17185 the loop-carried dependencies *and* the number of cycles needed
17186 to issue the code in general, we can be more confident that the
17187 vector code is an improvement, even if adding the other (non-loop-carried)
17188 latencies tends to hide this saving. We therefore reduce the cost of the
17189 vector loop body in proportion to the saving. */
17190 else if (scalar_ops
.reduction_latency
> vector_ops
.reduction_latency
17191 && scalar_ops
.reduction_latency
== scalar_cycles_per_iter
17192 && scalar_cycles_per_iter
> vector_cycles_per_iter
17193 && !should_disparage
)
17195 body_cost
= fractional_cost::scale (body_cost
, vector_cycles_per_iter
,
17196 scalar_cycles_per_iter
);
17197 if (dump_enabled_p ())
17198 dump_printf_loc (MSG_NOTE
, vect_location
,
17199 "Decreasing body cost to %d account for smaller"
17200 " reduction latency\n", body_cost
);
void
aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
{
  auto *scalar_costs
    = static_cast<const aarch64_vector_costs *> (uncast_scalar_costs);
  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
  if (loop_vinfo
      && m_vec_flags
      && aarch64_use_new_vector_costs_p ())
    {
      m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
					     m_costs[vect_body]);
      m_suggested_unroll_factor = determine_suggested_unroll_factor ();
    }

  /* Apply the heuristic described above m_stp_sequence_cost.  Prefer
     the scalar code in the event of a tie, since there is more chance
     of scalar code being optimized with surrounding operations.  */
  if (!loop_vinfo
      && scalar_costs
      && m_stp_sequence_cost != ~0U
      && m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost)
    m_costs[vect_body] = 2 * scalar_costs->total_cost ();

  vector_costs::finish_cost (scalar_costs);
}
17234 aarch64_vector_costs::
17235 better_main_loop_than_p (const vector_costs
*uncast_other
) const
17237 auto other
= static_cast<const aarch64_vector_costs
*> (uncast_other
);
17239 auto this_loop_vinfo
= as_a
<loop_vec_info
> (this->m_vinfo
);
17240 auto other_loop_vinfo
= as_a
<loop_vec_info
> (other
->m_vinfo
);
17242 if (dump_enabled_p ())
17243 dump_printf_loc (MSG_NOTE
, vect_location
,
17244 "Comparing two main loops (%s at VF %d vs %s at VF %d)\n",
17245 GET_MODE_NAME (this_loop_vinfo
->vector_mode
),
17246 vect_vf_for_cost (this_loop_vinfo
),
17247 GET_MODE_NAME (other_loop_vinfo
->vector_mode
),
17248 vect_vf_for_cost (other_loop_vinfo
));
17250 /* Apply the unrolling heuristic described above
17251 m_unrolled_advsimd_niters. */
17252 if (bool (m_unrolled_advsimd_stmts
)
17253 != bool (other
->m_unrolled_advsimd_stmts
))
17255 bool this_prefer_unrolled
= this->prefer_unrolled_loop ();
17256 bool other_prefer_unrolled
= other
->prefer_unrolled_loop ();
17257 if (this_prefer_unrolled
!= other_prefer_unrolled
)
17259 if (dump_enabled_p ())
17260 dump_printf_loc (MSG_NOTE
, vect_location
,
17261 "Preferring Advanced SIMD loop because"
17262 " it can be unrolled\n");
17263 return other_prefer_unrolled
;
17267 for (unsigned int i
= 0; i
< m_ops
.length (); ++i
)
17269 if (dump_enabled_p ())
17272 dump_printf_loc (MSG_NOTE
, vect_location
,
17273 "Reconsidering with subtuning %d\n", i
);
17274 dump_printf_loc (MSG_NOTE
, vect_location
,
17275 "Issue info for %s loop:\n",
17276 GET_MODE_NAME (this_loop_vinfo
->vector_mode
));
17277 this->m_ops
[i
].dump ();
17278 dump_printf_loc (MSG_NOTE
, vect_location
,
17279 "Issue info for %s loop:\n",
17280 GET_MODE_NAME (other_loop_vinfo
->vector_mode
));
17281 other
->m_ops
[i
].dump ();
17284 auto this_estimated_vf
= (vect_vf_for_cost (this_loop_vinfo
)
17285 * this->m_ops
[i
].vf_factor ());
17286 auto other_estimated_vf
= (vect_vf_for_cost (other_loop_vinfo
)
17287 * other
->m_ops
[i
].vf_factor ());
17289 /* If it appears that one loop could process the same amount of data
17290 in fewer cycles, prefer that loop over the other one. */
17291 fractional_cost this_cost
17292 = this->m_ops
[i
].min_cycles_per_iter () * other_estimated_vf
;
17293 fractional_cost other_cost
17294 = other
->m_ops
[i
].min_cycles_per_iter () * this_estimated_vf
;
17295 if (dump_enabled_p ())
17297 dump_printf_loc (MSG_NOTE
, vect_location
,
17298 "Weighted cycles per iteration of %s loop ~= %f\n",
17299 GET_MODE_NAME (this_loop_vinfo
->vector_mode
),
17300 this_cost
.as_double ());
17301 dump_printf_loc (MSG_NOTE
, vect_location
,
17302 "Weighted cycles per iteration of %s loop ~= %f\n",
17303 GET_MODE_NAME (other_loop_vinfo
->vector_mode
),
17304 other_cost
.as_double ());
17306 if (this_cost
!= other_cost
)
17308 if (dump_enabled_p ())
17309 dump_printf_loc (MSG_NOTE
, vect_location
,
17310 "Preferring loop with lower cycles"
17311 " per iteration\n");
17312 return this_cost
< other_cost
;
17315 /* If the issue rate of SVE code is limited by predicate operations
17316 (i.e. if sve_pred_cycles_per_iter > sve_nonpred_cycles_per_iter),
17317 and if Advanced SIMD code could issue within the limit imposed
17318 by the predicate operations, the predicate operations are adding an
17319 overhead that the original code didn't have and so we should prefer
17320 the Advanced SIMD version. */
17321 auto better_pred_limit_p
= [](const aarch64_vec_op_count
&a
,
17322 const aarch64_vec_op_count
&b
) -> bool
17324 if (a
.pred_ops
== 0
17325 && (b
.min_pred_cycles_per_iter ()
17326 > b
.min_nonpred_cycles_per_iter ()))
17328 if (dump_enabled_p ())
17329 dump_printf_loc (MSG_NOTE
, vect_location
,
17330 "Preferring Advanced SIMD loop since"
17331 " SVE loop is predicate-limited\n");
17336 if (better_pred_limit_p (this->m_ops
[i
], other
->m_ops
[i
]))
17338 if (better_pred_limit_p (other
->m_ops
[i
], this->m_ops
[i
]))
17342 return vector_costs::better_main_loop_than_p (other
);
static void initialize_aarch64_code_model (struct gcc_options *);
17347 /* Parse the TO_PARSE string and put the architecture struct that it
17348 selects into RES and the architectural features into ISA_FLAGS.
17349 Return an aarch64_parse_opt_result describing the parse result.
17350 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
17351 When the TO_PARSE string contains an invalid extension,
17352 a copy of the string is created and stored to INVALID_EXTENSION. */
17354 static enum aarch64_parse_opt_result
17355 aarch64_parse_arch (const char *to_parse
, const struct processor
**res
,
17356 aarch64_feature_flags
*isa_flags
,
17357 std::string
*invalid_extension
)
17360 const struct processor
*arch
;
17363 ext
= strchr (to_parse
, '+');
17366 len
= ext
- to_parse
;
17368 len
= strlen (to_parse
);
17371 return AARCH64_PARSE_MISSING_ARG
;
17374 /* Loop through the list of supported ARCHes to find a match. */
17375 for (arch
= all_architectures
; arch
->name
!= NULL
; arch
++)
17377 if (strlen (arch
->name
) == len
17378 && strncmp (arch
->name
, to_parse
, len
) == 0)
17380 auto isa_temp
= arch
->flags
;
17384 /* TO_PARSE string contains at least one extension. */
17385 enum aarch64_parse_opt_result ext_res
17386 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
17388 if (ext_res
!= AARCH64_PARSE_OK
)
17391 /* Extension parsing was successful. Confirm the result
17392 arch and ISA flags. */
17394 *isa_flags
= isa_temp
;
17395 return AARCH64_PARSE_OK
;
17399 /* ARCH name not found in list. */
17400 return AARCH64_PARSE_INVALID_ARG
;
17403 /* Parse the TO_PARSE string and put the result tuning in RES and the
17404 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
17405 describing the parse result. If there is an error parsing, RES and
17406 ISA_FLAGS are left unchanged.
17407 When the TO_PARSE string contains an invalid extension,
17408 a copy of the string is created and stored to INVALID_EXTENSION. */
17410 static enum aarch64_parse_opt_result
17411 aarch64_parse_cpu (const char *to_parse
, const struct processor
**res
,
17412 aarch64_feature_flags
*isa_flags
,
17413 std::string
*invalid_extension
)
17416 const struct processor
*cpu
;
17419 ext
= strchr (to_parse
, '+');
17422 len
= ext
- to_parse
;
17424 len
= strlen (to_parse
);
17427 return AARCH64_PARSE_MISSING_ARG
;
17430 /* Loop through the list of supported CPUs to find a match. */
17431 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
17433 if (strlen (cpu
->name
) == len
&& strncmp (cpu
->name
, to_parse
, len
) == 0)
17435 auto isa_temp
= cpu
->flags
;
17439 /* TO_PARSE string contains at least one extension. */
17440 enum aarch64_parse_opt_result ext_res
17441 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
17443 if (ext_res
!= AARCH64_PARSE_OK
)
17446 /* Extension parsing was successful.  Confirm the result
17447 cpu and ISA flags. */
17449 *isa_flags
= isa_temp
;
17450 return AARCH64_PARSE_OK
;
17454 /* CPU name not found in list. */
17455 return AARCH64_PARSE_INVALID_ARG
;
/* Parse the TO_PARSE string and put the cpu it selects into RES.
   Return an aarch64_parse_opt_result describing the parse result.
   If the parsing fails the RES does not change.  */

static enum aarch64_parse_opt_result
aarch64_parse_tune (const char *to_parse, const struct processor **res)
{
  const struct processor *cpu;

  /* Loop through the list of supported CPUs to find a match.  */
  for (cpu = all_cores; cpu->name != NULL; cpu++)
    {
      if (strcmp (cpu->name, to_parse) == 0)
	{
	  *res = cpu;
	  return AARCH64_PARSE_OK;
	}
    }

  /* CPU name not found in list.  */
  return AARCH64_PARSE_INVALID_ARG;
}
17481 /* Parse TOKEN, which has length LENGTH to see if it is an option
17482 described in FLAG. If it is, return the index bit for that fusion type.
17483 If not, error (printing OPTION_NAME) and return zero. */
17485 static unsigned int
17486 aarch64_parse_one_option_token (const char *token
,
17488 const struct aarch64_flag_desc
*flag
,
17489 const char *option_name
)
17491 for (; flag
->name
!= NULL
; flag
++)
17493 if (length
== strlen (flag
->name
)
17494 && !strncmp (flag
->name
, token
, length
))
17498 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name
, token
);
17502 /* Parse OPTION which is a comma-separated list of flags to enable.
17503 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
17504 default state we inherit from the CPU tuning structures. OPTION_NAME
17505 gives the top-level option we are parsing in the -moverride string,
17506 for use in error messages. */
17508 static unsigned int
17509 aarch64_parse_boolean_options (const char *option
,
17510 const struct aarch64_flag_desc
*flags
,
17511 unsigned int initial_state
,
17512 const char *option_name
)
17514 const char separator
= '.';
17515 const char* specs
= option
;
17516 const char* ntoken
= option
;
17517 unsigned int found_flags
= initial_state
;
17519 while ((ntoken
= strchr (specs
, separator
)))
17521 size_t token_length
= ntoken
- specs
;
17522 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
17526 /* If we find "none" (or, for simplicity's sake, an error) anywhere
17527 in the token stream, reset the supported operations. So:
17529 adrp+add.cmp+branch.none.adrp+add
17531 would have the result of turning on only adrp+add fusion. */
17535 found_flags
|= token_ops
;
17539 /* We ended with a comma, print something. */
17542 error ("%qs string ill-formed", option_name
);
17546 /* We still have one more token to parse. */
17547 size_t token_length
= strlen (specs
);
17548 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
17555 found_flags
|= token_ops
;
17556 return found_flags
;
17559 /* Support for overriding instruction fusion. */
17562 aarch64_parse_fuse_string (const char *fuse_string
,
17563 struct tune_params
*tune
)
17565 tune
->fusible_ops
= aarch64_parse_boolean_options (fuse_string
,
17566 aarch64_fusible_pairs
,
17571 /* Support for overriding other tuning flags. */
17574 aarch64_parse_tune_string (const char *tune_string
,
17575 struct tune_params
*tune
)
17577 tune
->extra_tuning_flags
17578 = aarch64_parse_boolean_options (tune_string
,
17579 aarch64_tuning_flags
,
17580 tune
->extra_tuning_flags
,
17584 /* Parse the sve_width tuning moverride string in TUNE_STRING.
17585 Accept the valid SVE vector widths allowed by
17586 aarch64_sve_vector_bits_enum and use it to override sve_width
17590 aarch64_parse_sve_width_string (const char *tune_string
,
17591 struct tune_params
*tune
)
17595 int n
= sscanf (tune_string
, "%d", &width
);
17598 error ("invalid format for %<sve_width%>");
17610 error ("invalid %<sve_width%> value: %d", width
);
17612 tune
->sve_width
= (enum aarch64_sve_vector_bits_enum
) width
;
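
/* Editorial usage note (not part of the original sources): this parser
   handles tuning override strings such as -moverride=sve_width=256, where
   the accepted values are the SVE vector widths allowed by
   aarch64_sve_vector_bits_enum (for example 128, 256, 512, 1024 or 2048).  */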
17615 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
17616 we understand. If it is, extract the option string and handoff to
17617 the appropriate function. */
17620 aarch64_parse_one_override_token (const char* token
,
17622 struct tune_params
*tune
)
17624 const struct aarch64_tuning_override_function
*fn
17625 = aarch64_tuning_override_functions
;
17627 const char *option_part
= strchr (token
, '=');
17630 error ("tuning string missing in option (%s)", token
);
17634 /* Get the length of the option name. */
17635 length
= option_part
- token
;
17636 /* Skip the '=' to get to the option string. */
17639 for (; fn
->name
!= NULL
; fn
++)
17641 if (!strncmp (fn
->name
, token
, length
))
17643 fn
->parse_override (option_part
, tune
);
17648 error ("unknown tuning option (%s)",token
);
17652 /* A checking mechanism for the implementation of the tls size. */
17655 initialize_aarch64_tls_size (struct gcc_options
*opts
)
17657 if (aarch64_tls_size
== 0)
17658 aarch64_tls_size
= 24;
17660 switch (opts
->x_aarch64_cmodel_var
)
17662 case AARCH64_CMODEL_TINY
:
17663 /* Both the default and maximum TLS size allowed under tiny is 1M which
17664 needs two instructions to address, so we clamp the size to 24. */
17665 if (aarch64_tls_size
> 24)
17666 aarch64_tls_size
= 24;
17668 case AARCH64_CMODEL_SMALL
:
17669 /* The maximum TLS size allowed under small is 4G. */
17670 if (aarch64_tls_size
> 32)
17671 aarch64_tls_size
= 32;
17673 case AARCH64_CMODEL_LARGE
:
17674 /* The maximum TLS size allowed under large is 16E.
17675 FIXME: 16E should be 64bit, we only support 48bit offset now. */
17676 if (aarch64_tls_size
> 48)
17677 aarch64_tls_size
= 48;
17680 gcc_unreachable ();
/* Return the CPU corresponding to the enum CPU.  */

static const struct processor *
aarch64_get_tune_cpu (enum aarch64_processor cpu)
{
  gcc_assert (cpu != aarch64_none);

  return &all_cores[cpu];
}

/* Return the architecture corresponding to the enum ARCH.  */

static const struct processor *
aarch64_get_arch (enum aarch64_arch arch)
{
  gcc_assert (arch != aarch64_no_arch);

  return &all_architectures[arch];
}
17706 /* Parse STRING looking for options in the format:
17707 string :: option:string
17708 option :: name=substring
17710 substring :: defined by option. */
17713 aarch64_parse_override_string (const char* input_string
,
17714 struct tune_params
* tune
)
17716 const char separator
= ':';
17717 size_t string_length
= strlen (input_string
) + 1;
17718 char *string_root
= (char *) xmalloc (sizeof (*string_root
) * string_length
);
17719 char *string
= string_root
;
17720 strncpy (string
, input_string
, string_length
);
17721 string
[string_length
- 1] = '\0';
17723 char* ntoken
= string
;
17725 while ((ntoken
= strchr (string
, separator
)))
17727 size_t token_length
= ntoken
- string
;
17728 /* Make this substring look like a string. */
17730 aarch64_parse_one_override_token (string
, token_length
, tune
);
17734 /* One last option to parse. */
17735 aarch64_parse_one_override_token (string
, strlen (string
), tune
);
17736 free (string_root
);
/* Adjust CURRENT_TUNE (a generic tuning struct) with settings that
   are best for a generic target with the currently-enabled architecture
   extensions.  */
static void
aarch64_adjust_generic_arch_tuning (struct tune_params &current_tune)
{
  /* Neoverse V1 is the only core that is known to benefit from
     AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS.  There is therefore no
     point enabling it for SVE2 and above.  */
  if (TARGET_SVE2)
    current_tune.extra_tuning_flags
      &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS;
}
static void
aarch64_override_options_after_change_1 (struct gcc_options *opts)
{
  if (accepted_branch_protection_string)
    opts->x_aarch64_branch_protection_string
      = xstrdup (accepted_branch_protection_string);

  /* PR 70044: We have to be careful about being called multiple times for the
     same function.  This means all changes should be repeatable.  */

  /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
     Disable the frame pointer flag so the mid-end will not use a frame
     pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
     Set x_flag_omit_frame_pointer to the special value 2 to differentiate
     between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2).  */
  aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
  if (opts->x_flag_omit_frame_pointer == 0)
    opts->x_flag_omit_frame_pointer = 2;

  /* If not optimizing for size, set the default
     alignment to what the target wants.  */
  if (!opts->x_optimize_size)
    {
      if (opts->x_flag_align_loops && !opts->x_str_align_loops)
	opts->x_str_align_loops = aarch64_tune_params.loop_align;
      if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
	opts->x_str_align_jumps = aarch64_tune_params.jump_align;
      if (opts->x_flag_align_functions && !opts->x_str_align_functions)
	opts->x_str_align_functions = aarch64_tune_params.function_align;
    }

  /* We default to no pc-relative literal loads.  */

  aarch64_pcrelative_literal_loads = false;

  /* If -mpc-relative-literal-loads is set on the command line, this
     implies that the user asked for PC relative literal loads.  */
  if (opts->x_pcrelative_literal_loads == 1)
    aarch64_pcrelative_literal_loads = true;

  /* In the tiny memory model it makes no sense to disallow PC relative
     literal pool loads.  */
  if (aarch64_cmodel == AARCH64_CMODEL_TINY
      || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
    aarch64_pcrelative_literal_loads = true;

  /* When enabling the lower precision Newton series for the square root, also
     enable it for the reciprocal square root, since the latter is an
     intermediary step for the former.  */
  if (flag_mlow_precision_sqrt)
    flag_mrecip_low_precision_sqrt = true;
}
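/* Worked example (illustrative): -fno-omit-frame-pointer arrives here with
   x_flag_omit_frame_pointer == 0, so aarch64_use_frame_pointer becomes true
   and the flag is rewritten to the sentinel value 2; plain
   -fomit-frame-pointer (value 1) keeps aarch64_use_frame_pointer false and
   is left untouched.  */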
/* 'Unpack' up the internal tuning structs and update the options
   in OPTS.  The caller must have set up selected_tune and selected_arch
   as all the other target-specific codegen decisions are
   derived from them.  */

void
aarch64_override_options_internal (struct gcc_options *opts)
{
  const struct processor *tune = aarch64_get_tune_cpu (opts->x_selected_tune);
  aarch64_tune_flags = tune->flags;
  aarch64_tune = tune->sched_core;
  /* Make a copy of the tuning parameters attached to the core, which
     we may later overwrite.  */
  aarch64_tune_params = *(tune->tune);
  if (tune->tune == &generic_tunings)
    aarch64_adjust_generic_arch_tuning (aarch64_tune_params);

  if (opts->x_aarch64_override_tune_string)
    aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
				   &aarch64_tune_params);

  /* This target defaults to strict volatile bitfields.  */
  if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
    opts->x_flag_strict_volatile_bitfields = 1;

  if (aarch64_stack_protector_guard == SSP_GLOBAL
      && opts->x_aarch64_stack_protector_guard_offset_str)
    error ("incompatible options %<-mstack-protector-guard=global%> and "
	   "%<-mstack-protector-guard-offset=%s%>",
	   aarch64_stack_protector_guard_offset_str);

  if (aarch64_stack_protector_guard == SSP_SYSREG
      && !(opts->x_aarch64_stack_protector_guard_offset_str
	   && opts->x_aarch64_stack_protector_guard_reg_str))
    error ("both %<-mstack-protector-guard-offset%> and "
	   "%<-mstack-protector-guard-reg%> must be used "
	   "with %<-mstack-protector-guard=sysreg%>");

  if (opts->x_aarch64_stack_protector_guard_reg_str)
    {
      if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
	error ("specify a system register with a small string length");
    }

  if (opts->x_aarch64_stack_protector_guard_offset_str)
    {
      char *end;
      const char *str = aarch64_stack_protector_guard_offset_str;
      errno = 0;
      long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
      if (!*str || *end || errno)
	error ("%qs is not a valid offset in %qs", str,
	       "-mstack-protector-guard-offset=");
      aarch64_stack_protector_guard_offset = offs;
    }

  if ((flag_sanitize & SANITIZE_SHADOW_CALL_STACK)
      && !fixed_regs[R18_REGNUM])
    error ("%<-fsanitize=shadow-call-stack%> requires %<-ffixed-x18%>");

  initialize_aarch64_code_model (opts);
  initialize_aarch64_tls_size (opts);

  int queue_depth = 0;
  switch (aarch64_tune_params.autoprefetcher_model)
    {
    case tune_params::AUTOPREFETCHER_OFF:
      queue_depth = -1;
      break;
    case tune_params::AUTOPREFETCHER_WEAK:
      queue_depth = 0;
      break;
    case tune_params::AUTOPREFETCHER_STRONG:
      queue_depth = max_insn_queue_index + 1;
      break;
    default:
      gcc_unreachable ();
    }

  /* We don't mind passing in global_options_set here as we don't use
     the *options_set structs anyway.  */
  SET_OPTION_IF_UNSET (opts, &global_options_set,
		       param_sched_autopref_queue_depth, queue_depth);

  /* If using Advanced SIMD only for autovectorization disable SVE vector costs
     comparison.  */
  if (aarch64_autovec_preference == 1)
    SET_OPTION_IF_UNSET (opts, &global_options_set,
			 aarch64_sve_compare_costs, 0);

  /* Set up parameters to be used in prefetching algorithm.  Do not
     override the defaults unless we are tuning for a core we have
     researched values for.  */
  if (aarch64_tune_params.prefetch->num_slots > 0)
    SET_OPTION_IF_UNSET (opts, &global_options_set,
			 param_simultaneous_prefetches,
			 aarch64_tune_params.prefetch->num_slots);
  if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
    SET_OPTION_IF_UNSET (opts, &global_options_set,
			 param_l1_cache_size,
			 aarch64_tune_params.prefetch->l1_cache_size);
  if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
    SET_OPTION_IF_UNSET (opts, &global_options_set,
			 param_l1_cache_line_size,
			 aarch64_tune_params.prefetch->l1_cache_line_size);

  if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
    {
      SET_OPTION_IF_UNSET (opts, &global_options_set,
			   param_destruct_interfere_size,
			   aarch64_tune_params.prefetch->l1_cache_line_size);
      SET_OPTION_IF_UNSET (opts, &global_options_set,
			   param_construct_interfere_size,
			   aarch64_tune_params.prefetch->l1_cache_line_size);
    }
  else
    {
      /* For a generic AArch64 target, cover the current range of cache line
	 sizes.  */
      SET_OPTION_IF_UNSET (opts, &global_options_set,
			   param_destruct_interfere_size,
			   256);
      SET_OPTION_IF_UNSET (opts, &global_options_set,
			   param_construct_interfere_size,
			   64);
    }

  if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
    SET_OPTION_IF_UNSET (opts, &global_options_set,
			 param_l2_cache_size,
			 aarch64_tune_params.prefetch->l2_cache_size);
  if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
    SET_OPTION_IF_UNSET (opts, &global_options_set,
			 param_prefetch_dynamic_strides, 0);
  if (aarch64_tune_params.prefetch->minimum_stride >= 0)
    SET_OPTION_IF_UNSET (opts, &global_options_set,
			 param_prefetch_minimum_stride,
			 aarch64_tune_params.prefetch->minimum_stride);

  /* Use the alternative scheduling-pressure algorithm by default.  */
  SET_OPTION_IF_UNSET (opts, &global_options_set,
		       param_sched_pressure_algorithm,
		       SCHED_PRESSURE_MODEL);

  /* Validate the guard size.  */
  int guard_size = param_stack_clash_protection_guard_size;

  if (guard_size != 12 && guard_size != 16)
    error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
	   "size.  Given value %d (%llu KB) is out of range",
	   guard_size, (1ULL << guard_size) / 1024ULL);

  /* Enforce that interval is the same size as size so the mid-end does the
     right thing.  */
  SET_OPTION_IF_UNSET (opts, &global_options_set,
		       param_stack_clash_protection_probe_interval,
		       guard_size);

  /* The maybe_set calls won't update the value if the user has explicitly set
     one.  Which means we need to validate that probing interval and guard size
     are equal.  */
  int probe_interval
    = param_stack_clash_protection_probe_interval;
  if (guard_size != probe_interval)
    error ("stack clash guard size %<%d%> must be equal to probing interval "
	   "%<%d%>", guard_size, probe_interval);

  /* Enable sw prefetching at specified optimization level for
     CPUS that have prefetch.  Lower optimization level threshold by 1
     when profiling is enabled.  */
  if (opts->x_flag_prefetch_loop_arrays < 0
      && !opts->x_optimize_size
      && aarch64_tune_params.prefetch->default_opt_level >= 0
      && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
    opts->x_flag_prefetch_loop_arrays = 1;

  aarch64_override_options_after_change_1 (opts);
}
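/* Illustrative example: with --param=stack-clash-protection-guard-size=16 and
   the probe interval left unset, the SET_OPTION_IF_UNSET call above copies
   the guard size into the probe interval, so both describe 2^16 == 64 KB and
   the equality check that follows does not report an error.  */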
/* Print a hint with a suggestion for a core or architecture name that
   most closely resembles what the user passed in STR.  ARCH is true if
   the user is asking for an architecture name.  ARCH is false if the user
   is asking for a core name.  */

static void
aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
{
  auto_vec<const char *> candidates;
  const struct processor *entry = arch ? all_architectures : all_cores;
  for (; entry->name != NULL; entry++)
    candidates.safe_push (entry->name);

#ifdef HAVE_LOCAL_CPU_DETECT
  /* Add also "native" as possible value.  */
  candidates.safe_push ("native");
#endif

  char *s;
  const char *hint = candidates_list_and_hint (str, s, candidates);
  if (hint)
    inform (input_location, "valid arguments are: %s;"
	    " did you mean %qs?", s, hint);
  else
    inform (input_location, "valid arguments are: %s", s);

  XDELETEVEC (s);
}
/* Print a hint with a suggestion for a core name that most closely resembles
   what the user passed in STR.  */

static void
aarch64_print_hint_for_core (const char *str)
{
  aarch64_print_hint_for_core_or_arch (str, false);
}

/* Print a hint with a suggestion for an architecture name that most closely
   resembles what the user passed in STR.  */

static void
aarch64_print_hint_for_arch (const char *str)
{
  aarch64_print_hint_for_core_or_arch (str, true);
}

/* Print a hint with a suggestion for an extension name
   that most closely resembles what the user passed in STR.  */

void
aarch64_print_hint_for_extensions (const std::string &str)
{
  auto_vec<const char *> candidates;
  aarch64_get_all_extension_candidates (&candidates);
  char *s;
  const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
  if (hint)
    inform (input_location, "valid arguments are: %s;"
	    " did you mean %qs?", s, hint);
  else
    inform (input_location, "valid arguments are: %s", s);

  XDELETEVEC (s);
}
/* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
   specified in STR and throw errors if appropriate.  Put the results if
   they are valid in RES and ISA_FLAGS.  Return whether the option is
   valid.  */

static bool
aarch64_validate_mcpu (const char *str, const struct processor **res,
		       aarch64_feature_flags *isa_flags)
{
  std::string invalid_extension;
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);

  if (parse_res == AARCH64_PARSE_OK)
    return true;

  switch (parse_res)
    {
    case AARCH64_PARSE_MISSING_ARG:
      error ("missing cpu name in %<-mcpu=%s%>", str);
      break;
    case AARCH64_PARSE_INVALID_ARG:
      error ("unknown value %qs for %<-mcpu%>", str);
      aarch64_print_hint_for_core (str);
      break;
    case AARCH64_PARSE_INVALID_FEATURE:
      error ("invalid feature modifier %qs in %<-mcpu=%s%>",
	     invalid_extension.c_str (), str);
      aarch64_print_hint_for_extensions (invalid_extension);
      break;
    default:
      gcc_unreachable ();
    }

  return false;
}
/* Straight line speculation indicators.  */
enum aarch64_sls_hardening_type
{
  SLS_NONE = 0,
  SLS_RETBR = 1,
  SLS_BLR = 2,
  SLS_ALL = 3
};
static enum aarch64_sls_hardening_type aarch64_sls_hardening;

/* Return whether we should mitigate Straight Line Speculation for the RET
   and BR instructions.  */
bool
aarch64_harden_sls_retbr_p (void)
{
  return aarch64_sls_hardening & SLS_RETBR;
}

/* Return whether we should mitigate Straight Line Speculation for the BLR
   instruction.  */
bool
aarch64_harden_sls_blr_p (void)
{
  return aarch64_sls_hardening & SLS_BLR;
}

/* As of yet we only allow setting these options globally, in the future we may
   allow setting them per function.  */
static void
aarch64_validate_sls_mitigation (const char *const_str)
{
  char *token_save = NULL;
  char *str = NULL;

  if (strcmp (const_str, "none") == 0)
    {
      aarch64_sls_hardening = SLS_NONE;
      return;
    }
  if (strcmp (const_str, "all") == 0)
    {
      aarch64_sls_hardening = SLS_ALL;
      return;
    }

  char *str_root = xstrdup (const_str);
  str = strtok_r (str_root, ",", &token_save);
  if (!str)
    error ("invalid argument given to %<-mharden-sls=%>");

  int temp = SLS_NONE;
  while (str)
    {
      if (strcmp (str, "blr") == 0)
	temp |= SLS_BLR;
      else if (strcmp (str, "retbr") == 0)
	temp |= SLS_RETBR;
      else if (strcmp (str, "none") == 0 || strcmp (str, "all") == 0)
	{
	  error ("%qs must be by itself for %<-mharden-sls=%>", str);
	  break;
	}
      else
	{
	  error ("invalid argument %<%s%> for %<-mharden-sls=%>", str);
	  break;
	}
      str = strtok_r (NULL, ",", &token_save);
    }
  aarch64_sls_hardening = (aarch64_sls_hardening_type) temp;
  free (str_root);
}
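/* Illustrative example: -mharden-sls=retbr,blr walks the loop above twice,
   accumulating SLS_RETBR | SLS_BLR in TEMP, which is the same hardening set
   as spelling the option -mharden-sls=all.  */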
/* Parses CONST_STR for branch protection features specified in
   aarch64_branch_protect_types, and set any global variables required.  Returns
   the parsing result and assigns LAST_STR to the last processed token from
   CONST_STR so that it can be used for error reporting.  */

static enum aarch64_parse_opt_result
aarch64_parse_branch_protection (const char *const_str, char** last_str)
{
  char *str_root = xstrdup (const_str);
  char* token_save = NULL;
  char *str = strtok_r (str_root, "+", &token_save);
  enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
  if (!str)
    res = AARCH64_PARSE_MISSING_ARG;
  else
    {
      char *next_str = strtok_r (NULL, "+", &token_save);
      /* Reset the branch protection features to their defaults.  */
      aarch64_handle_no_branch_protection (NULL, NULL);

      while (str && res == AARCH64_PARSE_OK)
	{
	  const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
	  bool found = false;
	  /* Search for this type.  */
	  while (type && type->name && !found && res == AARCH64_PARSE_OK)
	    {
	      if (strcmp (str, type->name) == 0)
		{
		  found = true;
		  res = type->handler (str, next_str);
		  str = next_str;
		  next_str = strtok_r (NULL, "+", &token_save);
		}
	      else
		type++;
	    }
	  if (found && res == AARCH64_PARSE_OK)
	    {
	      bool found_subtype = true;
	      /* Loop through each token until we find one that isn't a
		 subtype.  */
	      while (found_subtype)
		{
		  found_subtype = false;
		  const aarch64_branch_protect_type *subtype = type->subtypes;
		  /* Search for the subtype.  */
		  while (str && subtype && subtype->name && !found_subtype
			 && res == AARCH64_PARSE_OK)
		    {
		      if (strcmp (str, subtype->name) == 0)
			{
			  found_subtype = true;
			  res = subtype->handler (str, next_str);
			  str = next_str;
			  next_str = strtok_r (NULL, "+", &token_save);
			}
		      subtype++;
		    }
		}
	    }
	  else if (!found)
	    res = AARCH64_PARSE_INVALID_ARG;
	}
    }

  /* Copy the last processed token into the argument to pass it back.
     Used by option and attribute validation to print the offending token.  */
  if (last_str)
    {
      if (str) strcpy (*last_str, str);
      else *last_str = NULL;
    }

  if (res == AARCH64_PARSE_OK)
    {
      /* If needed, alloc the accepted string then copy in const_str.
	 Used by override_option_after_change_1.  */
      if (!accepted_branch_protection_string)
	accepted_branch_protection_string
	  = (char *) xmalloc (BRANCH_PROTECT_STR_MAX + 1);
      strncpy (accepted_branch_protection_string, const_str,
	       BRANCH_PROTECT_STR_MAX + 1);
      /* Forcibly null-terminate.  */
      accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
    }

  return res;
}
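/* Illustrative example: "-mbranch-protection=pac-ret+leaf" is tokenised at
   '+'; "pac-ret" matches a top-level entry in aarch64_branch_protect_types,
   after which "leaf" is consumed by the subtype loop, and on success the
   whole input string is cached in accepted_branch_protection_string.  */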
static bool
aarch64_validate_mbranch_protection (const char *const_str)
{
  char *str = (char *) xmalloc (strlen (const_str));
  enum aarch64_parse_opt_result res =
    aarch64_parse_branch_protection (const_str, &str);
  if (res == AARCH64_PARSE_INVALID_ARG)
    error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
  else if (res == AARCH64_PARSE_MISSING_ARG)
    error ("missing argument for %<-mbranch-protection=%>");
  free (str);
  return res == AARCH64_PARSE_OK;
}
/* Validate a command-line -march option.  Parse the arch and extensions
   (if any) specified in STR and throw errors if appropriate.  Put the
   results, if they are valid, in RES and ISA_FLAGS.  Return whether the
   option is valid.  */

static bool
aarch64_validate_march (const char *str, const struct processor **res,
			aarch64_feature_flags *isa_flags)
{
  std::string invalid_extension;
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);

  if (parse_res == AARCH64_PARSE_OK)
    return true;

  switch (parse_res)
    {
    case AARCH64_PARSE_MISSING_ARG:
      error ("missing arch name in %<-march=%s%>", str);
      break;
    case AARCH64_PARSE_INVALID_ARG:
      error ("unknown value %qs for %<-march%>", str);
      aarch64_print_hint_for_arch (str);
      /* A common user error is confusing -march and -mcpu.
	 If the -march string matches a known CPU suggest -mcpu.  */
      parse_res = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
      if (parse_res == AARCH64_PARSE_OK)
	inform (input_location, "did you mean %<-mcpu=%s%>?", str);
      break;
    case AARCH64_PARSE_INVALID_FEATURE:
      error ("invalid feature modifier %qs in %<-march=%s%>",
	     invalid_extension.c_str (), str);
      aarch64_print_hint_for_extensions (invalid_extension);
      break;
    default:
      gcc_unreachable ();
    }

  return false;
}
/* Validate a command-line -mtune option.  Parse the cpu
   specified in STR and throw errors if appropriate.  Put the
   result, if it is valid, in RES.  Return whether the option is
   valid.  */

static bool
aarch64_validate_mtune (const char *str, const struct processor **res)
{
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_tune (str, res);

  if (parse_res == AARCH64_PARSE_OK)
    return true;

  switch (parse_res)
    {
    case AARCH64_PARSE_MISSING_ARG:
      error ("missing cpu name in %<-mtune=%s%>", str);
      break;
    case AARCH64_PARSE_INVALID_ARG:
      error ("unknown value %qs for %<-mtune%>", str);
      aarch64_print_hint_for_core (str);
      break;
    default:
      gcc_unreachable ();
    }
  return false;
}
/* Return the VG value associated with -msve-vector-bits= value VALUE.  */

static poly_uint16
aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
{
  /* 128-bit SVE and Advanced SIMD modes use different register layouts
     on big-endian targets, so we would need to forbid subregs that convert
     from one to the other.  By default a reinterpret sequence would then
     involve a store to memory in one mode and a load back in the other.
     Even if we optimize that sequence using reverse instructions,
     it would still be a significant potential overhead.

     For now, it seems better to generate length-agnostic code for that
     case instead.  */
  if (value == SVE_SCALABLE
      || (value == SVE_128 && BYTES_BIG_ENDIAN))
    return poly_uint16 (2, 2);

  return (int) value / 64;
}
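/* Worked example: -msve-vector-bits=256 gives VALUE == SVE_256 == 256, so the
   function returns 256 / 64 == 4 granules (VG); -msve-vector-bits=scalable
   (and 128-bit SVE on big-endian targets) yields poly_uint16 (2, 2) instead,
   keeping the generated code length-agnostic.  */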
/* Set the global aarch64_asm_isa_flags to FLAGS and update
   aarch64_isa_flags accordingly.  */

static void
aarch64_set_asm_isa_flags (aarch64_feature_flags flags)
{
  aarch64_set_asm_isa_flags (&global_options, flags);
}
/* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
   and is used to parse the -m{cpu,tune,arch} strings and setup the initial
   tuning structs.  In particular it must set selected_tune and
   aarch64_asm_isa_flags that define the available ISA features and tuning
   decisions.  It must also set selected_arch as this will be used to
   output the .arch asm tags for each function.  */

static void
aarch64_override_options (void)
{
  aarch64_feature_flags cpu_isa = 0;
  aarch64_feature_flags arch_isa = 0;
  aarch64_set_asm_isa_flags (0);

  const struct processor *cpu = NULL;
  const struct processor *arch = NULL;
  const struct processor *tune = NULL;

  if (aarch64_harden_sls_string)
    aarch64_validate_sls_mitigation (aarch64_harden_sls_string);

  if (aarch64_branch_protection_string)
    aarch64_validate_mbranch_protection (aarch64_branch_protection_string);

  /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
     If either of -march or -mtune is given, they override their
     respective component of -mcpu.  */
  if (aarch64_cpu_string)
    aarch64_validate_mcpu (aarch64_cpu_string, &cpu, &cpu_isa);

  if (aarch64_arch_string)
    aarch64_validate_march (aarch64_arch_string, &arch, &arch_isa);

  if (aarch64_tune_string)
    aarch64_validate_mtune (aarch64_tune_string, &tune);

#ifdef SUBTARGET_OVERRIDE_OPTIONS
  SUBTARGET_OVERRIDE_OPTIONS;
#endif

  if (cpu && arch)
    {
      /* If both -mcpu and -march are specified, warn if they are not
	 architecturally compatible and prefer the -march ISA flags.  */
      if (arch->arch != cpu->arch)
	{
	  warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
		   aarch64_cpu_string,
		   aarch64_arch_string);
	}

      selected_arch = arch->arch;
      aarch64_set_asm_isa_flags (arch_isa);
    }
  else if (cpu)
    {
      selected_arch = cpu->arch;
      aarch64_set_asm_isa_flags (cpu_isa);
    }
  else if (arch)
    {
      cpu = &all_cores[arch->ident];
      selected_arch = arch->arch;
      aarch64_set_asm_isa_flags (arch_isa);
    }
  else
    {
      /* No -mcpu or -march specified, so use the default CPU.  */
      cpu = &all_cores[TARGET_CPU_DEFAULT];
      selected_arch = cpu->arch;
      aarch64_set_asm_isa_flags (cpu->flags);
    }

  selected_tune = tune ? tune->ident : cpu->ident;

  if (aarch64_enable_bti == 2)
    {
#ifdef TARGET_ENABLE_BTI
      aarch64_enable_bti = 1;
#else
      aarch64_enable_bti = 0;
#endif
    }

  /* Return address signing is currently not supported for ILP32 targets.  For
     LP64 targets use the configured option in the absence of a command-line
     option for -mbranch-protection.  */
  if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
    {
#ifdef TARGET_ENABLE_PAC_RET
      aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
#else
      aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
#endif
    }

#ifndef HAVE_AS_MABI_OPTION
  /* The compiler may have been configured with 2.23.* binutils, which does
     not have support for ILP32.  */
  if (TARGET_ILP32)
    error ("assembler does not support %<-mabi=ilp32%>");
#endif

  /* Convert -msve-vector-bits to a VG count.  */
  aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);

  if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
    sorry ("return address signing is only supported for %<-mabi=lp64%>");

  /* The pass to insert speculation tracking runs before
     shrink-wrapping and the latter does not know how to update the
     tracking status.  So disable it in this case.  */
  if (aarch64_track_speculation)
    flag_shrink_wrap = 0;

  aarch64_override_options_internal (&global_options);

  /* Save these options as the default ones in case we push and pop them later
     while processing functions with potential target attributes.  */
  target_option_default_node = target_option_current_node
    = build_target_option_node (&global_options, &global_options_set);
}
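/* Illustrative example: "gcc -mcpu=neoverse-n1 -march=armv8.4-a" takes the
   branch above that handles both options: the two architectures differ, so a
   warning is issued, selected_arch and the ISA flags follow -march, while
   selected_tune still tracks the -mcpu core.  */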
/* Implement targetm.override_options_after_change.  */

static void
aarch64_override_options_after_change (void)
{
  aarch64_override_options_after_change_1 (&global_options);
}

/* Implement the TARGET_OFFLOAD_OPTIONS hook.  */

static char *
aarch64_offload_options (void)
{
  if (TARGET_ILP32)
    return xstrdup ("-foffload-abi=ilp32");
  else
    return xstrdup ("-foffload-abi=lp64");
}
static struct machine_function *
aarch64_init_machine_status (void)
{
  struct machine_function *machine;
  machine = ggc_cleared_alloc<machine_function> ();
  return machine;
}

void
aarch64_init_expanders (void)
{
  init_machine_status = aarch64_init_machine_status;
}
/* A checking mechanism for the implementation of the various code models.  */

static void
initialize_aarch64_code_model (struct gcc_options *opts)
{
  aarch64_cmodel = opts->x_aarch64_cmodel_var;
  switch (opts->x_aarch64_cmodel_var)
    {
    case AARCH64_CMODEL_TINY:
      if (opts->x_flag_pic)
	aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
      break;
    case AARCH64_CMODEL_SMALL:
      if (opts->x_flag_pic)
	{
#ifdef HAVE_AS_SMALL_PIC_RELOCS
	  aarch64_cmodel = (flag_pic == 2
			    ? AARCH64_CMODEL_SMALL_PIC
			    : AARCH64_CMODEL_SMALL_SPIC);
#else
	  aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
#endif
	}
      break;
    case AARCH64_CMODEL_LARGE:
      if (opts->x_flag_pic)
	sorry ("code model %qs with %<-f%s%>", "large",
	       opts->x_flag_pic > 1 ? "PIC" : "pic");
      if (opts->x_aarch64_abi == AARCH64_ABI_ILP32)
	sorry ("code model %qs not supported in ilp32 mode", "large");
      break;
    case AARCH64_CMODEL_TINY_PIC:
    case AARCH64_CMODEL_SMALL_PIC:
    case AARCH64_CMODEL_SMALL_SPIC:
      gcc_unreachable ();
    }
}
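/* Illustrative example: "-mcmodel=small -fPIC" enters the SMALL case with
   flag_pic == 2; on assemblers with small PIC relocs this selects
   AARCH64_CMODEL_SMALL_PIC, whereas plain -fpic (flag_pic == 1) selects the
   28K-GOT variant AARCH64_CMODEL_SMALL_SPIC.  */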
/* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
   using the information saved in PTR.  */

static void
aarch64_option_restore (struct gcc_options *opts,
			struct gcc_options * /* opts_set */,
			struct cl_target_option * /* ptr */)
{
  aarch64_override_options_internal (opts);
}
/* Implement TARGET_OPTION_PRINT.  */

static void
aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
{
  const struct processor *cpu
    = aarch64_get_tune_cpu (ptr->x_selected_tune);
  const struct processor *arch = aarch64_get_arch (ptr->x_selected_arch);
  std::string extension
    = aarch64_get_extension_string_for_isa_flags (ptr->x_aarch64_asm_isa_flags,
						  arch->flags);

  fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
  fprintf (file, "%*sselected arch = %s%s\n", indent, "",
	   arch->name, extension.c_str ());
}
static GTY(()) tree aarch64_previous_fndecl;

void
aarch64_reset_previous_fndecl (void)
{
  aarch64_previous_fndecl = NULL;
}

/* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
   Used by aarch64_set_current_function and aarch64_pragma_target_parse to
   make sure optab availability predicates are recomputed when necessary.  */

void
aarch64_save_restore_target_globals (tree new_tree)
{
  if (TREE_TARGET_GLOBALS (new_tree))
    restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
  else if (new_tree == target_option_default_node)
    restore_target_globals (&default_target_globals);
  else
    TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
}

/* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
   like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
   of the function, if such exists.  This function may be called multiple
   times on a single function so use aarch64_previous_fndecl to avoid
   setting up identical state.  */

static void
aarch64_set_current_function (tree fndecl)
{
  if (!fndecl || fndecl == aarch64_previous_fndecl)
    return;

  tree old_tree = (aarch64_previous_fndecl
		   ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
		   : NULL_TREE);

  tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);

  /* If current function has no attributes but the previous one did,
     use the default node.  */
  if (!new_tree && old_tree)
    new_tree = target_option_default_node;

  /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
     the default have been handled by aarch64_save_restore_target_globals from
     aarch64_pragma_target_parse.  */
  if (old_tree == new_tree)
    return;

  aarch64_previous_fndecl = fndecl;

  /* First set the target options.  */
  cl_target_option_restore (&global_options, &global_options_set,
			    TREE_TARGET_OPTION (new_tree));

  aarch64_save_restore_target_globals (new_tree);
}
/* Enum describing the various ways we can handle attributes.
   In many cases we can reuse the generic option handling machinery.  */

enum aarch64_attr_opt_type
{
  aarch64_attr_mask,	/* Attribute should set a bit in target_flags.  */
  aarch64_attr_bool,	/* Attribute sets or unsets a boolean variable.  */
  aarch64_attr_enum,	/* Attribute sets an enum variable.  */
  aarch64_attr_custom	/* Attribute requires a custom handling function.  */
};

/* All the information needed to handle a target attribute.
   NAME is the name of the attribute.
   ATTR_TYPE specifies the type of behavior of the attribute as described
   in the definition of enum aarch64_attr_opt_type.
   ALLOW_NEG is true if the attribute supports a "no-" form.
   HANDLER is the function that takes the attribute string as an argument.
   It is needed only when the ATTR_TYPE is aarch64_attr_custom.
   OPT_NUM is the enum specifying the option that the attribute modifies.
   This is needed for attributes that mirror the behavior of a command-line
   option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
   aarch64_attr_enum.  */

struct aarch64_attribute_info
{
  const char *name;
  enum aarch64_attr_opt_type attr_type;
  bool allow_neg;
  bool (*handler) (const char *);
  enum opt_code opt_num;
};
/* Handle the ARCH_STR argument to the arch= target attribute.  */

static bool
aarch64_handle_attr_arch (const char *str)
{
  const struct processor *tmp_arch = NULL;
  std::string invalid_extension;
  aarch64_feature_flags tmp_flags;
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_arch (str, &tmp_arch, &tmp_flags, &invalid_extension);

  if (parse_res == AARCH64_PARSE_OK)
    {
      gcc_assert (tmp_arch);
      selected_arch = tmp_arch->arch;
      aarch64_set_asm_isa_flags (tmp_flags);
      return true;
    }

  switch (parse_res)
    {
    case AARCH64_PARSE_MISSING_ARG:
      error ("missing name in %<target(\"arch=\")%> pragma or attribute");
      break;
    case AARCH64_PARSE_INVALID_ARG:
      error ("invalid name %qs in %<target(\"arch=\")%> pragma or attribute", str);
      aarch64_print_hint_for_arch (str);
      break;
    case AARCH64_PARSE_INVALID_FEATURE:
      error ("invalid feature modifier %s of value %qs in "
	     "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
      aarch64_print_hint_for_extensions (invalid_extension);
      break;
    default:
      gcc_unreachable ();
    }

  return false;
}
/* Handle the argument CPU_STR to the cpu= target attribute.  */

static bool
aarch64_handle_attr_cpu (const char *str)
{
  const struct processor *tmp_cpu = NULL;
  std::string invalid_extension;
  aarch64_feature_flags tmp_flags;
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_cpu (str, &tmp_cpu, &tmp_flags, &invalid_extension);

  if (parse_res == AARCH64_PARSE_OK)
    {
      gcc_assert (tmp_cpu);
      selected_tune = tmp_cpu->ident;
      selected_arch = tmp_cpu->arch;
      aarch64_set_asm_isa_flags (tmp_flags);
      return true;
    }

  switch (parse_res)
    {
    case AARCH64_PARSE_MISSING_ARG:
      error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
      break;
    case AARCH64_PARSE_INVALID_ARG:
      error ("invalid name %qs in %<target(\"cpu=\")%> pragma or attribute", str);
      aarch64_print_hint_for_core (str);
      break;
    case AARCH64_PARSE_INVALID_FEATURE:
      error ("invalid feature modifier %qs of value %qs in "
	     "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
      aarch64_print_hint_for_extensions (invalid_extension);
      break;
    default:
      gcc_unreachable ();
    }

  return false;
}
/* Handle the argument STR to the branch-protection= attribute.  */

static bool
aarch64_handle_attr_branch_protection (const char* str)
{
  char *err_str = (char *) xmalloc (strlen (str) + 1);
  enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
								       &err_str);
  bool success = false;
  switch (res)
    {
    case AARCH64_PARSE_MISSING_ARG:
      error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
	     " attribute");
      break;
    case AARCH64_PARSE_INVALID_ARG:
      error ("invalid protection type %qs in %<target(\"branch-protection"
	     "=\")%> pragma or attribute", err_str);
      break;
    case AARCH64_PARSE_OK:
      success = true;
      /* Fall through.  */
    case AARCH64_PARSE_INVALID_FEATURE:
      break;
    default:
      gcc_unreachable ();
    }
  free (err_str);
  return success;
}
/* Handle the argument STR to the tune= target attribute.  */

static bool
aarch64_handle_attr_tune (const char *str)
{
  const struct processor *tmp_tune = NULL;
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_tune (str, &tmp_tune);

  if (parse_res == AARCH64_PARSE_OK)
    {
      gcc_assert (tmp_tune);
      selected_tune = tmp_tune->ident;
      return true;
    }

  switch (parse_res)
    {
    case AARCH64_PARSE_INVALID_ARG:
      error ("invalid name %qs in %<target(\"tune=\")%> pragma or attribute", str);
      aarch64_print_hint_for_core (str);
      break;
    default:
      gcc_unreachable ();
    }

  return false;
}
/* Parse an architecture extensions target attribute string specified in STR.
   For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
   if successful.  Update aarch64_asm_isa_flags to reflect the ISA features
   modified.  */

static bool
aarch64_handle_attr_isa_flags (char *str)
{
  enum aarch64_parse_opt_result parse_res;
  auto isa_flags = aarch64_asm_isa_flags;

  /* We allow "+nothing" in the beginning to clear out all architectural
     features if the user wants to handpick specific features.  */
  if (strncmp ("+nothing", str, 8) == 0)
    {
      isa_flags = 0;
      str += 8;
    }

  std::string invalid_extension;
  parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);

  if (parse_res == AARCH64_PARSE_OK)
    {
      aarch64_set_asm_isa_flags (isa_flags);
      return true;
    }

  switch (parse_res)
    {
    case AARCH64_PARSE_MISSING_ARG:
      error ("missing value in %<target()%> pragma or attribute");
      break;

    case AARCH64_PARSE_INVALID_FEATURE:
      error ("invalid feature modifier %qs of value %qs in "
	     "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
      break;

    default:
      gcc_unreachable ();
    }

  return false;
}
/* The target attributes that we support.  On top of these we also support just
   ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
   handled explicitly in aarch64_process_one_target_attr.  */

static const struct aarch64_attribute_info aarch64_attributes[] =
{
  { "general-regs-only", aarch64_attr_mask, false, NULL,
     OPT_mgeneral_regs_only },
  { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
     OPT_mfix_cortex_a53_835769 },
  { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
     OPT_mfix_cortex_a53_843419 },
  { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
  { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
  { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
     OPT_momit_leaf_frame_pointer },
  { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
  { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
     OPT_march_ },
  { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
  { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
     OPT_mtune_ },
  { "branch-protection", aarch64_attr_custom, false,
     aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
  { "sign-return-address", aarch64_attr_enum, false, NULL,
     OPT_msign_return_address_ },
  { "outline-atomics", aarch64_attr_bool, true, NULL,
     OPT_moutline_atomics },
  { NULL, aarch64_attr_custom, false, NULL, OPT____ }
};
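/* Usage example (illustrative): the table above makes declarations such as
     __attribute__ ((target ("arch=armv8.2-a+sve,no-omit-leaf-frame-pointer")))
   valid: "arch" dispatches to its custom handler, while the "no-" prefix is
   accepted for "omit-leaf-frame-pointer" because its entry sets ALLOW_NEG.  */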
/* Parse ARG_STR which contains the definition of one target attribute.
   Show appropriate errors if any or return true if the attribute is valid.  */

static bool
aarch64_process_one_target_attr (char *arg_str)
{
  bool invert = false;

  size_t len = strlen (arg_str);

  if (len == 0)
    {
      error ("malformed %<target()%> pragma or attribute");
      return false;
    }

  char *str_to_check = (char *) alloca (len + 1);
  strcpy (str_to_check, arg_str);

  /* We have something like __attribute__ ((target ("+fp+nosimd"))).
     It is easier to detect and handle it explicitly here rather than going
     through the machinery for the rest of the target attributes in this
     function.  */
  if (*str_to_check == '+')
    return aarch64_handle_attr_isa_flags (str_to_check);

  if (len > 3 && startswith (str_to_check, "no-"))
    {
      invert = true;
      str_to_check += 3;
    }
  char *arg = strchr (str_to_check, '=');

  /* If we found opt=foo then terminate STR_TO_CHECK at the '='
     and point ARG to "foo".  */
  if (arg)
    {
      *arg = '\0';
      arg++;
    }
  const struct aarch64_attribute_info *p_attr;
  bool found = false;
  for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
    {
      /* If the names don't match up, or the user has given an argument
	 to an attribute that doesn't accept one, or didn't give an argument
	 to an attribute that expects one, fail to match.  */
      if (strcmp (str_to_check, p_attr->name) != 0)
	continue;

      found = true;
      bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
			     || p_attr->attr_type == aarch64_attr_enum;

      if (attr_need_arg_p ^ (arg != NULL))
	{
	  error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
	  return false;
	}

      /* If the name matches but the attribute does not allow "no-" versions
	 then we can't match.  */
      if (invert && !p_attr->allow_neg)
	{
	  error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
	  return false;
	}

      switch (p_attr->attr_type)
	{
	/* Has a custom handler registered.
	   For example, cpu=, arch=, tune=.  */
	case aarch64_attr_custom:
	  gcc_assert (p_attr->handler);
	  if (!p_attr->handler (arg))
	    return false;
	  break;

	/* Either set or unset a boolean option.  */
	case aarch64_attr_bool:
	  {
	    struct cl_decoded_option decoded;

	    generate_option (p_attr->opt_num, NULL, !invert,
			     CL_TARGET, &decoded);
	    aarch64_handle_option (&global_options, &global_options_set,
				   &decoded, input_location);
	    break;
	  }
	/* Set or unset a bit in the target_flags.  aarch64_handle_option
	   should know what mask to apply given the option number.  */
	case aarch64_attr_mask:
	  {
	    struct cl_decoded_option decoded;
	    /* We only need to specify the option number.
	       aarch64_handle_option will know which mask to apply.  */
	    decoded.opt_index = p_attr->opt_num;
	    decoded.value = !invert;
	    aarch64_handle_option (&global_options, &global_options_set,
				   &decoded, input_location);
	    break;
	  }
	/* Use the option setting machinery to set an option to an enum.  */
	case aarch64_attr_enum:
	  {
	    gcc_assert (arg);
	    bool valid;
	    int value;
	    valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
					   &value, CL_TARGET);
	    if (valid)
	      set_option (&global_options, NULL, p_attr->opt_num, value,
			  NULL, DK_UNSPECIFIED, input_location,
			  global_dc);
	    else
	      error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
	    break;
	  }
	default:
	  gcc_unreachable ();
	}
    }

  /* If we reached here we either have found an attribute and validated
     it or didn't match any.  If we matched an attribute but its arguments
     were malformed we will have returned false already.  */
  return found;
}
/* Count how many times the character C appears in
   NULL-terminated string STR.  */

static unsigned int
num_occurences_in_str (char c, char *str)
{
  unsigned int res = 0;
  while (*str != '\0')
    {
      if (*str == c)
	res++;

      str++;
    }

  return res;
}
/* Parse the tree in ARGS that contains the target attribute information
   and update the global target options space.  */

bool
aarch64_process_target_attr (tree args)
{
  if (TREE_CODE (args) == TREE_LIST)
    {
      do
	{
	  tree head = TREE_VALUE (args);
	  if (head)
	    {
	      if (!aarch64_process_target_attr (head))
		return false;
	    }
	  args = TREE_CHAIN (args);
	} while (args);

      return true;
    }

  if (TREE_CODE (args) != STRING_CST)
    {
      error ("attribute %<target%> argument not a string");
      return false;
    }

  size_t len = strlen (TREE_STRING_POINTER (args));
  char *str_to_check = (char *) alloca (len + 1);
  strcpy (str_to_check, TREE_STRING_POINTER (args));

  if (len == 0)
    {
      error ("malformed %<target()%> pragma or attribute");
      return false;
    }

  /* Used to catch empty spaces between commas i.e.
     attribute ((target ("attr1,,attr2"))).  */
  unsigned int num_commas = num_occurences_in_str (',', str_to_check);

  /* Handle multiple target attributes separated by ','.  */
  char *token = strtok_r (str_to_check, ",", &str_to_check);

  unsigned int num_attrs = 0;
  while (token)
    {
      num_attrs++;
      if (!aarch64_process_one_target_attr (token))
	{
	  /* Check if token is possibly an arch extension without
	     leading '+'.  */
	  aarch64_feature_flags isa_temp = 0;
	  auto with_plus = std::string ("+") + token;
	  enum aarch64_parse_opt_result ext_res
	    = aarch64_parse_extension (with_plus.c_str (), &isa_temp, nullptr);

	  if (ext_res == AARCH64_PARSE_OK)
	    error ("arch extension %<%s%> should be prefixed by %<+%>",
		   token);
	  else
	    error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
	  return false;
	}

      token = strtok_r (NULL, ",", &str_to_check);
    }

  if (num_attrs != num_commas + 1)
    {
      error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
      return false;
    }

  return true;
}
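/* Illustrative example: __attribute__ ((target ("no-strict-align,+simd")))
   is split at the comma above; "no-strict-align" goes through the attribute
   table with INVERT set, while "+simd" is recognised by its leading '+' and
   handled as a bare ISA-extension string.  */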
/* Implement TARGET_OPTION_VALID_ATTRIBUTE_P.  This is used to
   process attribute ((target ("..."))).  */

static bool
aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
{
  struct cl_target_option cur_target;
  bool ret;
  tree old_optimize;
  tree new_target, new_optimize;
  tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);

  /* If what we're processing is the current pragma string then the
     target option node is already stored in target_option_current_node
     by aarch64_pragma_target_parse in aarch64-c.cc.  Use that to avoid
     having to re-parse the string.  This is especially useful to keep
     arm_neon.h compile times down since that header contains a lot
     of intrinsics enclosed in pragmas.  */
  if (!existing_target && args == current_target_pragma)
    {
      DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
      return true;
    }
  tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);

  old_optimize
    = build_optimization_node (&global_options, &global_options_set);
  func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);

  /* If the function changed the optimization levels as well as setting
     target options, start with the optimizations specified.  */
  if (func_optimize && func_optimize != old_optimize)
    cl_optimization_restore (&global_options, &global_options_set,
			     TREE_OPTIMIZATION (func_optimize));

  /* Save the current target options to restore at the end.  */
  cl_target_option_save (&cur_target, &global_options, &global_options_set);

  /* If fndecl already has some target attributes applied to it, unpack
     them so that we add this attribute on top of them, rather than
     overwriting them.  */
  if (existing_target)
    {
      struct cl_target_option *existing_options
	= TREE_TARGET_OPTION (existing_target);

      if (existing_options)
	cl_target_option_restore (&global_options, &global_options_set,
				  existing_options);
    }
  else
    cl_target_option_restore (&global_options, &global_options_set,
			      TREE_TARGET_OPTION (target_option_current_node));

  ret = aarch64_process_target_attr (args);

  /* Set up any additional state.  */
  if (ret)
    {
      aarch64_override_options_internal (&global_options);
      new_target = build_target_option_node (&global_options,
					     &global_options_set);
    }
  else
    new_target = NULL;

  new_optimize = build_optimization_node (&global_options,
					  &global_options_set);

  if (ret)
    {
      DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;

      if (old_optimize != new_optimize)
	DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
    }

  cl_target_option_restore (&global_options, &global_options_set, &cur_target);

  if (old_optimize != new_optimize)
    cl_optimization_restore (&global_options, &global_options_set,
			     TREE_OPTIMIZATION (old_optimize));
  return ret;
}
/* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
   tri-bool options (yes, no, don't care) and the default value is
   DEF, determine whether to reject inlining.  */

static bool
aarch64_tribools_ok_for_inlining_p (int caller, int callee,
				    int dont_care, int def)
{
  /* If the callee doesn't care, always allow inlining.  */
  if (callee == dont_care)
    return true;

  /* If the caller doesn't care, always allow inlining.  */
  if (caller == dont_care)
    return true;

  /* Otherwise, allow inlining if either the callee and caller values
     agree, or if the callee is using the default value.  */
  return (callee == caller || callee == def);
}
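/* Illustrative truth table (DONT_CARE == 2, DEF == 0):
     caller 2, callee 1 -> inline allowed (caller doesn't care)
     caller 1, callee 2 -> inline allowed (callee doesn't care)
     caller 1, callee 0 -> inline allowed (callee uses the default)
     caller 0, callee 1 -> inlining rejected (explicit mismatch).  */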
/* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
   to inline CALLEE into CALLER based on target-specific info.
   Make sure that the caller and callee have compatible architectural
   features.  Then go through the other possible target attributes
   and see if they can block inlining.  Try not to reject always_inline
   callees unless they are incompatible architecturally.  */

static bool
aarch64_can_inline_p (tree caller, tree callee)
{
  tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
  tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);

  struct cl_target_option *caller_opts
    = TREE_TARGET_OPTION (caller_tree ? caller_tree
			  : target_option_default_node);

  struct cl_target_option *callee_opts
    = TREE_TARGET_OPTION (callee_tree ? callee_tree
			  : target_option_default_node);

  /* Callee's ISA flags should be a subset of the caller's.  */
  if ((caller_opts->x_aarch64_asm_isa_flags
       & callee_opts->x_aarch64_asm_isa_flags)
      != callee_opts->x_aarch64_asm_isa_flags)
    return false;
  if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
      != callee_opts->x_aarch64_isa_flags)
    return false;

  /* Allow non-strict aligned functions inlining into strict
     aligned ones.  */
  if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
       != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
      && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
	   && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
    return false;

  bool always_inline = lookup_attribute ("always_inline",
					 DECL_ATTRIBUTES (callee));

  /* If the architectural features match up and the callee is always_inline
     then the other attributes don't matter.  */
  if (always_inline)
    return true;

  if (caller_opts->x_aarch64_cmodel_var
      != callee_opts->x_aarch64_cmodel_var)
    return false;

  if (caller_opts->x_aarch64_tls_dialect
      != callee_opts->x_aarch64_tls_dialect)
    return false;

  /* Honour explicit requests to workaround errata.  */
  if (!aarch64_tribools_ok_for_inlining_p (
	  caller_opts->x_aarch64_fix_a53_err835769,
	  callee_opts->x_aarch64_fix_a53_err835769,
	  2, TARGET_FIX_ERR_A53_835769_DEFAULT))
    return false;

  if (!aarch64_tribools_ok_for_inlining_p (
	  caller_opts->x_aarch64_fix_a53_err843419,
	  callee_opts->x_aarch64_fix_a53_err843419,
	  2, TARGET_FIX_ERR_A53_843419))
    return false;

  /* If the user explicitly specified -momit-leaf-frame-pointer for the
     caller and callee and they don't match up, reject inlining.  */
  if (!aarch64_tribools_ok_for_inlining_p (
	  caller_opts->x_flag_omit_leaf_frame_pointer,
	  callee_opts->x_flag_omit_leaf_frame_pointer,
	  2, 1))
    return false;

  /* If the callee has specific tuning overrides, respect them.  */
  if (callee_opts->x_aarch64_override_tune_string != NULL
      && caller_opts->x_aarch64_override_tune_string == NULL)
    return false;

  /* If the user specified tuning override strings for the
     caller and callee and they don't match up, reject inlining.
     We just do a string compare here, we don't analyze the meaning
     of the string, as it would be too costly for little gain.  */
  if (callee_opts->x_aarch64_override_tune_string
      && caller_opts->x_aarch64_override_tune_string
      && (strcmp (callee_opts->x_aarch64_override_tune_string,
		  caller_opts->x_aarch64_override_tune_string) != 0))
    return false;

  return true;
}
/* Return the ID of the TLSDESC ABI, initializing the descriptor if it
   hasn't been initialized yet.  */

unsigned int
aarch64_tlsdesc_abi_id ()
{
  predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
  if (!tlsdesc_abi.initialized_p ())
    {
      HARD_REG_SET full_reg_clobbers;
      CLEAR_HARD_REG_SET (full_reg_clobbers);
      SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
      SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
      for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
	SET_HARD_REG_BIT (full_reg_clobbers, regno);
      tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
    }
  return tlsdesc_abi.id ();
}
/* Return true if SYMBOL_REF X binds locally.  */

static bool
aarch64_symbol_binds_local_p (const_rtx x)
{
  return (SYMBOL_REF_DECL (x)
	  ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
	  : SYMBOL_REF_LOCAL_P (x));
}

/* Return true if SYMBOL_REF X is thread local.  */
static bool
aarch64_tls_symbol_p (rtx x)
{
  if (! TARGET_HAVE_TLS)
    return false;

  x = strip_salt (x);
  if (!SYMBOL_REF_P (x))
    return false;

  return SYMBOL_REF_TLS_MODEL (x) != 0;
}
/* Classify a TLS symbol into one of the TLS kinds.  */
enum aarch64_symbol_type
aarch64_classify_tls_symbol (rtx x)
{
  enum tls_model tls_kind = tls_symbolic_operand_type (x);

  switch (tls_kind)
    {
    case TLS_MODEL_GLOBAL_DYNAMIC:
    case TLS_MODEL_LOCAL_DYNAMIC:
      return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;

    case TLS_MODEL_INITIAL_EXEC:
      switch (aarch64_cmodel)
	{
	case AARCH64_CMODEL_TINY:
	case AARCH64_CMODEL_TINY_PIC:
	  return SYMBOL_TINY_TLSIE;
	default:
	  return SYMBOL_SMALL_TLSIE;
	}

    case TLS_MODEL_LOCAL_EXEC:
      if (aarch64_tls_size == 12)
	return SYMBOL_TLSLE12;
      else if (aarch64_tls_size == 24)
	return SYMBOL_TLSLE24;
      else if (aarch64_tls_size == 32)
	return SYMBOL_TLSLE32;
      else if (aarch64_tls_size == 48)
	return SYMBOL_TLSLE48;
      else
	gcc_unreachable ();

    case TLS_MODEL_EMULATED:
    case TLS_MODEL_NONE:
      return SYMBOL_FORCE_TO_MEM;

    default:
      gcc_unreachable ();
    }
}
/* Return the correct method for accessing X + OFFSET, where X is either
   a SYMBOL_REF or LABEL_REF.  */

enum aarch64_symbol_type
aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
{
  x = strip_salt (x);

  if (LABEL_REF_P (x))
    {
      switch (aarch64_cmodel)
	{
	case AARCH64_CMODEL_LARGE:
	  return SYMBOL_FORCE_TO_MEM;

	case AARCH64_CMODEL_TINY_PIC:
	case AARCH64_CMODEL_TINY:
	  return SYMBOL_TINY_ABSOLUTE;

	case AARCH64_CMODEL_SMALL_SPIC:
	case AARCH64_CMODEL_SMALL_PIC:
	case AARCH64_CMODEL_SMALL:
	  return SYMBOL_SMALL_ABSOLUTE;

	default:
	  gcc_unreachable ();
	}
    }

  if (SYMBOL_REF_P (x))
    {
      if (aarch64_tls_symbol_p (x))
	return aarch64_classify_tls_symbol (x);

      switch (aarch64_cmodel)
	{
	case AARCH64_CMODEL_TINY_PIC:
	case AARCH64_CMODEL_TINY:
	  /* With -fPIC non-local symbols use the GOT.  For orthogonality
	     always use the GOT for extern weak symbols.  */
	  if ((flag_pic || SYMBOL_REF_WEAK (x))
	      && !aarch64_symbol_binds_local_p (x))
	    return SYMBOL_TINY_GOT;

	  /* When we retrieve symbol + offset address, we have to make sure
	     the offset does not cause overflow of the final address.  But
	     we have no way of knowing the address of symbol at compile time
	     so we can't accurately say if the distance between the PC and
	     symbol + offset is outside the addressable range of +/-1MB in the
	     TINY code model.  So we limit the maximum offset to +/-64KB and
	     assume the offset to the symbol is not larger than +/-(1MB - 64KB).
	     If offset_within_block_p is true we allow larger offsets.  */
	  if (!(IN_RANGE (offset, -0x10000, 0x10000)
		|| offset_within_block_p (x, offset)))
	    return SYMBOL_FORCE_TO_MEM;

	  return SYMBOL_TINY_ABSOLUTE;

	case AARCH64_CMODEL_SMALL_SPIC:
	case AARCH64_CMODEL_SMALL_PIC:
	case AARCH64_CMODEL_SMALL:
	  if ((flag_pic || SYMBOL_REF_WEAK (x))
	      && !aarch64_symbol_binds_local_p (x))
	    return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
		   ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G;

	  /* Same reasoning as the tiny code model, but the offset cap here is
	     1MB, allowing +/-3.9GB for the offset to the symbol.  */
	  if (!(IN_RANGE (offset, -0x100000, 0x100000)
		|| offset_within_block_p (x, offset)))
	    return SYMBOL_FORCE_TO_MEM;

	  return SYMBOL_SMALL_ABSOLUTE;

	case AARCH64_CMODEL_LARGE:
	  /* This is alright even in PIC code as the constant
	     pool reference is always PC relative and within
	     the same translation unit.  */
	  if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
	    return SYMBOL_SMALL_ABSOLUTE;
	  else
	    return SYMBOL_FORCE_TO_MEM;

	default:
	  gcc_unreachable ();
	}
    }

  /* By default push everything into the constant pool.  */
  return SYMBOL_FORCE_TO_MEM;
}
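/* Worked example (illustrative): under -mcmodel=tiny, a reference to
   "sym + 0x20000" (a 128 KB offset) exceeds the +/-64 KB cap checked above,
   so unless the offset stays within the symbol's own block the address is
   classified as SYMBOL_FORCE_TO_MEM and loaded from the constant pool.  */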
bool
aarch64_constant_address_p (rtx x)
{
  return (CONSTANT_P (x) && memory_address_p (DImode, x));
}

bool
aarch64_legitimate_pic_operand_p (rtx x)
{
  poly_int64 offset;
  x = strip_offset_and_salt (x, &offset);
  if (SYMBOL_REF_P (x))
    return false;

  return true;
}
/* Implement TARGET_LEGITIMATE_CONSTANT_P hook.  Return true for constants
   that should be rematerialized rather than spilled.  */

static bool
aarch64_legitimate_constant_p (machine_mode mode, rtx x)
{
  /* Support CSE and rematerialization of common constants.  */
  if (CONST_INT_P (x)
      || CONST_DOUBLE_P (x))
    return true;

  /* Only accept variable-length vector constants if they can be
     handled directly.

     ??? It would be possible (but complex) to handle rematerialization
     of other constants via secondary reloads.  */
  if (!GET_MODE_SIZE (mode).is_constant ())
    return aarch64_simd_valid_immediate (x, NULL);

  /* Otherwise, accept any CONST_VECTOR that, if all else fails, can at
     least be forced to memory and loaded from there.  */
  if (CONST_VECTOR_P (x))
    return !targetm.cannot_force_const_mem (mode, x);

  /* Do not allow vector struct mode constants for Advanced SIMD.
     We could support 0 and -1 easily, but they need support in
     aarch64-simd.md.  */
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
    return false;

  if (GET_CODE (x) == HIGH)
    x = XEXP (x, 0);

  /* Accept polynomial constants that can be calculated by using the
     destination of a move as the sole temporary.  Constants that
     require a second temporary cannot be rematerialized (they can't be
     forced to memory and also aren't legitimate constants).  */
  poly_int64 offset;
  if (poly_int_rtx_p (x, &offset))
    return aarch64_offset_temporaries (false, offset) <= 1;

  /* If an offset is being added to something else, we need to allow the
     base to be moved into the destination register, meaning that there
     are no free temporaries for the offset.  */
  x = strip_offset_and_salt (x, &offset);
  if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
    return false;

  /* Do not allow const (plus (anchor_symbol, const_int)).  */
  if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
    return false;

  /* Treat symbols as constants.  Avoid TLS symbols as they are complex,
     so spilling them is better than rematerialization.  */
  if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
    return true;

  /* Label references are always constant.  */
  if (LABEL_REF_P (x))
    return true;

  return false;
}
static rtx
aarch64_load_tp (rtx target)
{
  if (!target
      || GET_MODE (target) != Pmode
      || !register_operand (target, Pmode))
    target = gen_reg_rtx (Pmode);

  /* Can return in any reg.  */
  emit_insn (gen_aarch64_load_tp_hard (target));
  return target;
}
/* On AAPCS systems, this is the "struct __va_list".  */
static GTY(()) tree va_list_type;

/* Implement TARGET_BUILD_BUILTIN_VA_LIST.
   Return the type to use as __builtin_va_list.

   AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:

   struct __va_list
   {
     void *__stack;
     void *__gr_top;
     void *__vr_top;
     int   __gr_offs;
     int   __vr_offs;
   };  */

static tree
aarch64_build_builtin_va_list (void)
{
  tree va_list_name;
  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;

  /* Create the type.  */
  va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
  /* Give it the required name.  */
  va_list_name = build_decl (BUILTINS_LOCATION,
			     TYPE_DECL,
			     get_identifier ("__va_list"),
			     va_list_type);
  DECL_ARTIFICIAL (va_list_name) = 1;
  TYPE_NAME (va_list_type) = va_list_name;
  TYPE_STUB_DECL (va_list_type) = va_list_name;

  /* Create the fields.  */
  f_stack = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__stack"),
			ptr_type_node);
  f_grtop = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__gr_top"),
			ptr_type_node);
  f_vrtop = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__vr_top"),
			ptr_type_node);
  f_groff = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__gr_offs"),
			integer_type_node);
  f_vroff = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__vr_offs"),
			integer_type_node);

  /* Tell tree-stdarg pass about our internal offset fields.
     NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
     purposes, to identify whether the code is updating va_list internal
     offset fields through an irregular way.  */
  va_list_gpr_counter_field = f_groff;
  va_list_fpr_counter_field = f_vroff;

  DECL_ARTIFICIAL (f_stack) = 1;
  DECL_ARTIFICIAL (f_grtop) = 1;
  DECL_ARTIFICIAL (f_vrtop) = 1;
  DECL_ARTIFICIAL (f_groff) = 1;
  DECL_ARTIFICIAL (f_vroff) = 1;

  DECL_FIELD_CONTEXT (f_stack) = va_list_type;
  DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
  DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
  DECL_FIELD_CONTEXT (f_groff) = va_list_type;
  DECL_FIELD_CONTEXT (f_vroff) = va_list_type;

  TYPE_FIELDS (va_list_type) = f_stack;
  DECL_CHAIN (f_stack) = f_grtop;
  DECL_CHAIN (f_grtop) = f_vrtop;
  DECL_CHAIN (f_vrtop) = f_groff;
  DECL_CHAIN (f_groff) = f_vroff;

  /* Compute its layout.  */
  layout_type (va_list_type);

  return va_list_type;
}
/* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
static void
aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
{
  const CUMULATIVE_ARGS *cum;
  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
  tree stack, grtop, vrtop, groff, vroff;
  tree t;
  int gr_save_area_size = cfun->va_list_gpr_size;
  int vr_save_area_size = cfun->va_list_fpr_size;
  int vr_offset;

  cum = &crtl->args.info;
  if (cfun->va_list_gpr_size)
    gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
                             cfun->va_list_gpr_size);
  if (cfun->va_list_fpr_size)
    vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
                             * UNITS_PER_VREG, cfun->va_list_fpr_size);

  if (!TARGET_FLOAT)
    {
      gcc_assert (cum->aapcs_nvrn == 0);
      vr_save_area_size = 0;
    }
19716 f_stack
= TYPE_FIELDS (va_list_type_node
);
19717 f_grtop
= DECL_CHAIN (f_stack
);
19718 f_vrtop
= DECL_CHAIN (f_grtop
);
19719 f_groff
= DECL_CHAIN (f_vrtop
);
19720 f_vroff
= DECL_CHAIN (f_groff
);
19722 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), valist
, f_stack
,
19724 grtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
), valist
, f_grtop
,
19726 vrtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
), valist
, f_vrtop
,
19728 groff
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
), valist
, f_groff
,
19730 vroff
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
), valist
, f_vroff
,
19733 /* Emit code to initialize STACK, which points to the next varargs stack
19734 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
19735 by named arguments. STACK is 8-byte aligned. */
19736 t
= make_tree (TREE_TYPE (stack
), virtual_incoming_args_rtx
);
19737 if (cum
->aapcs_stack_size
> 0)
19738 t
= fold_build_pointer_plus_hwi (t
, cum
->aapcs_stack_size
* UNITS_PER_WORD
);
19739 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), stack
, t
);
19740 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
19742 /* Emit code to initialize GRTOP, the top of the GR save area.
19743 virtual_incoming_args_rtx should have been 16 byte aligned. */
19744 t
= make_tree (TREE_TYPE (grtop
), virtual_incoming_args_rtx
);
19745 t
= build2 (MODIFY_EXPR
, TREE_TYPE (grtop
), grtop
, t
);
19746 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
19748 /* Emit code to initialize VRTOP, the top of the VR save area.
19749 This address is gr_save_area_bytes below GRTOP, rounded
19750 down to the next 16-byte boundary. */
19751 t
= make_tree (TREE_TYPE (vrtop
), virtual_incoming_args_rtx
);
19752 vr_offset
= ROUND_UP (gr_save_area_size
,
19753 STACK_BOUNDARY
/ BITS_PER_UNIT
);
19756 t
= fold_build_pointer_plus_hwi (t
, -vr_offset
);
19757 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vrtop
), vrtop
, t
);
19758 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
19760 /* Emit code to initialize GROFF, the offset from GRTOP of the
19761 next GPR argument. */
19762 t
= build2 (MODIFY_EXPR
, TREE_TYPE (groff
), groff
,
19763 build_int_cst (TREE_TYPE (groff
), -gr_save_area_size
));
19764 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
19766 /* Likewise emit code to initialize VROFF, the offset from FTOP
19767 of the next VR argument. */
19768 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vroff
), vroff
,
19769 build_int_cst (TREE_TYPE (vroff
), -vr_save_area_size
));
19770 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
19773 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
19776 aarch64_gimplify_va_arg_expr (tree valist
, tree type
, gimple_seq
*pre_p
,
19777 gimple_seq
*post_p ATTRIBUTE_UNUSED
)
19781 bool is_ha
; /* is HFA or HVA. */
19782 bool dw_align
; /* double-word align. */
19783 machine_mode ag_mode
= VOIDmode
;
19787 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
19788 tree stack
, f_top
, f_off
, off
, arg
, roundup
, on_stack
;
19789 HOST_WIDE_INT size
, rsize
, adjust
, align
;
19790 tree t
, u
, cond1
, cond2
;
19792 indirect_p
= pass_va_arg_by_reference (type
);
19794 type
= build_pointer_type (type
);
19796 mode
= TYPE_MODE (type
);
19798 f_stack
= TYPE_FIELDS (va_list_type_node
);
19799 f_grtop
= DECL_CHAIN (f_stack
);
19800 f_vrtop
= DECL_CHAIN (f_grtop
);
19801 f_groff
= DECL_CHAIN (f_vrtop
);
19802 f_vroff
= DECL_CHAIN (f_groff
);
19804 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), unshare_expr (valist
),
19805 f_stack
, NULL_TREE
);
19806 size
= int_size_in_bytes (type
);
19808 unsigned int abi_break
;
19809 unsigned int abi_break_packed
;
19811 = aarch64_function_arg_alignment (mode
, type
, &abi_break
, &abi_break_packed
)
19816 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
, &ag_mode
, &nregs
,
19819 /* No frontends can create types with variable-sized modes, so we
19820 shouldn't be asked to pass or return them. */
19821 unsigned int ag_size
= GET_MODE_SIZE (ag_mode
).to_constant ();
19823 /* TYPE passed in fp/simd registers. */
19825 aarch64_err_no_fpadvsimd (mode
);
19827 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
),
19828 unshare_expr (valist
), f_vrtop
, NULL_TREE
);
19829 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
),
19830 unshare_expr (valist
), f_vroff
, NULL_TREE
);
19832 rsize
= nregs
* UNITS_PER_VREG
;
19836 if (BYTES_BIG_ENDIAN
&& ag_size
< UNITS_PER_VREG
)
19837 adjust
= UNITS_PER_VREG
- ag_size
;
19839 else if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
19840 && size
< UNITS_PER_VREG
)
19842 adjust
= UNITS_PER_VREG
- size
;
19847 /* TYPE passed in general registers. */
19848 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
),
19849 unshare_expr (valist
), f_grtop
, NULL_TREE
);
19850 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
),
19851 unshare_expr (valist
), f_groff
, NULL_TREE
);
19852 rsize
= ROUND_UP (size
, UNITS_PER_WORD
);
19853 nregs
= rsize
/ UNITS_PER_WORD
;
19855 if (align
<= 8 && abi_break_packed
&& warn_psabi
)
19856 inform (input_location
, "parameter passing for argument of type "
19857 "%qT changed in GCC 13.1", type
);
19861 if (abi_break
&& warn_psabi
)
19862 inform (input_location
, "parameter passing for argument of type "
19863 "%qT changed in GCC 9.1", type
);
19867 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
19868 && size
< UNITS_PER_WORD
)
19870 adjust
= UNITS_PER_WORD
- size
;
19874 /* Get a local temporary for the field value. */
19875 off
= get_initialized_tmp_var (f_off
, pre_p
, NULL
);
19877 /* Emit code to branch if off >= 0. */
19878 t
= build2 (GE_EXPR
, boolean_type_node
, off
,
19879 build_int_cst (TREE_TYPE (off
), 0));
19880 cond1
= build3 (COND_EXPR
, ptr_type_node
, t
, NULL_TREE
, NULL_TREE
);
19884 /* Emit: offs = (offs + 15) & -16. */
19885 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
19886 build_int_cst (TREE_TYPE (off
), 15));
19887 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (off
), t
,
19888 build_int_cst (TREE_TYPE (off
), -16));
19889 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (off
), off
, t
);
19894 /* Update ap.__[g|v]r_offs */
19895 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
19896 build_int_cst (TREE_TYPE (off
), rsize
));
19897 t
= build2 (MODIFY_EXPR
, TREE_TYPE (f_off
), unshare_expr (f_off
), t
);
19901 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
19903 /* [cond2] if (ap.__[g|v]r_offs > 0) */
19904 u
= build2 (GT_EXPR
, boolean_type_node
, unshare_expr (f_off
),
19905 build_int_cst (TREE_TYPE (f_off
), 0));
19906 cond2
= build3 (COND_EXPR
, ptr_type_node
, u
, NULL_TREE
, NULL_TREE
);
19908 /* String up: make sure the assignment happens before the use. */
19909 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (cond2
), t
, cond2
);
19910 COND_EXPR_ELSE (cond1
) = t
;
19912 /* Prepare the trees handling the argument that is passed on the stack;
19913 the top level node will store in ON_STACK. */
19914 arg
= get_initialized_tmp_var (stack
, pre_p
, NULL
);
19917 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
19918 t
= fold_build_pointer_plus_hwi (arg
, 15);
19919 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
19920 build_int_cst (TREE_TYPE (t
), -16));
19921 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (arg
), arg
, t
);
19925 /* Advance ap.__stack */
19926 t
= fold_build_pointer_plus_hwi (arg
, size
+ 7);
19927 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
19928 build_int_cst (TREE_TYPE (t
), -8));
19929 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), unshare_expr (stack
), t
);
19930 /* String up roundup and advance. */
19932 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
19933 /* String up with arg */
19934 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), t
, arg
);
19935 /* Big-endianness related address adjustment. */
19936 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
19937 && size
< UNITS_PER_WORD
)
19939 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (arg
), arg
,
19940 size_int (UNITS_PER_WORD
- size
));
19941 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), on_stack
, t
);
19944 COND_EXPR_THEN (cond1
) = unshare_expr (on_stack
);
19945 COND_EXPR_THEN (cond2
) = unshare_expr (on_stack
);
19947 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
19950 t
= build2 (PREINCREMENT_EXPR
, TREE_TYPE (off
), off
,
19951 build_int_cst (TREE_TYPE (off
), adjust
));
19953 t
= fold_convert (sizetype
, t
);
19954 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (f_top
), f_top
, t
);
19958 /* type ha; // treat as "struct {ftype field[n];}"
19959 ... [computing offs]
19960 for (i = 0; i <nregs; ++i, offs += 16)
19961 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
19964 tree tmp_ha
, field_t
, field_ptr_t
;
19966 /* Declare a local variable. */
19967 tmp_ha
= create_tmp_var_raw (type
, "ha");
19968 gimple_add_tmp_var (tmp_ha
);
19970 /* Establish the base type. */
19974 field_t
= float_type_node
;
19975 field_ptr_t
= float_ptr_type_node
;
19978 field_t
= double_type_node
;
19979 field_ptr_t
= double_ptr_type_node
;
19982 field_t
= long_double_type_node
;
19983 field_ptr_t
= long_double_ptr_type_node
;
19986 field_t
= dfloat32_type_node
;
19987 field_ptr_t
= build_pointer_type (dfloat32_type_node
);
19990 field_t
= dfloat64_type_node
;
19991 field_ptr_t
= build_pointer_type (dfloat64_type_node
);
19994 field_t
= dfloat128_type_node
;
19995 field_ptr_t
= build_pointer_type (dfloat128_type_node
);
19998 field_t
= aarch64_fp16_type_node
;
19999 field_ptr_t
= aarch64_fp16_ptr_type_node
;
20002 field_t
= aarch64_bf16_type_node
;
20003 field_ptr_t
= aarch64_bf16_ptr_type_node
;
20008 tree innertype
= make_signed_type (GET_MODE_PRECISION (SImode
));
20009 field_t
= build_vector_type_for_mode (innertype
, ag_mode
);
20010 field_ptr_t
= build_pointer_type (field_t
);
20017 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
20018 TREE_ADDRESSABLE (tmp_ha
) = 1;
20019 tmp_ha
= build1 (ADDR_EXPR
, field_ptr_t
, tmp_ha
);
20021 t
= fold_convert (field_ptr_t
, addr
);
20022 t
= build2 (MODIFY_EXPR
, field_t
,
20023 build1 (INDIRECT_REF
, field_t
, tmp_ha
),
20024 build1 (INDIRECT_REF
, field_t
, t
));
20026 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
20027 for (i
= 1; i
< nregs
; ++i
)
20029 addr
= fold_build_pointer_plus_hwi (addr
, UNITS_PER_VREG
);
20030 u
= fold_convert (field_ptr_t
, addr
);
20031 u
= build2 (MODIFY_EXPR
, field_t
,
20032 build2 (MEM_REF
, field_t
, tmp_ha
,
20033 build_int_cst (field_ptr_t
,
20035 int_size_in_bytes (field_t
)))),
20036 build1 (INDIRECT_REF
, field_t
, u
));
20037 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), t
, u
);
20040 u
= fold_convert (TREE_TYPE (f_top
), tmp_ha
);
20041 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (f_top
), t
, u
);
20044 COND_EXPR_ELSE (cond2
) = t
;
20045 addr
= fold_convert (build_pointer_type (type
), cond1
);
20046 addr
= build_va_arg_indirect_ref (addr
);
20049 addr
= build_va_arg_indirect_ref (addr
);
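
/* Illustrative sketch (not part of the original source) of the expression
   tree built above, written as C-like pseudo-code for the GP-register case
   (the FP/SIMD case uses __vr_top/__vr_offs and 16-byte units instead, and
   the rounding for double-word-aligned arguments is omitted here):

     off = ap.__gr_offs;
     if (off >= 0)
       addr = on_stack;               // register save area already exhausted
     else
       {
         ap.__gr_offs = off + rsize;
         if (ap.__gr_offs > 0)
           addr = on_stack;           // this argument overflowed to the stack
         else
           addr = ap.__gr_top + off;  // argument lives in the save area
       }  */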
/* Implement TARGET_SETUP_INCOMING_VARARGS.  */

static void
aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
                                const function_arg_info &arg,
                                int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
  CUMULATIVE_ARGS local_cum;
  int gr_saved = cfun->va_list_gpr_size;
  int vr_saved = cfun->va_list_fpr_size;

  /* The caller has advanced CUM up to, but not beyond, the last named
     argument.  Advance a local copy of CUM past the last "real" named
     argument, to find out how many registers are left over.  */
  local_cum = *cum;
  if (!TYPE_NO_NAMED_ARGS_STDARG_P (TREE_TYPE (current_function_decl)))
    aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);

  /* Find out how many registers we need to save.
     Honor the tree-stdarg analysis results.  */
  if (cfun->va_list_gpr_size)
    gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
                    cfun->va_list_gpr_size / UNITS_PER_WORD);
  if (cfun->va_list_fpr_size)
    vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
                    cfun->va_list_fpr_size / UNITS_PER_VREG);
  gcc_assert (local_cum.aapcs_nvrn == 0);
  vr_saved = 0;

  /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
  ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
                       - gr_saved * UNITS_PER_WORD);
  mem = gen_frame_mem (BLKmode, ptr);
  set_mem_alias_set (mem, get_varargs_alias_set ());

  move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,

  /* We can't use move_block_from_reg, because it will use
     the wrong mode, storing D regs only.  */
  machine_mode mode = TImode;
  int off, i, vr_start;

  /* Set OFF to the offset from virtual_incoming_args_rtx of
     the first vector register.  The VR save area lies below
     the GR one, and is aligned to 16 bytes.  */
  off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
                   STACK_BOUNDARY / BITS_PER_UNIT);
  off -= vr_saved * UNITS_PER_VREG;

  vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
  for (i = 0; i < vr_saved; ++i)

      ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
      mem = gen_frame_mem (mode, ptr);
      set_mem_alias_set (mem, get_varargs_alias_set ());
      aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
      off += UNITS_PER_VREG;

  /* We don't save the size into *PRETEND_SIZE because we want to avoid
     any complication of having crtl->args.pretend_args_size changed.  */
  cfun->machine->frame.saved_varargs_size
    = (ROUND_UP (gr_saved * UNITS_PER_WORD,
                 STACK_BOUNDARY / BITS_PER_UNIT)
       + vr_saved * UNITS_PER_VREG);
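
/* Rough picture (assumption, for illustration only) of the varargs register
   save area set up above, growing towards lower addresses:

     virtual_incoming_args_rtx              -> ap.__gr_top
       gr_saved x 8-byte GP register slots
     (rounded down to a 16-byte boundary)   -> ap.__vr_top
       vr_saved x 16-byte FP/SIMD register slots  */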
aarch64_conditional_register_usage (void)
  for (i = V0_REGNUM; i <= V31_REGNUM; i++)
      call_used_regs[i] = 1;
      CLEAR_HARD_REG_BIT (operand_reg_set, i);

  for (i = P0_REGNUM; i <= P15_REGNUM; i++)
      call_used_regs[i] = 1;

  /* Only allow the FFR and FFRT to be accessed via special patterns.  */
  CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
  CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);

  /* When tracking speculation, we need a couple of call-clobbered registers
     to track the speculation state.  It would be nice to just use
     IP0 and IP1, but currently there are numerous places that just
     assume these registers are free for other uses (eg pointer
     authentication).  */
  if (aarch64_track_speculation)
      fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
      call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
      fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
      call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
/* Implement TARGET_MEMBER_TYPE_FORCES_BLK.  */

aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
  /* For records we're passed a FIELD_DECL, for arrays we're passed
     an ARRAY_TYPE.  In both cases we're interested in the TREE_TYPE.  */
  const_tree type = TREE_TYPE (field_or_array);

  /* Assign BLKmode to anything that contains multiple SVE predicates.
     For structures, the "multiple" case is indicated by MODE being
     VOIDmode.  */
  unsigned int num_zr, num_pr;
  if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr != 0)
      if (TREE_CODE (field_or_array) == ARRAY_TYPE)
        return !simple_cst_equal (TYPE_SIZE (field_or_array),
      return mode == VOIDmode;

  return default_member_type_forces_blk (field_or_array, mode);
/* Bitmasks that indicate whether earlier versions of GCC would have
   taken a different path through the ABI logic.  This should result in
   a -Wpsabi warning if the earlier path led to a different ABI decision.

   WARN_PSABI_EMPTY_CXX17_BASE
      Indicates that the type includes an artificial empty C++17 base field
      that, prior to GCC 10.1, would prevent the type from being treated as
      a HFA or HVA.  See PR94383 for details.

   WARN_PSABI_NO_UNIQUE_ADDRESS
      Indicates that the type includes an empty [[no_unique_address]] field
      that, prior to GCC 10.1, would prevent the type from being treated as
      a HFA or HVA.  */
const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0;
const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1;
const unsigned int WARN_PSABI_ZERO_WIDTH_BITFIELD = 1U << 2;
20218 /* Walk down the type tree of TYPE counting consecutive base elements.
20219 If *MODEP is VOIDmode, then set it to the first valid floating point
20220 type. If a non-floating point type is found, or if a floating point
20221 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
20222 otherwise return the count in the sub-tree.
20224 The WARN_PSABI_FLAGS argument allows the caller to check whether this
20225 function has changed its behavior relative to earlier versions of GCC.
20226 Normally the argument should be nonnull and point to a zero-initialized
20227 variable. The function then records whether the ABI decision might
20228 be affected by a known fix to the ABI logic, setting the associated
20229 WARN_PSABI_* bits if so.
20231 When the argument is instead a null pointer, the function tries to
20232 simulate the behavior of GCC before all such ABI fixes were made.
20233 This is useful to check whether the function returns something
20234 different after the ABI fixes. */
20236 aapcs_vfp_sub_candidate (const_tree type
, machine_mode
*modep
,
20237 unsigned int *warn_psabi_flags
)
20240 HOST_WIDE_INT size
;
20242 if (aarch64_sve::builtin_type_p (type
))
20245 switch (TREE_CODE (type
))
20248 mode
= TYPE_MODE (type
);
20249 if (mode
!= DFmode
&& mode
!= SFmode
20250 && mode
!= TFmode
&& mode
!= HFmode
20251 && mode
!= SDmode
&& mode
!= DDmode
&& mode
!= TDmode
)
20254 if (*modep
== VOIDmode
)
20257 if (*modep
== mode
)
20263 mode
= TYPE_MODE (TREE_TYPE (type
));
20264 if (mode
!= DFmode
&& mode
!= SFmode
20265 && mode
!= TFmode
&& mode
!= HFmode
)
20268 if (*modep
== VOIDmode
)
20271 if (*modep
== mode
)
20277 /* Use V2SImode and V4SImode as representatives of all 64-bit
20278 and 128-bit vector types. */
20279 size
= int_size_in_bytes (type
);
20292 if (*modep
== VOIDmode
)
20295 /* Vector modes are considered to be opaque: two vectors are
20296 equivalent for the purposes of being homogeneous aggregates
20297 if they are the same size. */
20298 if (*modep
== mode
)
20306 tree index
= TYPE_DOMAIN (type
);
20308 /* Can't handle incomplete types nor sizes that are not
20310 if (!COMPLETE_TYPE_P (type
)
20311 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
20314 count
= aapcs_vfp_sub_candidate (TREE_TYPE (type
), modep
,
20318 || !TYPE_MAX_VALUE (index
)
20319 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index
))
20320 || !TYPE_MIN_VALUE (index
)
20321 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index
))
20325 count
*= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index
))
20326 - tree_to_uhwi (TYPE_MIN_VALUE (index
)));
20328 /* There must be no padding. */
20329 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
20330 count
* GET_MODE_BITSIZE (*modep
)))
20342 /* Can't handle incomplete types nor sizes that are not
20344 if (!COMPLETE_TYPE_P (type
)
20345 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
20348 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
20350 if (TREE_CODE (field
) != FIELD_DECL
)
20353 if (DECL_FIELD_ABI_IGNORED (field
))
20355 /* See whether this is something that earlier versions of
20356 GCC failed to ignore. */
20358 if (lookup_attribute ("no_unique_address",
20359 DECL_ATTRIBUTES (field
)))
20360 flag
= WARN_PSABI_NO_UNIQUE_ADDRESS
;
20361 else if (cxx17_empty_base_field_p (field
))
20362 flag
= WARN_PSABI_EMPTY_CXX17_BASE
;
20364 /* No compatibility problem. */
20367 /* Simulate the old behavior when WARN_PSABI_FLAGS is null. */
20368 if (warn_psabi_flags
)
20370 *warn_psabi_flags
|= flag
;
      /* A zero-width bitfield may affect layout in some
         circumstances, but adds no members.  The determination
         of whether or not a type is an HFA is performed after
         layout is complete, so if the type still looks like an
         HFA afterwards, it is still classed as one.  This is
         potentially an ABI break for the hard-float ABI.  */
      else if (DECL_BIT_FIELD (field)
               && integer_zerop (DECL_SIZE (field)))
          /* Prior to GCC 12 these fields were stripped early,
             hiding them from the back-end entirely and
             resulting in the correct behaviour for argument
             passing.  Simulate that old behaviour without
             generating a warning.  */
          if (DECL_FIELD_CXX_ZERO_WIDTH_BIT_FIELD (field))

          if (warn_psabi_flags)
              *warn_psabi_flags |= WARN_PSABI_ZERO_WIDTH_BITFIELD;
20397 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
,
20401 count
+= sub_count
;
20404 /* There must be no padding. */
20405 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
20406 count
* GET_MODE_BITSIZE (*modep
)))
20413 case QUAL_UNION_TYPE
:
20415 /* These aren't very interesting except in a degenerate case. */
20420 /* Can't handle incomplete types nor sizes that are not
20422 if (!COMPLETE_TYPE_P (type
)
20423 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
20426 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
20428 if (TREE_CODE (field
) != FIELD_DECL
)
20431 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
,
20435 count
= count
> sub_count
? count
: sub_count
;
20438 /* There must be no padding. */
20439 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
20440 count
* GET_MODE_BITSIZE (*modep
)))
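
/* Worked example (illustration only, not from the original source): for

     struct hfa { double a, b, c; };

   the walk above finds three consecutive DFmode elements with no padding, so
   the struct is a homogeneous floating-point aggregate and is a candidate for
   being passed in FP/SIMD registers; adding an int member (or introducing
   padding) makes the walk return -1 instead.  */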
20453 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
20454 type as described in AAPCS64 \S 4.1.2.
20456 See the comment above aarch64_composite_type_p for the notes on MODE. */
20459 aarch64_short_vector_p (const_tree type
,
20462 poly_int64 size
= -1;
20464 if (type
&& TREE_CODE (type
) == VECTOR_TYPE
)
20466 if (aarch64_sve::builtin_type_p (type
))
20468 size
= int_size_in_bytes (type
);
20470 else if (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
20471 || GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
)
20473 /* The containing "else if" is too loose: it means that we look at TYPE
20474 if the type is a vector type (good), but that we otherwise ignore TYPE
20475 and look only at the mode. This is wrong because the type describes
20476 the language-level information whereas the mode is purely an internal
20477 GCC concept. We can therefore reach here for types that are not
20478 vectors in the AAPCS64 sense.
20480 We can't "fix" that for the traditional Advanced SIMD vector modes
20481 without breaking backwards compatibility. However, there's no such
20482 baggage for the structure modes, which were introduced in GCC 12. */
20483 if (aarch64_advsimd_struct_mode_p (mode
))
20486 /* For similar reasons, rely only on the type, not the mode, when
20487 processing SVE types. */
20488 if (type
&& aarch64_some_values_include_pst_objects_p (type
))
20489 /* Leave later code to report an error if SVE is disabled. */
20490 gcc_assert (!TARGET_SVE
|| aarch64_sve_mode_p (mode
));
20492 size
= GET_MODE_SIZE (mode
);
20494 if (known_eq (size
, 8) || known_eq (size
, 16))
20496 /* 64-bit and 128-bit vectors should only acquire an SVE mode if
20497 they are being treated as scalable AAPCS64 types. */
20498 gcc_assert (!aarch64_sve_mode_p (mode
)
20499 && !aarch64_advsimd_struct_mode_p (mode
));
20505 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
20506 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
20507 array types. The C99 floating-point complex types are also considered
20508 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
20509 types, which are GCC extensions and out of the scope of AAPCS64, are
20510 treated as composite types here as well.
20512 Note that MODE itself is not sufficient in determining whether a type
20513 is such a composite type or not. This is because
20514 stor-layout.cc:compute_record_mode may have already changed the MODE
20515 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
20516 structure with only one field may have its MODE set to the mode of the
20517 field. Also an integer mode whose size matches the size of the
20518 RECORD_TYPE type may be used to substitute the original mode
20519 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
20520 solely relied on. */
20523 aarch64_composite_type_p (const_tree type
,
20526 if (aarch64_short_vector_p (type
, mode
))
20529 if (type
&& (AGGREGATE_TYPE_P (type
) || TREE_CODE (type
) == COMPLEX_TYPE
))
20532 if (mode
== BLKmode
20533 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
20534 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_INT
)
20540 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
20541 shall be passed or returned in simd/fp register(s) (providing these
20542 parameter passing registers are available).
20544 Upon successful return, *COUNT returns the number of needed registers,
20545 *BASE_MODE returns the mode of the individual register and when IS_HA
20546 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
20547 floating-point aggregate or a homogeneous short-vector aggregate.
20549 SILENT_P is true if the function should refrain from reporting any
20550 diagnostics. This should only be used if the caller is certain that
20551 any ABI decisions would eventually come through this function with
20552 SILENT_P set to false. */
20555 aarch64_vfp_is_call_or_return_candidate (machine_mode mode
,
20557 machine_mode
*base_mode
,
20562 if (is_ha
!= NULL
) *is_ha
= false;
20564 machine_mode new_mode
= VOIDmode
;
20565 bool composite_p
= aarch64_composite_type_p (type
, mode
);
20568 && (GET_MODE_CLASS (mode
) == MODE_FLOAT
20569 || GET_MODE_CLASS (mode
) == MODE_DECIMAL_FLOAT
))
20570 || aarch64_short_vector_p (type
, mode
))
20575 else if (GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
)
20577 if (is_ha
!= NULL
) *is_ha
= true;
20579 new_mode
= GET_MODE_INNER (mode
);
20581 else if (type
&& composite_p
)
20583 unsigned int warn_psabi_flags
= 0;
20584 int ag_count
= aapcs_vfp_sub_candidate (type
, &new_mode
,
20585 &warn_psabi_flags
);
20586 if (ag_count
> 0 && ag_count
<= HA_MAX_NUM_FLDS
)
20588 static unsigned last_reported_type_uid
;
20589 unsigned uid
= TYPE_UID (TYPE_MAIN_VARIANT (type
));
20593 && warn_psabi_flags
20594 && uid
!= last_reported_type_uid
20595 && ((alt
= aapcs_vfp_sub_candidate (type
, &new_mode
, NULL
))
20599 = CHANGES_ROOT_URL
"gcc-10/changes.html#empty_base";
20601 = CHANGES_ROOT_URL
"gcc-12/changes.html#zero_width_bitfields";
20602 gcc_assert (alt
== -1);
20603 last_reported_type_uid
= uid
;
20604 /* Use TYPE_MAIN_VARIANT to strip any redundant const
20606 if (warn_psabi_flags
& WARN_PSABI_NO_UNIQUE_ADDRESS
)
20607 inform (input_location
, "parameter passing for argument of "
20608 "type %qT with %<[[no_unique_address]]%> members "
20609 "changed %{in GCC 10.1%}",
20610 TYPE_MAIN_VARIANT (type
), url10
);
20611 else if (warn_psabi_flags
& WARN_PSABI_EMPTY_CXX17_BASE
)
20612 inform (input_location
, "parameter passing for argument of "
20613 "type %qT when C++17 is enabled changed to match "
20614 "C++14 %{in GCC 10.1%}",
20615 TYPE_MAIN_VARIANT (type
), url10
);
20616 else if (warn_psabi_flags
& WARN_PSABI_ZERO_WIDTH_BITFIELD
)
20617 inform (input_location
, "parameter passing for argument of "
20618 "type %qT changed %{in GCC 12.1%}",
20619 TYPE_MAIN_VARIANT (type
), url12
);
20622 if (is_ha
!= NULL
) *is_ha
= true;
20631 gcc_assert (!aarch64_sve_mode_p (new_mode
));
20632 *base_mode
= new_mode
;
20636 /* Implement TARGET_STRUCT_VALUE_RTX. */
20639 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED
,
20640 int incoming ATTRIBUTE_UNUSED
)
20642 return gen_rtx_REG (Pmode
, AARCH64_STRUCT_VALUE_REGNUM
);
20645 /* Implements target hook vector_mode_supported_p. */
20647 aarch64_vector_mode_supported_p (machine_mode mode
)
20649 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
20650 return vec_flags
!= 0 && (vec_flags
& VEC_STRUCT
) == 0;
20653 /* Return the full-width SVE vector mode for element mode MODE, if one
20656 aarch64_full_sve_mode (scalar_mode mode
)
20675 return VNx16QImode
;
20677 return opt_machine_mode ();
20681 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
20684 aarch64_vq_mode (scalar_mode mode
)
20705 return opt_machine_mode ();
20709 /* Return appropriate SIMD container
20710 for MODE within a vector of WIDTH bits. */
20711 static machine_mode
20712 aarch64_simd_container_mode (scalar_mode mode
, poly_int64 width
)
20715 && maybe_ne (width
, 128)
20716 && known_eq (width
, BITS_PER_SVE_VECTOR
))
20717 return aarch64_full_sve_mode (mode
).else_mode (word_mode
);
20719 gcc_assert (known_eq (width
, 64) || known_eq (width
, 128));
20722 if (known_eq (width
, 128))
20723 return aarch64_vq_mode (mode
).else_mode (word_mode
);
20746 /* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
20747 and return whether the SVE mode should be preferred over the
20748 Advanced SIMD one in aarch64_autovectorize_vector_modes. */
20750 aarch64_cmp_autovec_modes (machine_mode sve_m
, machine_mode asimd_m
)
20752 /* Take into account the aarch64-autovec-preference param if non-zero. */
20753 bool only_asimd_p
= aarch64_autovec_preference
== 1;
20754 bool only_sve_p
= aarch64_autovec_preference
== 2;
20761 /* The preference in case of a tie in costs. */
20762 bool prefer_asimd
= aarch64_autovec_preference
== 3;
20763 bool prefer_sve
= aarch64_autovec_preference
== 4;
20765 poly_int64 nunits_sve
= GET_MODE_NUNITS (sve_m
);
20766 poly_int64 nunits_asimd
= GET_MODE_NUNITS (asimd_m
);
20767 /* If the CPU information does not have an SVE width registered use the
20768 generic poly_int comparison that prefers SVE. If a preference is
20769 explicitly requested avoid this path. */
20770 if (aarch64_tune_params
.sve_width
== SVE_SCALABLE
20773 return maybe_gt (nunits_sve
, nunits_asimd
);
20775 /* Otherwise estimate the runtime width of the modes involved. */
20776 HOST_WIDE_INT est_sve
= estimated_poly_value (nunits_sve
);
20777 HOST_WIDE_INT est_asimd
= estimated_poly_value (nunits_asimd
);
20779 /* Preferring SVE means picking it first unless the Advanced SIMD mode
20780 is clearly wider. */
20782 return est_sve
>= est_asimd
;
20783 /* Conversely, preferring Advanced SIMD means picking SVE only if SVE
20784 is clearly wider. */
20786 return est_sve
> est_asimd
;
20788 /* In the default case prefer Advanced SIMD over SVE in case of a tie. */
20789 return est_sve
> est_asimd
;
/* Return 128-bit container as the preferred SIMD mode for MODE.  */
static machine_mode
aarch64_preferred_simd_mode (scalar_mode mode)
  /* Take into account explicit auto-vectorization ISA preferences through
     aarch64_cmp_autovec_modes.  */
  if (TARGET_SVE && aarch64_cmp_autovec_modes (VNx16QImode, V16QImode))
    return aarch64_full_sve_mode (mode).else_mode (word_mode);
    return aarch64_vq_mode (mode).else_mode (word_mode);
20805 /* Return a list of possible vector sizes for the vectorizer
20806 to iterate over. */
20807 static unsigned int
20808 aarch64_autovectorize_vector_modes (vector_modes
*modes
, bool)
20810 static const machine_mode sve_modes
[] = {
20811 /* Try using full vectors for all element types. */
20814 /* Try using 16-bit containers for 8-bit elements and full vectors
20815 for wider elements. */
20818 /* Try using 32-bit containers for 8-bit and 16-bit elements and
20819 full vectors for wider elements. */
20822 /* Try using 64-bit containers for all element types. */
20826 static const machine_mode advsimd_modes
[] = {
20827 /* Try using 128-bit vectors for all element types. */
20830 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
20831 for wider elements. */
20834 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
20835 for wider elements.
20837 TODO: We could support a limited form of V4QImode too, so that
20838 we use 32-bit vectors for 8-bit elements. */
20841 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
20842 for 64-bit elements.
20844 TODO: We could similarly support limited forms of V2QImode and V2HImode
20849 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
20852 - If we can't use N-byte Advanced SIMD vectors then the placement
20853 doesn't matter; we'll just continue as though the Advanced SIMD
20854 entry didn't exist.
20856 - If an SVE main loop with N bytes ends up being cheaper than an
20857 Advanced SIMD main loop with N bytes then by default we'll replace
20858 the Advanced SIMD version with the SVE one.
20860 - If an Advanced SIMD main loop with N bytes ends up being cheaper
20861 than an SVE main loop with N bytes then by default we'll try to
20862 use the SVE loop to vectorize the epilogue instead. */
20864 bool only_asimd_p
= aarch64_autovec_preference
== 1;
20865 bool only_sve_p
= aarch64_autovec_preference
== 2;
20867 unsigned int sve_i
= (TARGET_SVE
&& !only_asimd_p
) ? 0 : ARRAY_SIZE (sve_modes
);
20868 unsigned int advsimd_i
= 0;
20870 while (!only_sve_p
&& advsimd_i
< ARRAY_SIZE (advsimd_modes
))
20872 if (sve_i
< ARRAY_SIZE (sve_modes
)
20873 && aarch64_cmp_autovec_modes (sve_modes
[sve_i
],
20874 advsimd_modes
[advsimd_i
]))
20875 modes
->safe_push (sve_modes
[sve_i
++]);
20877 modes
->safe_push (advsimd_modes
[advsimd_i
++]);
20879 while (sve_i
< ARRAY_SIZE (sve_modes
))
20880 modes
->safe_push (sve_modes
[sve_i
++]);
20882 unsigned int flags
= 0;
20883 /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
20884 can compare SVE against Advanced SIMD and so that we can compare
20885 multiple SVE vectorization approaches against each other. There's
20886 not really any point doing this for Advanced SIMD only, since the
20887 first mode that works should always be the best. */
20888 if (TARGET_SVE
&& aarch64_sve_compare_costs
)
20889 flags
|= VECT_COMPARE_COSTS
;
20893 /* Implement TARGET_MANGLE_TYPE. */
20895 static const char *
20896 aarch64_mangle_type (const_tree type
)
20898 /* The AArch64 ABI documents say that "__va_list" has to be
20899 mangled as if it is in the "std" namespace. */
20900 if (lang_hooks
.types_compatible_p (CONST_CAST_TREE (type
), va_list_type
))
20901 return "St9__va_list";
20903 /* Half-precision floating point types. */
20904 if (TREE_CODE (type
) == REAL_TYPE
&& TYPE_PRECISION (type
) == 16)
20906 if (TYPE_MAIN_VARIANT (type
) == float16_type_node
)
20908 if (TYPE_MODE (type
) == BFmode
)
20914 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
20916 if (TYPE_NAME (type
) != NULL
)
20919 if ((res
= aarch64_general_mangle_builtin_type (type
))
20920 || (res
= aarch64_sve::mangle_builtin_type (type
)))
20924 /* Use the default mangling. */
20928 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
20931 aarch64_verify_type_context (location_t loc
, type_context_kind context
,
20932 const_tree type
, bool silent_p
)
20934 return aarch64_sve::verify_type_context (loc
, context
, type
, silent_p
);
20937 /* Find the first rtx_insn before insn that will generate an assembly
20941 aarch64_prev_real_insn (rtx_insn
*insn
)
20948 insn
= prev_real_insn (insn
);
20950 while (insn
&& recog_memoized (insn
) < 0);
20956 is_madd_op (enum attr_type t1
)
20959 /* A number of these may be AArch32 only. */
20960 enum attr_type mlatypes
[] = {
20961 TYPE_MLA
, TYPE_MLAS
, TYPE_SMLAD
, TYPE_SMLADX
, TYPE_SMLAL
, TYPE_SMLALD
,
20962 TYPE_SMLALS
, TYPE_SMLALXY
, TYPE_SMLAWX
, TYPE_SMLAWY
, TYPE_SMLAXY
,
20963 TYPE_SMMLA
, TYPE_UMLAL
, TYPE_UMLALS
,TYPE_SMLSD
, TYPE_SMLSDX
, TYPE_SMLSLD
20966 for (i
= 0; i
< ARRAY_SIZE (mlatypes
); i
++)
20968 if (t1
== mlatypes
[i
])
20975 /* Check if there is a register dependency between a load and the insn
20976 for which we hold recog_data. */
20979 dep_between_memop_and_curr (rtx memop
)
20984 gcc_assert (GET_CODE (memop
) == SET
);
20986 if (!REG_P (SET_DEST (memop
)))
20989 load_reg
= SET_DEST (memop
);
20990 for (opno
= 1; opno
< recog_data
.n_operands
; opno
++)
20992 rtx operand
= recog_data
.operand
[opno
];
20993 if (REG_P (operand
)
20994 && reg_overlap_mentioned_p (load_reg
, operand
))
21002 /* When working around the Cortex-A53 erratum 835769,
21003 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
21004 instruction and has a preceding memory instruction such that a NOP
21005 should be inserted between them. */
21008 aarch64_madd_needs_nop (rtx_insn
* insn
)
21010 enum attr_type attr_type
;
21014 if (!TARGET_FIX_ERR_A53_835769
)
21017 if (!INSN_P (insn
) || recog_memoized (insn
) < 0)
21020 attr_type
= get_attr_type (insn
);
21021 if (!is_madd_op (attr_type
))
21024 prev
= aarch64_prev_real_insn (insn
);
21025 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
21026 Restore recog state to INSN to avoid state corruption. */
21027 extract_constrain_insn_cached (insn
);
21029 if (!prev
|| !contains_mem_rtx_p (PATTERN (prev
)))
21032 body
= single_set (prev
);
21034 /* If the previous insn is a memory op and there is no dependency between
21035 it and the DImode madd, emit a NOP between them. If body is NULL then we
21036 have a complex memory operation, probably a load/store pair.
21037 Be conservative for now and emit a NOP. */
21038 if (GET_MODE (recog_data
.operand
[0]) == DImode
21039 && (!body
|| !dep_between_memop_and_curr (body
)))
/* Implement FINAL_PRESCAN_INSN.  */

void
aarch64_final_prescan_insn (rtx_insn *insn)
{
  if (aarch64_madd_needs_nop (insn))
    fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
}
21057 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
21061 aarch64_sve_index_immediate_p (rtx base_or_step
)
21063 return (CONST_INT_P (base_or_step
)
21064 && IN_RANGE (INTVAL (base_or_step
), -16, 15));
21067 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
21068 when applied to mode MODE. Negate X first if NEGATE_P is true. */
21071 aarch64_sve_arith_immediate_p (machine_mode mode
, rtx x
, bool negate_p
)
21073 rtx elt
= unwrap_const_vec_duplicate (x
);
21074 if (!CONST_INT_P (elt
))
21077 HOST_WIDE_INT val
= INTVAL (elt
);
21080 val
&= GET_MODE_MASK (GET_MODE_INNER (mode
));
21083 return IN_RANGE (val
, 0, 0xff);
21084 return IN_RANGE (val
, 0, 0xff00);
21087 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
21088 instructions when applied to mode MODE. Negate X first if NEGATE_P
21092 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode
, rtx x
, bool negate_p
)
21094 if (!aarch64_sve_arith_immediate_p (mode
, x
, negate_p
))
21097 /* After the optional negation, the immediate must be nonnegative.
21098 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
21099 instead of SQADD Zn.B, Zn.B, #129. */
21100 rtx elt
= unwrap_const_vec_duplicate (x
);
21101 return negate_p
== (INTVAL (elt
) < 0);
21104 /* Return true if X is a valid immediate operand for an SVE logical
21105 instruction such as AND. */
21108 aarch64_sve_bitmask_immediate_p (rtx x
)
21112 return (const_vec_duplicate_p (x
, &elt
)
21113 && CONST_INT_P (elt
)
21114 && aarch64_bitmask_imm (INTVAL (elt
),
21115 GET_MODE_INNER (GET_MODE (x
))));
21118 /* Return true if X is a valid immediate for the SVE DUP and CPY
21122 aarch64_sve_dup_immediate_p (rtx x
)
21124 x
= aarch64_bit_representation (unwrap_const_vec_duplicate (x
));
21125 if (!CONST_INT_P (x
))
21128 HOST_WIDE_INT val
= INTVAL (x
);
21130 return IN_RANGE (val
, -0x80, 0x7f);
21131 return IN_RANGE (val
, -0x8000, 0x7f00);
21134 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
21135 SIGNED_P says whether the operand is signed rather than unsigned. */
21138 aarch64_sve_cmp_immediate_p (rtx x
, bool signed_p
)
21140 x
= unwrap_const_vec_duplicate (x
);
21141 return (CONST_INT_P (x
)
21143 ? IN_RANGE (INTVAL (x
), -16, 15)
21144 : IN_RANGE (INTVAL (x
), 0, 127)));
21147 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
21148 instruction. Negate X first if NEGATE_P is true. */
21151 aarch64_sve_float_arith_immediate_p (rtx x
, bool negate_p
)
21156 if (!const_vec_duplicate_p (x
, &elt
)
21157 || !CONST_DOUBLE_P (elt
))
21160 r
= *CONST_DOUBLE_REAL_VALUE (elt
);
21163 r
= real_value_negate (&r
);
21165 if (real_equal (&r
, &dconst1
))
21167 if (real_equal (&r
, &dconsthalf
))
21172 /* Return true if X is a valid immediate operand for an SVE FMUL
21176 aarch64_sve_float_mul_immediate_p (rtx x
)
21180 return (const_vec_duplicate_p (x
, &elt
)
21181 && CONST_DOUBLE_P (elt
)
21182 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt
), &dconsthalf
)
21183 || real_equal (CONST_DOUBLE_REAL_VALUE (elt
), &dconst2
)));
/* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
   for the Advanced SIMD operation described by WHICH and INSN.  If INFO
   is nonnull, use it to describe valid immediates.  */

aarch64_advsimd_valid_immediate_hs (unsigned int val32,
                                    simd_immediate_info *info,
                                    enum simd_immediate_check which,
                                    simd_immediate_info::insn_type insn)
  /* Try a 4-byte immediate with LSL.  */
  for (unsigned int shift = 0; shift < 32; shift += 8)
    if ((val32 & (0xff << shift)) == val32)
        *info = simd_immediate_info (SImode, val32 >> shift, insn,
                                     simd_immediate_info::LSL, shift);

  /* Try a 2-byte immediate with LSL.  */
  unsigned int imm16 = val32 & 0xffff;
  if (imm16 == (val32 >> 16))
    for (unsigned int shift = 0; shift < 16; shift += 8)
      if ((imm16 & (0xff << shift)) == imm16)
          *info = simd_immediate_info (HImode, imm16 >> shift, insn,
                                       simd_immediate_info::LSL, shift);

  /* Try a 4-byte immediate with MSL, except for cases that MVN
     can handle.  */
  if (which == AARCH64_CHECK_MOV)
    for (unsigned int shift = 8; shift < 24; shift += 8)
        unsigned int low = (1 << shift) - 1;
        if (((val32 & (0xff << shift)) | low) == val32)
            *info = simd_immediate_info (SImode, val32 >> shift, insn,
                                         simd_immediate_info::MSL, shift);
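
/* Examples (illustration only): replicating 0x0000ab00 matches the 4-byte
   LSL case above with shift == 8 and can be encoded as
   "movi vd.4s, #0xab, lsl #8", while 0x0000abff matches the MSL case with
   shift == 8 ("movi vd.4s, #0xab, msl #8", where MSL shifts in ones).  */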
21235 /* Return true if replicating VAL64 is a valid immediate for the
21236 Advanced SIMD operation described by WHICH. If INFO is nonnull,
21237 use it to describe valid immediates. */
21239 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64
,
21240 simd_immediate_info
*info
,
21241 enum simd_immediate_check which
)
21243 unsigned int val32
= val64
& 0xffffffff;
21244 unsigned int val16
= val64
& 0xffff;
21245 unsigned int val8
= val64
& 0xff;
21247 if (val32
== (val64
>> 32))
21249 if ((which
& AARCH64_CHECK_ORR
) != 0
21250 && aarch64_advsimd_valid_immediate_hs (val32
, info
, which
,
21251 simd_immediate_info::MOV
))
21254 if ((which
& AARCH64_CHECK_BIC
) != 0
21255 && aarch64_advsimd_valid_immediate_hs (~val32
, info
, which
,
21256 simd_immediate_info::MVN
))
21259 /* Try using a replicated byte. */
21260 if (which
== AARCH64_CHECK_MOV
21261 && val16
== (val32
>> 16)
21262 && val8
== (val16
>> 8))
21265 *info
= simd_immediate_info (QImode
, val8
);
21270 /* Try using a bit-to-bytemask. */
21271 if (which
== AARCH64_CHECK_MOV
)
21274 for (i
= 0; i
< 64; i
+= 8)
21276 unsigned char byte
= (val64
>> i
) & 0xff;
21277 if (byte
!= 0 && byte
!= 0xff)
21283 *info
= simd_immediate_info (DImode
, val64
);
21290 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
21291 instruction. If INFO is nonnull, use it to describe valid immediates. */
21294 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64
,
21295 simd_immediate_info
*info
)
21297 scalar_int_mode mode
= DImode
;
21298 unsigned int val32
= val64
& 0xffffffff;
21299 if (val32
== (val64
>> 32))
21302 unsigned int val16
= val32
& 0xffff;
21303 if (val16
== (val32
>> 16))
21306 unsigned int val8
= val16
& 0xff;
21307 if (val8
== (val16
>> 8))
21311 HOST_WIDE_INT val
= trunc_int_for_mode (val64
, mode
);
21312 if (IN_RANGE (val
, -0x80, 0x7f))
21314 /* DUP with no shift. */
21316 *info
= simd_immediate_info (mode
, val
);
21319 if ((val
& 0xff) == 0 && IN_RANGE (val
, -0x8000, 0x7f00))
21321 /* DUP with LSL #8. */
21323 *info
= simd_immediate_info (mode
, val
);
21326 if (aarch64_bitmask_imm (val64
, mode
))
21330 *info
= simd_immediate_info (mode
, val
);
21336 /* Return true if X is an UNSPEC_PTRUE constant of the form:
21338 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
21340 where PATTERN is the svpattern as a CONST_INT and where ZERO
21341 is a zero constant of the required PTRUE mode (which can have
21342 fewer elements than X's mode, if zero bits are significant).
21344 If so, and if INFO is nonnull, describe the immediate in INFO. */
21346 aarch64_sve_ptrue_svpattern_p (rtx x
, struct simd_immediate_info
*info
)
21348 if (GET_CODE (x
) != CONST
)
21352 if (GET_CODE (x
) != UNSPEC
|| XINT (x
, 1) != UNSPEC_PTRUE
)
21357 aarch64_svpattern pattern
21358 = (aarch64_svpattern
) INTVAL (XVECEXP (x
, 0, 0));
21359 machine_mode pred_mode
= GET_MODE (XVECEXP (x
, 0, 1));
21360 scalar_int_mode int_mode
= aarch64_sve_element_int_mode (pred_mode
);
21361 *info
= simd_immediate_info (int_mode
, pattern
);
21366 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
21367 it to describe valid immediates. */
21370 aarch64_sve_pred_valid_immediate (rtx x
, simd_immediate_info
*info
)
21372 if (aarch64_sve_ptrue_svpattern_p (x
, info
))
21375 if (x
== CONST0_RTX (GET_MODE (x
)))
21378 *info
= simd_immediate_info (DImode
, 0);
21382 /* Analyze the value as a VNx16BImode. This should be relatively
21383 efficient, since rtx_vector_builder has enough built-in capacity
21384 to store all VLA predicate constants without needing the heap. */
21385 rtx_vector_builder builder
;
21386 if (!aarch64_get_sve_pred_bits (builder
, x
))
21389 unsigned int elt_size
= aarch64_widest_sve_pred_elt_size (builder
);
21390 if (int vl
= aarch64_partial_ptrue_length (builder
, elt_size
))
21392 machine_mode mode
= aarch64_sve_pred_mode (elt_size
).require ();
21393 aarch64_svpattern pattern
= aarch64_svpattern_for_vl (mode
, vl
);
21394 if (pattern
!= AARCH64_NUM_SVPATTERNS
)
21398 scalar_int_mode int_mode
= aarch64_sve_element_int_mode (mode
);
21399 *info
= simd_immediate_info (int_mode
, pattern
);
21407 /* Return true if OP is a valid SIMD immediate for the operation
21408 described by WHICH. If INFO is nonnull, use it to describe valid
21411 aarch64_simd_valid_immediate (rtx op
, simd_immediate_info
*info
,
21412 enum simd_immediate_check which
)
21414 machine_mode mode
= GET_MODE (op
);
21415 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
21416 if (vec_flags
== 0 || vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
21419 if ((vec_flags
& VEC_ADVSIMD
) && !TARGET_SIMD
)
21422 if (vec_flags
& VEC_SVE_PRED
)
21423 return aarch64_sve_pred_valid_immediate (op
, info
);
21425 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
21427 unsigned int n_elts
;
21428 if (CONST_VECTOR_P (op
)
21429 && CONST_VECTOR_DUPLICATE_P (op
))
21430 n_elts
= CONST_VECTOR_NPATTERNS (op
);
21431 else if ((vec_flags
& VEC_SVE_DATA
)
21432 && const_vec_series_p (op
, &base
, &step
))
21434 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
);
21435 if (!aarch64_sve_index_immediate_p (base
)
21436 || !aarch64_sve_index_immediate_p (step
))
21441 /* Get the corresponding container mode. E.g. an INDEX on V2SI
21442 should yield two integer values per 128-bit block, meaning
21443 that we need to treat it in the same way as V2DI and then
21444 ignore the upper 32 bits of each element. */
21445 elt_mode
= aarch64_sve_container_int_mode (mode
);
21446 *info
= simd_immediate_info (elt_mode
, base
, step
);
21450 else if (CONST_VECTOR_P (op
)
21451 && CONST_VECTOR_NUNITS (op
).is_constant (&n_elts
))
21452 /* N_ELTS set above. */;
21456 scalar_float_mode elt_float_mode
;
21458 && is_a
<scalar_float_mode
> (elt_mode
, &elt_float_mode
))
21460 rtx elt
= CONST_VECTOR_ENCODED_ELT (op
, 0);
21461 if (aarch64_float_const_zero_rtx_p (elt
)
21462 || aarch64_float_const_representable_p (elt
))
21465 *info
= simd_immediate_info (elt_float_mode
, elt
);
21470 /* If all elements in an SVE vector have the same value, we have a free
21471 choice between using the element mode and using the container mode.
21472 Using the element mode means that unused parts of the vector are
21473 duplicates of the used elements, while using the container mode means
21474 that the unused parts are an extension of the used elements. Using the
21475 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
21476 for its container mode VNx4SI while 0x00000101 isn't.
21478 If not all elements in an SVE vector have the same value, we need the
21479 transition from one element to the next to occur at container boundaries.
21480 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
21481 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
21482 scalar_int_mode elt_int_mode
;
21483 if ((vec_flags
& VEC_SVE_DATA
) && n_elts
> 1)
21484 elt_int_mode
= aarch64_sve_container_int_mode (mode
);
21486 elt_int_mode
= int_mode_for_mode (elt_mode
).require ();
21488 unsigned int elt_size
= GET_MODE_SIZE (elt_int_mode
);
21492 /* Expand the vector constant out into a byte vector, with the least
21493 significant byte of the register first. */
21494 auto_vec
<unsigned char, 16> bytes
;
21495 bytes
.reserve (n_elts
* elt_size
);
21496 for (unsigned int i
= 0; i
< n_elts
; i
++)
21498 /* The vector is provided in gcc endian-neutral fashion.
21499 For aarch64_be Advanced SIMD, it must be laid out in the vector
21500 register in reverse order. */
21501 bool swap_p
= ((vec_flags
& VEC_ADVSIMD
) != 0 && BYTES_BIG_ENDIAN
);
21502 rtx elt
= CONST_VECTOR_ELT (op
, swap_p
? (n_elts
- 1 - i
) : i
);
21504 if (elt_mode
!= elt_int_mode
)
21505 elt
= gen_lowpart (elt_int_mode
, elt
);
21507 if (!CONST_INT_P (elt
))
21510 unsigned HOST_WIDE_INT elt_val
= INTVAL (elt
);
21511 for (unsigned int byte
= 0; byte
< elt_size
; byte
++)
21513 bytes
.quick_push (elt_val
& 0xff);
21514 elt_val
>>= BITS_PER_UNIT
;
21518 /* The immediate must repeat every eight bytes. */
21519 unsigned int nbytes
= bytes
.length ();
21520 for (unsigned i
= 8; i
< nbytes
; ++i
)
21521 if (bytes
[i
] != bytes
[i
- 8])
21524 /* Get the repeating 8-byte value as an integer. No endian correction
21525 is needed here because bytes is already in lsb-first order. */
21526 unsigned HOST_WIDE_INT val64
= 0;
21527 for (unsigned int i
= 0; i
< 8; i
++)
21528 val64
|= ((unsigned HOST_WIDE_INT
) bytes
[i
% nbytes
]
21529 << (i
* BITS_PER_UNIT
));
21531 if (vec_flags
& VEC_SVE_DATA
)
21532 return aarch64_sve_valid_immediate (val64
, info
);
21534 return aarch64_advsimd_valid_immediate (val64
, info
, which
);
21537 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
21538 has a step in the range of INDEX. Return the index expression if so,
21539 otherwise return null. */
21541 aarch64_check_zero_based_sve_index_immediate (rtx x
)
21544 if (const_vec_series_p (x
, &base
, &step
)
21545 && base
== const0_rtx
21546 && aarch64_sve_index_immediate_p (step
))
/* Check whether immediate shift constants are within range.  */
bool
aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
{
  x = unwrap_const_vec_duplicate (x);
  if (!CONST_INT_P (x))
    return false;

  int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
  if (left)
    return IN_RANGE (INTVAL (x), 0, bit_width - 1);
  else
    return IN_RANGE (INTVAL (x), 1, bit_width);
}

/* Return the bitmask CONST_INT to select the bits required by a zero extract
   operation of width WIDTH at bit position POS.  */

rtx
aarch64_mask_from_zextract_ops (rtx width, rtx pos)
{
  gcc_assert (CONST_INT_P (width));
  gcc_assert (CONST_INT_P (pos));

  unsigned HOST_WIDE_INT mask
    = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
  return GEN_INT (mask << UINTVAL (pos));
}
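
/* Example (illustration only): for WIDTH == 8 and POS == 16 the function
   above returns the CONST_INT 0x00ff0000, i.e. ((1 << 8) - 1) << 16.  */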
21580 aarch64_mov_operand_p (rtx x
, machine_mode mode
)
21582 if (GET_CODE (x
) == HIGH
21583 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
21586 if (CONST_INT_P (x
))
21589 if (VECTOR_MODE_P (GET_MODE (x
)))
21591 /* Require predicate constants to be VNx16BI before RA, so that we
21592 force everything to have a canonical form. */
21593 if (!lra_in_progress
21594 && !reload_completed
21595 && GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_BOOL
21596 && GET_MODE (x
) != VNx16BImode
)
21599 return aarch64_simd_valid_immediate (x
, NULL
);
21602 /* Remove UNSPEC_SALT_ADDR before checking symbol reference. */
21603 x
= strip_salt (x
);
21605 /* GOT accesses are valid moves. */
21606 if (SYMBOL_REF_P (x
)
21607 && aarch64_classify_symbolic_expression (x
) == SYMBOL_SMALL_GOT_4G
)
21610 if (SYMBOL_REF_P (x
) && mode
== DImode
&& CONSTANT_ADDRESS_P (x
))
21613 if (TARGET_SVE
&& aarch64_sve_cnt_immediate_p (x
))
21616 return aarch64_classify_symbolic_expression (x
)
21617 == SYMBOL_TINY_ABSOLUTE
;
/* Create a 0 constant that is based on V4SI to allow CSE to optimally share
   the constant creation.  */

rtx
aarch64_gen_shareable_zero (machine_mode mode)
{
  machine_mode zmode = V4SImode;
  rtx tmp = gen_reg_rtx (zmode);
  emit_move_insn (tmp, CONST0_RTX (zmode));
  return lowpart_subreg (mode, tmp, zmode);
}

/* Return a const_int vector of VAL.  */

rtx
aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
{
  rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
  return gen_const_vec_duplicate (mode, c);
}
21640 /* Check OP is a legal scalar immediate for the MOVI instruction. */
21643 aarch64_simd_scalar_immediate_valid_for_move (rtx op
, scalar_int_mode mode
)
21645 machine_mode vmode
;
21647 vmode
= aarch64_simd_container_mode (mode
, 64);
21648 rtx op_v
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (op
));
21649 return aarch64_simd_valid_immediate (op_v
, NULL
);
/* Construct and return a PARALLEL RTX vector with elements numbering the
   lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
   the vector - from the perspective of the architecture.  This does not
   line up with GCC's perspective on lane numbers, so we end up with
   different masks depending on our target endian-ness.  The diagram
   below may help.  We must draw the distinction when building masks
   which select one half of the vector.  An instruction selecting
   architectural low-lanes for a big-endian target must be described using
   a mask selecting GCC high-lanes.

                 Big-Endian                     Little-Endian

   GCC           0   1   2   3                  3   2   1   0
               | x | x | x | x |              | x | x | x | x |
   Architecture  3   2   1   0                  3   2   1   0

   Low Mask:     { 2, 3 }                       { 0, 1 }
   High Mask:    { 0, 1 }                       { 2, 3 }

   MODE is the mode of the vector and NUNITS is the number of units in it.  */

rtx
aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
{
  rtvec v = rtvec_alloc (nunits / 2);
  int high_base = nunits / 2;
  int low_base = 0;
  int base;
  rtx t1;
  int i;

  if (BYTES_BIG_ENDIAN)
    base = high ? low_base : high_base;
  else
    base = high ? high_base : low_base;

  for (i = 0; i < nunits / 2; i++)
    RTVEC_ELT (v, i) = GEN_INT (base + i);

  t1 = gen_rtx_PARALLEL (mode, v);
  return t1;
}
/* Check OP for validity as a PARALLEL RTX vector with elements
   numbering the lanes of either the high (HIGH == TRUE) or low lanes,
   from the perspective of the architecture.  See the diagram above
   aarch64_simd_vect_par_cnst_half for more details.  */

bool
aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
				       bool high)
{
  int nelts;
  if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
    return false;

  rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
  HOST_WIDE_INT count_op = XVECLEN (op, 0);
  HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
  int i;

  if (count_op != count_ideal)
    return false;

  for (i = 0; i < count_ideal; i++)
    {
      rtx elt_op = XVECEXP (op, 0, i);
      rtx elt_ideal = XVECEXP (ideal, 0, i);

      if (!CONST_INT_P (elt_op)
	  || INTVAL (elt_ideal) != INTVAL (elt_op))
	return false;
    }
  return true;
}
/* Return a PARALLEL containing NELTS elements, with element I equal
   to BASE + I * STEP.  */
rtx
aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
{
  rtvec vec = rtvec_alloc (nelts);
  for (unsigned int i = 0; i < nelts; ++i)
    RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
  return gen_rtx_PARALLEL (VOIDmode, vec);
}
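/* Worked example (editorial illustration, not part of the original source):
   aarch64_gen_stepped_int_parallel (4, 1, 2) returns a PARALLEL whose
   elements are the DImode constants {1, 3, 5, 7}.  */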
/* Return true if OP is a PARALLEL of CONST_INTs that form a linear
   series with step STEP.  */
bool
aarch64_stepped_int_parallel_p (rtx op, int step)
{
  if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
    return false;

  unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
  for (int i = 1; i < XVECLEN (op, 0); ++i)
    if (!CONST_INT_P (XVECEXP (op, 0, i))
	|| UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
      return false;

  return true;
}
/* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
   HIGH (exclusive).  */
void
aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
			  const_tree exp)
{
  HOST_WIDE_INT lane;
  gcc_assert (CONST_INT_P (operand));
  lane = INTVAL (operand);

  if (lane < low || lane >= high)
    {
      if (exp)
	error_at (EXPR_LOCATION (exp), "lane %wd out of range %wd - %wd",
		  lane, low, high - 1);
      else
	error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
    }
}
/* Perform endian correction on lane number N, which indexes a vector
   of mode MODE, and return the result as an SImode rtx.  */
rtx
aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
{
  return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
}
/* Return TRUE if OP is a valid vector addressing mode.  */
bool
aarch64_simd_mem_operand_p (rtx op)
{
  return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
			|| REG_P (XEXP (op, 0)));
}
21796 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
21799 aarch64_sve_ld1r_operand_p (rtx op
)
21801 struct aarch64_address_info addr
;
21805 && is_a
<scalar_mode
> (GET_MODE (op
), &mode
)
21806 && aarch64_classify_address (&addr
, XEXP (op
, 0), mode
, false)
21807 && addr
.type
== ADDRESS_REG_IMM
21808 && offset_6bit_unsigned_scaled_p (mode
, addr
.const_offset
));
21811 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
21812 where the size of the read data is specified by `mode` and the size of the
21813 vector elements are specified by `elem_mode`. */
21815 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op
, machine_mode mode
,
21816 scalar_mode elem_mode
)
21818 struct aarch64_address_info addr
;
21820 || !aarch64_classify_address (&addr
, XEXP (op
, 0), elem_mode
, false))
21823 if (addr
.type
== ADDRESS_REG_IMM
)
21824 return offset_4bit_signed_scaled_p (mode
, addr
.const_offset
);
21826 if (addr
.type
== ADDRESS_REG_REG
)
21827 return (1U << addr
.shift
) == GET_MODE_SIZE (elem_mode
);
21832 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
21834 aarch64_sve_ld1rq_operand_p (rtx op
)
21836 return aarch64_sve_ld1rq_ld1ro_operand_p (op
, TImode
,
21837 GET_MODE_INNER (GET_MODE (op
)));
21840 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
21841 accessing a vector where the element size is specified by `elem_mode`. */
21843 aarch64_sve_ld1ro_operand_p (rtx op
, scalar_mode elem_mode
)
21845 return aarch64_sve_ld1rq_ld1ro_operand_p (op
, OImode
, elem_mode
);
21848 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
21850 aarch64_sve_ldff1_operand_p (rtx op
)
21855 struct aarch64_address_info addr
;
21856 if (!aarch64_classify_address (&addr
, XEXP (op
, 0), GET_MODE (op
), false))
21859 if (addr
.type
== ADDRESS_REG_IMM
)
21860 return known_eq (addr
.const_offset
, 0);
21862 return addr
.type
== ADDRESS_REG_REG
;
21865 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
21867 aarch64_sve_ldnf1_operand_p (rtx op
)
21869 struct aarch64_address_info addr
;
21872 && aarch64_classify_address (&addr
, XEXP (op
, 0),
21873 GET_MODE (op
), false)
21874 && addr
.type
== ADDRESS_REG_IMM
);
21877 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
21878 The conditions for STR are the same. */
21880 aarch64_sve_ldr_operand_p (rtx op
)
21882 struct aarch64_address_info addr
;
21885 && aarch64_classify_address (&addr
, XEXP (op
, 0), GET_MODE (op
),
21886 false, ADDR_QUERY_ANY
)
21887 && addr
.type
== ADDRESS_REG_IMM
);
21890 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
21891 addressing memory of mode MODE. */
21893 aarch64_sve_prefetch_operand_p (rtx op
, machine_mode mode
)
21895 struct aarch64_address_info addr
;
21896 if (!aarch64_classify_address (&addr
, op
, mode
, false, ADDR_QUERY_ANY
))
21899 if (addr
.type
== ADDRESS_REG_IMM
)
21900 return offset_6bit_signed_scaled_p (mode
, addr
.const_offset
);
21902 return addr
.type
== ADDRESS_REG_REG
;
21905 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
21906 We need to be able to access the individual pieces, so the range
21907 is different from LD[234] and ST[234]. */
21909 aarch64_sve_struct_memory_operand_p (rtx op
)
21914 machine_mode mode
= GET_MODE (op
);
21915 struct aarch64_address_info addr
;
21916 if (!aarch64_classify_address (&addr
, XEXP (op
, 0), SVE_BYTE_MODE
, false,
21918 || addr
.type
!= ADDRESS_REG_IMM
)
21921 poly_int64 first
= addr
.const_offset
;
21922 poly_int64 last
= first
+ GET_MODE_SIZE (mode
) - BYTES_PER_SVE_VECTOR
;
21923 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, first
)
21924 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, last
));
21927 /* Emit a register copy from operand to operand, taking care not to
21928 early-clobber source registers in the process.
21930 COUNT is the number of components into which the copy needs to be
21933 aarch64_simd_emit_reg_reg_move (rtx
*operands
, machine_mode mode
,
21934 unsigned int count
)
21937 int rdest
= REGNO (operands
[0]);
21938 int rsrc
= REGNO (operands
[1]);
21940 if (!reg_overlap_mentioned_p (operands
[0], operands
[1])
21942 for (i
= 0; i
< count
; i
++)
21943 emit_move_insn (gen_rtx_REG (mode
, rdest
+ i
),
21944 gen_rtx_REG (mode
, rsrc
+ i
));
21946 for (i
= 0; i
< count
; i
++)
21947 emit_move_insn (gen_rtx_REG (mode
, rdest
+ count
- i
- 1),
21948 gen_rtx_REG (mode
, rsrc
+ count
- i
- 1));
21951 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
21952 one of VSTRUCT modes: OI, CI, or XI. */
21954 aarch64_simd_attr_length_rglist (machine_mode mode
)
21956 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
21957 return (GET_MODE_SIZE (mode
).to_constant () / UNITS_PER_VREG
) * 4;
21960 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
21961 alignment of a vector to 128 bits. SVE predicates have an alignment of
21963 static HOST_WIDE_INT
21964 aarch64_simd_vector_alignment (const_tree type
)
21966 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
21967 be set for non-predicate vectors of booleans. Modes are the most
21968 direct way we have of identifying real SVE predicate types. */
21969 if (GET_MODE_CLASS (TYPE_MODE (type
)) == MODE_VECTOR_BOOL
)
21971 widest_int min_size
21972 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type
)));
21973 return wi::umin (min_size
, 128).to_uhwi ();
21976 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
21978 aarch64_vectorize_preferred_vector_alignment (const_tree type
)
21980 if (aarch64_sve_data_mode_p (TYPE_MODE (type
)))
21982 /* If the length of the vector is a fixed power of 2, try to align
21983 to that length, otherwise don't try to align at all. */
21984 HOST_WIDE_INT result
;
21985 if (!GET_MODE_BITSIZE (TYPE_MODE (type
)).is_constant (&result
)
21986 || !pow2p_hwi (result
))
21987 result
= TYPE_ALIGN (TREE_TYPE (type
));
21990 return TYPE_ALIGN (type
);
21993 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
21995 aarch64_simd_vector_alignment_reachable (const_tree type
, bool is_packed
)
22000 /* For fixed-length vectors, check that the vectorizer will aim for
22001 full-vector alignment. This isn't true for generic GCC vectors
22002 that are wider than the ABI maximum of 128 bits. */
22003 poly_uint64 preferred_alignment
=
22004 aarch64_vectorize_preferred_vector_alignment (type
);
22005 if (TREE_CODE (TYPE_SIZE (type
)) == INTEGER_CST
22006 && maybe_ne (wi::to_widest (TYPE_SIZE (type
)),
22007 preferred_alignment
))
22010 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
22014 /* Return true if the vector misalignment factor is supported by the
22017 aarch64_builtin_support_vector_misalignment (machine_mode mode
,
22018 const_tree type
, int misalignment
,
22021 if (TARGET_SIMD
&& STRICT_ALIGNMENT
)
22023 /* Return if movmisalign pattern is not supported for this mode. */
22024 if (optab_handler (movmisalign_optab
, mode
) == CODE_FOR_nothing
)
22027 /* Misalignment factor is unknown at compile time. */
22028 if (misalignment
== -1)
22031 return default_builtin_support_vector_misalignment (mode
, type
, misalignment
,
22035 /* If VALS is a vector constant that can be loaded into a register
22036 using DUP, generate instructions to do so and return an RTX to
22037 assign to the register. Otherwise return NULL_RTX. */
22039 aarch64_simd_dup_constant (rtx vals
)
22041 machine_mode mode
= GET_MODE (vals
);
22042 machine_mode inner_mode
= GET_MODE_INNER (mode
);
22045 if (!const_vec_duplicate_p (vals
, &x
))
22048 /* We can load this constant by using DUP and a constant in a
22049 single ARM register. This will be cheaper than a vector
22051 x
= copy_to_mode_reg (inner_mode
, x
);
22052 return gen_vec_duplicate (mode
, x
);
22056 /* Generate code to load VALS, which is a PARALLEL containing only
22057 constants (for vec_init) or CONST_VECTOR, efficiently into a
22058 register. Returns an RTX to copy into the register, or NULL_RTX
22059 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
22061 aarch64_simd_make_constant (rtx vals
)
22063 machine_mode mode
= GET_MODE (vals
);
22065 rtx const_vec
= NULL_RTX
;
22069 if (CONST_VECTOR_P (vals
))
22071 else if (GET_CODE (vals
) == PARALLEL
)
22073 /* A CONST_VECTOR must contain only CONST_INTs and
22074 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
22075 Only store valid constants in a CONST_VECTOR. */
22076 int n_elts
= XVECLEN (vals
, 0);
22077 for (i
= 0; i
< n_elts
; ++i
)
22079 rtx x
= XVECEXP (vals
, 0, i
);
22080 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
22083 if (n_const
== n_elts
)
22084 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0));
22087 gcc_unreachable ();
22089 if (const_vec
!= NULL_RTX
22090 && aarch64_simd_valid_immediate (const_vec
, NULL
))
22091 /* Load using MOVI/MVNI. */
22093 else if ((const_dup
= aarch64_simd_dup_constant (vals
)) != NULL_RTX
)
22094 /* Loaded using DUP. */
22096 else if (const_vec
!= NULL_RTX
)
22097 /* Load from constant pool. We cannot take advantage of single-cycle
22098 LD1 because we need a PC-relative addressing mode. */
22101 /* A PARALLEL containing something not valid inside CONST_VECTOR.
22102 We cannot construct an initializer. */
22106 /* Expand a vector initialisation sequence, such that TARGET is
22107 initialised to contain VALS. */
22110 aarch64_expand_vector_init (rtx target
, rtx vals
)
22112 machine_mode mode
= GET_MODE (target
);
22113 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
22114 /* The number of vector elements. */
22115 int n_elts
= XVECLEN (vals
, 0);
22116 /* The number of vector elements which are not constant. */
22118 rtx any_const
= NULL_RTX
;
22119 /* The first element of vals. */
22120 rtx v0
= XVECEXP (vals
, 0, 0);
22121 bool all_same
= true;
  /* This is a special vec_init<M><N> where N is not an element mode but a
     vector mode with half the elements of M.  We expect to find two entries
     of mode N in VALS and we must put their concatenation into TARGET.  */
22126 if (XVECLEN (vals
, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals
, 0, 0))))
22128 machine_mode narrow_mode
= GET_MODE (XVECEXP (vals
, 0, 0));
22129 gcc_assert (GET_MODE_INNER (narrow_mode
) == inner_mode
22130 && known_eq (GET_MODE_SIZE (mode
),
22131 2 * GET_MODE_SIZE (narrow_mode
)));
22132 emit_insn (gen_aarch64_vec_concat (narrow_mode
, target
,
22133 XVECEXP (vals
, 0, 0),
22134 XVECEXP (vals
, 0, 1)));
22138 /* Count the number of variable elements to initialise. */
22139 for (int i
= 0; i
< n_elts
; ++i
)
22141 rtx x
= XVECEXP (vals
, 0, i
);
22142 if (!(CONST_INT_P (x
) || CONST_DOUBLE_P (x
)))
22147 all_same
&= rtx_equal_p (x
, v0
);
22150 /* No variable elements, hand off to aarch64_simd_make_constant which knows
22151 how best to handle this. */
22154 rtx constant
= aarch64_simd_make_constant (vals
);
22155 if (constant
!= NULL_RTX
)
22157 emit_move_insn (target
, constant
);
22162 /* Splat a single non-constant element if we can. */
22165 rtx x
= copy_to_mode_reg (inner_mode
, v0
);
22166 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
22170 /* Check for interleaving case.
22171 For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
22172 Generate following code:
22175 zip1 v0.h, v0.h, v1.h
22176 for "large enough" initializer. */
22181 for (i
= 2; i
< n_elts
; i
++)
22182 if (!rtx_equal_p (XVECEXP (vals
, 0, i
), XVECEXP (vals
, 0, i
% 2)))
22187 machine_mode mode
= GET_MODE (target
);
22190 for (int i
= 0; i
< 2; i
++)
22192 rtx x
= expand_vector_broadcast (mode
, XVECEXP (vals
, 0, i
));
22193 dest
[i
] = force_reg (mode
, x
);
22196 rtvec v
= gen_rtvec (2, dest
[0], dest
[1]);
22197 emit_set_insn (target
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
22202 enum insn_code icode
= optab_handler (vec_set_optab
, mode
);
22203 gcc_assert (icode
!= CODE_FOR_nothing
);
22205 /* If there are only variable elements, try to optimize
22206 the insertion using dup for the most common element
22207 followed by insertions. */
22209 /* The algorithm will fill matches[*][0] with the earliest matching element,
22210 and matches[X][1] with the count of duplicate elements (if X is the
22211 earliest element which has duplicates). */
22213 if (n_var
== n_elts
&& n_elts
<= 16)
22215 int matches
[16][2] = {0};
22216 for (int i
= 0; i
< n_elts
; i
++)
22218 for (int j
= 0; j
<= i
; j
++)
22220 if (rtx_equal_p (XVECEXP (vals
, 0, i
), XVECEXP (vals
, 0, j
)))
22228 int maxelement
= 0;
22230 for (int i
= 0; i
< n_elts
; i
++)
22231 if (matches
[i
][1] > maxv
)
22234 maxv
= matches
[i
][1];
22237 /* Create a duplicate of the most common element, unless all elements
22238 are equally useless to us, in which case just immediately set the
22239 vector register using the first element. */
22243 /* For vectors of two 64-bit elements, we can do even better. */
22245 && (inner_mode
== E_DImode
22246 || inner_mode
== E_DFmode
))
22249 rtx x0
= XVECEXP (vals
, 0, 0);
22250 rtx x1
= XVECEXP (vals
, 0, 1);
22251 /* Combine can pick up this case, but handling it directly
22252 here leaves clearer RTL.
22254 This is load_pair_lanes<mode>, and also gives us a clean-up
22255 for store_pair_lanes<mode>. */
22256 if (memory_operand (x0
, inner_mode
)
22257 && memory_operand (x1
, inner_mode
)
22258 && aarch64_mergeable_load_pair_p (mode
, x0
, x1
))
22261 if (inner_mode
== DFmode
)
22262 t
= gen_load_pair_lanesdf (target
, x0
, x1
);
22264 t
= gen_load_pair_lanesdi (target
, x0
, x1
);
22269 /* The subreg-move sequence below will move into lane zero of the
22270 vector register. For big-endian we want that position to hold
22271 the last element of VALS. */
22272 maxelement
= BYTES_BIG_ENDIAN
? n_elts
- 1 : 0;
22273 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
22274 aarch64_emit_move (target
, lowpart_subreg (mode
, x
, inner_mode
));
22278 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
22279 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
22282 /* Insert the rest. */
22283 for (int i
= 0; i
< n_elts
; i
++)
22285 rtx x
= XVECEXP (vals
, 0, i
);
22286 if (matches
[i
][0] == maxelement
)
22288 x
= copy_to_mode_reg (inner_mode
, x
);
22289 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
22294 /* Initialise a vector which is part-variable. We want to first try
22295 to build those lanes which are constant in the most efficient way we
22297 if (n_var
!= n_elts
)
22299 rtx copy
= copy_rtx (vals
);
22301 /* Load constant part of vector. We really don't care what goes into the
22302 parts we will overwrite, but we're more likely to be able to load the
22303 constant efficiently if it has fewer, larger, repeating parts
22304 (see aarch64_simd_valid_immediate). */
22305 for (int i
= 0; i
< n_elts
; i
++)
22307 rtx x
= XVECEXP (vals
, 0, i
);
22308 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
22310 rtx subst
= any_const
;
22311 for (int bit
= n_elts
/ 2; bit
> 0; bit
/= 2)
22313 /* Look in the copied vector, as more elements are const. */
22314 rtx test
= XVECEXP (copy
, 0, i
^ bit
);
22315 if (CONST_INT_P (test
) || CONST_DOUBLE_P (test
))
22321 XVECEXP (copy
, 0, i
) = subst
;
22323 aarch64_expand_vector_init (target
, copy
);
22326 /* Insert the variable lanes directly. */
22327 for (int i
= 0; i
< n_elts
; i
++)
22329 rtx x
= XVECEXP (vals
, 0, i
);
22330 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
22332 x
= copy_to_mode_reg (inner_mode
, x
);
22333 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
22337 /* Emit RTL corresponding to:
22338 insr TARGET, ELEM. */
22341 emit_insr (rtx target
, rtx elem
)
22343 machine_mode mode
= GET_MODE (target
);
22344 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
22345 elem
= force_reg (elem_mode
, elem
);
22347 insn_code icode
= optab_handler (vec_shl_insert_optab
, mode
);
22348 gcc_assert (icode
!= CODE_FOR_nothing
);
22349 emit_insn (GEN_FCN (icode
) (target
, target
, elem
));
22352 /* Subroutine of aarch64_sve_expand_vector_init for handling
22353 trailing constants.
22354 This function works as follows:
22355 (a) Create a new vector consisting of trailing constants.
22356 (b) Initialize TARGET with the constant vector using emit_move_insn.
22357 (c) Insert remaining elements in TARGET using insr.
   NELTS is the total number of elements in the original vector, while
   NELTS_REQD is the number of elements that are actually
22362 ??? The heuristic used is to do above only if number of constants
22363 is at least half the total number of elements. May need fine tuning. */
22366 aarch64_sve_expand_vector_init_handle_trailing_constants
22367 (rtx target
, const rtx_vector_builder
&builder
, int nelts
, int nelts_reqd
)
22369 machine_mode mode
= GET_MODE (target
);
22370 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
22371 int n_trailing_constants
= 0;
22373 for (int i
= nelts_reqd
- 1;
22374 i
>= 0 && valid_for_const_vector_p (elem_mode
, builder
.elt (i
));
22376 n_trailing_constants
++;
22378 if (n_trailing_constants
>= nelts_reqd
/ 2)
22380 /* Try to use the natural pattern of BUILDER to extend the trailing
22381 constant elements to a full vector. Replace any variables in the
22382 extra elements with zeros.
22384 ??? It would be better if the builders supported "don't care"
22385 elements, with the builder filling in whichever elements
22386 give the most compact encoding. */
22387 rtx_vector_builder
v (mode
, nelts
, 1);
22388 for (int i
= 0; i
< nelts
; i
++)
22390 rtx x
= builder
.elt (i
+ nelts_reqd
- n_trailing_constants
);
22391 if (!valid_for_const_vector_p (elem_mode
, x
))
22392 x
= CONST0_RTX (elem_mode
);
22395 rtx const_vec
= v
.build ();
22396 emit_move_insn (target
, const_vec
);
22398 for (int i
= nelts_reqd
- n_trailing_constants
- 1; i
>= 0; i
--)
22399 emit_insr (target
, builder
.elt (i
));
22407 /* Subroutine of aarch64_sve_expand_vector_init.
22409 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
22410 (b) Skip trailing elements from BUILDER, which are the same as
22411 element NELTS_REQD - 1.
22412 (c) Insert earlier elements in reverse order in TARGET using insr. */
22415 aarch64_sve_expand_vector_init_insert_elems (rtx target
,
22416 const rtx_vector_builder
&builder
,
22419 machine_mode mode
= GET_MODE (target
);
22420 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
22422 struct expand_operand ops
[2];
22423 enum insn_code icode
= optab_handler (vec_duplicate_optab
, mode
);
22424 gcc_assert (icode
!= CODE_FOR_nothing
);
22426 create_output_operand (&ops
[0], target
, mode
);
22427 create_input_operand (&ops
[1], builder
.elt (nelts_reqd
- 1), elem_mode
);
22428 expand_insn (icode
, 2, ops
);
22430 int ndups
= builder
.count_dups (nelts_reqd
- 1, -1, -1);
22431 for (int i
= nelts_reqd
- ndups
- 1; i
>= 0; i
--)
22432 emit_insr (target
, builder
.elt (i
));
22435 /* Subroutine of aarch64_sve_expand_vector_init to handle case
22436 when all trailing elements of builder are same.
22437 This works as follows:
22438 (a) Use expand_insn interface to broadcast last vector element in TARGET.
22439 (b) Insert remaining elements in TARGET using insr.
22441 ??? The heuristic used is to do above if number of same trailing elements
22442 is at least 3/4 of total number of elements, loosely based on
22443 heuristic from mostly_zeros_p. May need fine-tuning. */
22446 aarch64_sve_expand_vector_init_handle_trailing_same_elem
22447 (rtx target
, const rtx_vector_builder
&builder
, int nelts_reqd
)
22449 int ndups
= builder
.count_dups (nelts_reqd
- 1, -1, -1);
22450 if (ndups
>= (3 * nelts_reqd
) / 4)
22452 aarch64_sve_expand_vector_init_insert_elems (target
, builder
,
22453 nelts_reqd
- ndups
+ 1);
22460 /* Initialize register TARGET from BUILDER. NELTS is the constant number
22461 of elements in BUILDER.
22463 The function tries to initialize TARGET from BUILDER if it fits one
22464 of the special cases outlined below.
22466 Failing that, the function divides BUILDER into two sub-vectors:
22467 v_even = even elements of BUILDER;
22468 v_odd = odd elements of BUILDER;
22470 and recursively calls itself with v_even and v_odd.
22472 if (recursive call succeeded for v_even or v_odd)
22473 TARGET = zip (v_even, v_odd)
22475 The function returns true if it managed to build TARGET from BUILDER
22476 with one of the special cases, false otherwise.
22478 Example: {a, 1, b, 2, c, 3, d, 4}
22480 The vector gets divided into:
22481 v_even = {a, b, c, d}
22482 v_odd = {1, 2, 3, 4}
22484 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
22485 initialize tmp2 from constant vector v_odd using emit_move_insn.
22487 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
22488 4 elements, so we construct tmp1 from v_even using insr:
22495 TARGET = zip (tmp1, tmp2)
22496 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
22499 aarch64_sve_expand_vector_init (rtx target
, const rtx_vector_builder
&builder
,
22500 int nelts
, int nelts_reqd
)
22502 machine_mode mode
= GET_MODE (target
);
22504 /* Case 1: Vector contains trailing constants. */
22506 if (aarch64_sve_expand_vector_init_handle_trailing_constants
22507 (target
, builder
, nelts
, nelts_reqd
))
22510 /* Case 2: Vector contains leading constants. */
22512 rtx_vector_builder
rev_builder (mode
, nelts_reqd
, 1);
22513 for (int i
= 0; i
< nelts_reqd
; i
++)
22514 rev_builder
.quick_push (builder
.elt (nelts_reqd
- i
- 1));
22515 rev_builder
.finalize ();
22517 if (aarch64_sve_expand_vector_init_handle_trailing_constants
22518 (target
, rev_builder
, nelts
, nelts_reqd
))
22520 emit_insn (gen_aarch64_sve_rev (mode
, target
, target
));
22524 /* Case 3: Vector contains trailing same element. */
22526 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
22527 (target
, builder
, nelts_reqd
))
22530 /* Case 4: Vector contains leading same element. */
22532 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
22533 (target
, rev_builder
, nelts_reqd
) && nelts_reqd
== nelts
)
22535 emit_insn (gen_aarch64_sve_rev (mode
, target
, target
));
22539 /* Avoid recursing below 4-elements.
22540 ??? The threshold 4 may need fine-tuning. */
22542 if (nelts_reqd
<= 4)
22545 rtx_vector_builder
v_even (mode
, nelts
, 1);
22546 rtx_vector_builder
v_odd (mode
, nelts
, 1);
22548 for (int i
= 0; i
< nelts
* 2; i
+= 2)
22550 v_even
.quick_push (builder
.elt (i
));
22551 v_odd
.quick_push (builder
.elt (i
+ 1));
22554 v_even
.finalize ();
22557 rtx tmp1
= gen_reg_rtx (mode
);
22558 bool did_even_p
= aarch64_sve_expand_vector_init (tmp1
, v_even
,
22559 nelts
, nelts_reqd
/ 2);
22561 rtx tmp2
= gen_reg_rtx (mode
);
22562 bool did_odd_p
= aarch64_sve_expand_vector_init (tmp2
, v_odd
,
22563 nelts
, nelts_reqd
/ 2);
22565 if (!did_even_p
&& !did_odd_p
)
22568 /* Initialize v_even and v_odd using INSR if it didn't match any of the
22569 special cases and zip v_even, v_odd. */
22572 aarch64_sve_expand_vector_init_insert_elems (tmp1
, v_even
, nelts_reqd
/ 2);
22575 aarch64_sve_expand_vector_init_insert_elems (tmp2
, v_odd
, nelts_reqd
/ 2);
22577 rtvec v
= gen_rtvec (2, tmp1
, tmp2
);
22578 emit_set_insn (target
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
22582 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
22585 aarch64_sve_expand_vector_init (rtx target
, rtx vals
)
22587 machine_mode mode
= GET_MODE (target
);
22588 int nelts
= XVECLEN (vals
, 0);
22590 rtx_vector_builder
v (mode
, nelts
, 1);
22591 for (int i
= 0; i
< nelts
; i
++)
22592 v
.quick_push (XVECEXP (vals
, 0, i
));
22595 /* If neither sub-vectors of v could be initialized specially,
22596 then use INSR to insert all elements from v into TARGET.
22597 ??? This might not be optimal for vectors with large
22598 initializers like 16-element or above.
22599 For nelts < 4, it probably isn't useful to handle specially. */
22602 || !aarch64_sve_expand_vector_init (target
, v
, nelts
, nelts
))
22603 aarch64_sve_expand_vector_init_insert_elems (target
, v
, nelts
);
/* Check whether VALUE is a vector constant in which every element
   is either a power of 2 or a negated power of 2.  If so, return
   a constant vector of log2s, and flip CODE between PLUS and MINUS
   if VALUE contains negated powers of 2.  Return NULL_RTX otherwise.  */
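/* Worked example (editorial illustration, not part of the original source):
   for a V4SImode VALUE of {8, 8, 8, 8} the result is the shift vector
   {3, 3, 3, 3} with CODE left unchanged, while {-8, -8, -8, -8} also
   yields {3, 3, 3, 3} but flips CODE between PLUS and MINUS, since
   x * -8 == -(x << 3).  */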
22612 aarch64_convert_mult_to_shift (rtx value
, rtx_code
&code
)
22614 if (!CONST_VECTOR_P (value
))
22617 rtx_vector_builder builder
;
22618 if (!builder
.new_unary_operation (GET_MODE (value
), value
, false))
22621 scalar_mode int_mode
= GET_MODE_INNER (GET_MODE (value
));
22622 /* 1 if the result of the multiplication must be negated,
22623 0 if it mustn't, or -1 if we don't yet care. */
22625 unsigned int encoded_nelts
= const_vector_encoded_nelts (value
);
22626 for (unsigned int i
= 0; i
< encoded_nelts
; ++i
)
22628 rtx elt
= CONST_VECTOR_ENCODED_ELT (value
, i
);
22629 if (!CONST_SCALAR_INT_P (elt
))
22631 rtx_mode_t
val (elt
, int_mode
);
22632 wide_int pow2
= wi::neg (val
);
22635 /* It matters whether we negate or not. Make that choice,
22636 and make sure that it's consistent with previous elements. */
22637 if (negate
== !wi::neg_p (val
))
22639 negate
= wi::neg_p (val
);
22643 /* POW2 is now the value that we want to be a power of 2. */
22644 int shift
= wi::exact_log2 (pow2
);
22647 builder
.quick_push (gen_int_mode (shift
, int_mode
));
22650 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
22652 else if (negate
== 1)
22653 code
= code
== PLUS
? MINUS
: PLUS
;
22654 return builder
.build ();
22657 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
22658 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
22659 operands array, in the same order as for fma_optab. Return true if
22660 the function emitted all the necessary instructions, false if the caller
22661 should generate the pattern normally with the new OPERANDS array. */
22664 aarch64_prepare_sve_int_fma (rtx
*operands
, rtx_code code
)
22666 machine_mode mode
= GET_MODE (operands
[0]);
22667 if (rtx shifts
= aarch64_convert_mult_to_shift (operands
[2], code
))
22669 rtx product
= expand_binop (mode
, vashl_optab
, operands
[1], shifts
,
22670 NULL_RTX
, true, OPTAB_DIRECT
);
22671 force_expand_binop (mode
, code
== PLUS
? add_optab
: sub_optab
,
22672 operands
[3], product
, operands
[0], true,
22676 operands
[2] = force_reg (mode
, operands
[2]);
22680 /* Likewise, but for a conditional pattern. */
22683 aarch64_prepare_sve_cond_int_fma (rtx
*operands
, rtx_code code
)
22685 machine_mode mode
= GET_MODE (operands
[0]);
22686 if (rtx shifts
= aarch64_convert_mult_to_shift (operands
[3], code
))
22688 rtx product
= expand_binop (mode
, vashl_optab
, operands
[2], shifts
,
22689 NULL_RTX
, true, OPTAB_DIRECT
);
22690 emit_insn (gen_cond (code
, mode
, operands
[0], operands
[1],
22691 operands
[4], product
, operands
[5]));
22694 operands
[3] = force_reg (mode
, operands
[3]);
static unsigned HOST_WIDE_INT
aarch64_shift_truncation_mask (machine_mode mode)
{
  if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
    return 0;
  return GET_MODE_UNIT_BITSIZE (mode) - 1;
}
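/* Worked example (editorial illustration, not part of the original source):
   when SHIFT_COUNT_TRUNCATED holds and MODE is scalar DImode, the mask is
   63, i.e. only the low six bits of a variable shift amount are
   significant; for vector data modes the early return above yields 0,
   meaning no truncation may be assumed.  */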
22706 /* Select a format to encode pointers in exception handling data. */
22708 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED
, int global
)
22711 switch (aarch64_cmodel
)
22713 case AARCH64_CMODEL_TINY
:
22714 case AARCH64_CMODEL_TINY_PIC
:
22715 case AARCH64_CMODEL_SMALL
:
22716 case AARCH64_CMODEL_SMALL_PIC
:
22717 case AARCH64_CMODEL_SMALL_SPIC
:
22718 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
22720 type
= DW_EH_PE_sdata4
;
22723 /* No assumptions here. 8-byte relocs required. */
22724 type
= DW_EH_PE_sdata8
;
22727 return (global
? DW_EH_PE_indirect
: 0) | DW_EH_PE_pcrel
| type
;
22730 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
22733 aarch64_asm_output_variant_pcs (FILE *stream
, const tree decl
, const char* name
)
22735 if (TREE_CODE (decl
) == FUNCTION_DECL
)
22737 arm_pcs pcs
= (arm_pcs
) fndecl_abi (decl
).id ();
22738 if (pcs
== ARM_PCS_SIMD
|| pcs
== ARM_PCS_SVE
)
22740 fprintf (stream
, "\t.variant_pcs\t");
22741 assemble_name (stream
, name
);
22742 fprintf (stream
, "\n");
22747 /* The last .arch and .tune assembly strings that we printed. */
22748 static std::string aarch64_last_printed_arch_string
;
22749 static std::string aarch64_last_printed_tune_string
;
22751 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
22752 by the function fndecl. */
22755 aarch64_declare_function_name (FILE *stream
, const char* name
,
22758 tree target_parts
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
22760 struct cl_target_option
*targ_options
;
22762 targ_options
= TREE_TARGET_OPTION (target_parts
);
22764 targ_options
= TREE_TARGET_OPTION (target_option_current_node
);
22765 gcc_assert (targ_options
);
22767 const struct processor
*this_arch
22768 = aarch64_get_arch (targ_options
->x_selected_arch
);
22770 auto isa_flags
= targ_options
->x_aarch64_asm_isa_flags
;
22771 std::string extension
22772 = aarch64_get_extension_string_for_isa_flags (isa_flags
,
22774 /* Only update the assembler .arch string if it is distinct from the last
22775 such string we printed. */
22776 std::string to_print
= this_arch
->name
+ extension
;
22777 if (to_print
!= aarch64_last_printed_arch_string
)
22779 asm_fprintf (asm_out_file
, "\t.arch %s\n", to_print
.c_str ());
22780 aarch64_last_printed_arch_string
= to_print
;
22783 /* Print the cpu name we're tuning for in the comments, might be
22784 useful to readers of the generated asm. Do it only when it changes
22785 from function to function and verbose assembly is requested. */
22786 const struct processor
*this_tune
22787 = aarch64_get_tune_cpu (targ_options
->x_selected_tune
);
22789 if (flag_debug_asm
&& aarch64_last_printed_tune_string
!= this_tune
->name
)
22791 asm_fprintf (asm_out_file
, "\t" ASM_COMMENT_START
".tune %s\n",
22793 aarch64_last_printed_tune_string
= this_tune
->name
;
22796 aarch64_asm_output_variant_pcs (stream
, fndecl
, name
);
22798 /* Don't forget the type directive for ELF. */
22799 ASM_OUTPUT_TYPE_DIRECTIVE (stream
, name
, "function");
22800 ASM_OUTPUT_LABEL (stream
, name
);
22802 cfun
->machine
->label_is_assembled
= true;
22805 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. */
22808 aarch64_print_patchable_function_entry (FILE *file
,
22809 unsigned HOST_WIDE_INT patch_area_size
,
22812 if (!cfun
->machine
->label_is_assembled
)
22814 /* Emit the patching area before the entry label, if any. */
22815 default_print_patchable_function_entry (file
, patch_area_size
,
22820 rtx pa
= gen_patchable_area (GEN_INT (patch_area_size
),
22821 GEN_INT (record_p
));
22822 basic_block bb
= ENTRY_BLOCK_PTR_FOR_FN (cfun
)->next_bb
;
22824 if (!aarch64_bti_enabled ()
22825 || cgraph_node::get (cfun
->decl
)->only_called_directly_p ())
22827 /* Emit the patchable_area at the beginning of the function. */
22828 rtx_insn
*insn
= emit_insn_before (pa
, BB_HEAD (bb
));
22829 INSN_ADDRESSES_NEW (insn
, -1);
22833 rtx_insn
*insn
= next_real_nondebug_insn (get_insns ());
22836 || GET_CODE (PATTERN (insn
)) != UNSPEC_VOLATILE
22837 || XINT (PATTERN (insn
), 1) != UNSPECV_BTI_C
)
22839 /* Emit a BTI_C. */
22840 insn
= emit_insn_before (gen_bti_c (), BB_HEAD (bb
));
22843 /* Emit the patchable_area after BTI_C. */
22844 insn
= emit_insn_after (pa
, insn
);
22845 INSN_ADDRESSES_NEW (insn
, -1);
22848 /* Output patchable area. */
22851 aarch64_output_patchable_area (unsigned int patch_area_size
, bool record_p
)
22853 default_print_patchable_function_entry (asm_out_file
, patch_area_size
,
22857 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
22860 aarch64_asm_output_alias (FILE *stream
, const tree decl
, const tree target
)
22862 const char *name
= XSTR (XEXP (DECL_RTL (decl
), 0), 0);
22863 const char *value
= IDENTIFIER_POINTER (target
);
22864 aarch64_asm_output_variant_pcs (stream
, decl
, name
);
22865 ASM_OUTPUT_DEF (stream
, name
, value
);
22868 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
22869 function symbol references. */
22872 aarch64_asm_output_external (FILE *stream
, tree decl
, const char* name
)
22874 default_elf_asm_output_external (stream
, decl
, name
);
22875 aarch64_asm_output_variant_pcs (stream
, decl
, name
);
22878 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
22879 Used to output the .cfi_b_key_frame directive when signing the current
22880 function with the B key. */
22883 aarch64_post_cfi_startproc (FILE *f
, tree ignored ATTRIBUTE_UNUSED
)
22885 if (cfun
->machine
->frame
.laid_out
&& aarch64_return_address_signing_enabled ()
22886 && aarch64_ra_sign_key
== AARCH64_KEY_B
)
22887 asm_fprintf (f
, "\t.cfi_b_key_frame\n");
22890 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
22893 aarch64_start_file (void)
22895 struct cl_target_option
*default_options
22896 = TREE_TARGET_OPTION (target_option_default_node
);
22898 const struct processor
*default_arch
22899 = aarch64_get_arch (default_options
->x_selected_arch
);
22900 auto default_isa_flags
= default_options
->x_aarch64_asm_isa_flags
;
22901 std::string extension
22902 = aarch64_get_extension_string_for_isa_flags (default_isa_flags
,
22903 default_arch
->flags
);
22905 aarch64_last_printed_arch_string
= default_arch
->name
+ extension
;
22906 aarch64_last_printed_tune_string
= "";
22907 asm_fprintf (asm_out_file
, "\t.arch %s\n",
22908 aarch64_last_printed_arch_string
.c_str ());
22910 default_file_start ();
22913 /* Emit load exclusive. */
22916 aarch64_emit_load_exclusive (machine_mode mode
, rtx rval
,
22917 rtx mem
, rtx model_rtx
)
22919 if (mode
== TImode
)
22920 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode
, rval
),
22921 gen_highpart (DImode
, rval
),
22924 emit_insn (gen_aarch64_load_exclusive (mode
, rval
, mem
, model_rtx
));
22927 /* Emit store exclusive. */
22930 aarch64_emit_store_exclusive (machine_mode mode
, rtx bval
,
22931 rtx mem
, rtx rval
, rtx model_rtx
)
22933 if (mode
== TImode
)
22934 emit_insn (gen_aarch64_store_exclusive_pair
22935 (bval
, mem
, operand_subword (rval
, 0, 0, TImode
),
22936 operand_subword (rval
, 1, 0, TImode
), model_rtx
));
22938 emit_insn (gen_aarch64_store_exclusive (mode
, bval
, mem
, rval
, model_rtx
));
22941 /* Mark the previous jump instruction as unlikely. */
22944 aarch64_emit_unlikely_jump (rtx insn
)
22946 rtx_insn
*jump
= emit_jump_insn (insn
);
22947 add_reg_br_prob_note (jump
, profile_probability::very_unlikely ());
22950 /* We store the names of the various atomic helpers in a 5x5 array.
22951 Return the libcall function given MODE, MODEL and NAMES. */
22954 aarch64_atomic_ool_func(machine_mode mode
, rtx model_rtx
,
22955 const atomic_ool_names
*names
)
22957 memmodel model
= memmodel_from_int (INTVAL (model_rtx
));
22958 int mode_idx
, model_idx
;
22978 gcc_unreachable ();
22983 case MEMMODEL_RELAXED
:
22986 case MEMMODEL_CONSUME
:
22987 case MEMMODEL_ACQUIRE
:
22990 case MEMMODEL_RELEASE
:
22993 case MEMMODEL_ACQ_REL
:
22994 case MEMMODEL_SEQ_CST
:
22997 case MEMMODEL_SYNC_ACQUIRE
:
22998 case MEMMODEL_SYNC_RELEASE
:
22999 case MEMMODEL_SYNC_SEQ_CST
:
23003 gcc_unreachable ();
23006 return init_one_libfunc_visibility (names
->str
[mode_idx
][model_idx
],
23007 VISIBILITY_HIDDEN
);
#define DEF0(B, N) \
  { "__aarch64_" #B #N "_relax",	\
    "__aarch64_" #B #N "_acq",		\
    "__aarch64_" #B #N "_rel",		\
    "__aarch64_" #B #N "_acq_rel",	\
    "__aarch64_" #B #N "_sync" }

#define DEF4(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
		 { NULL, NULL, NULL, NULL }
#define DEF5(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)

static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
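/* For illustration (editorial addition, not part of the original source):
   DEF0(cas, 4) expands to
     { "__aarch64_cas4_relax", "__aarch64_cas4_acq", "__aarch64_cas4_rel",
       "__aarch64_cas4_acq_rel", "__aarch64_cas4_sync" },
   i.e. one out-of-line helper name per memory-model column for the 4-byte
   compare-and-swap row of the table above.  */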
23032 /* Expand a compare and swap pattern. */
23035 aarch64_expand_compare_and_swap (rtx operands
[])
23037 rtx bval
, rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
, x
, cc_reg
;
23038 machine_mode mode
, r_mode
;
23040 bval
= operands
[0];
23041 rval
= operands
[1];
23043 oldval
= operands
[3];
23044 newval
= operands
[4];
23045 is_weak
= operands
[5];
23046 mod_s
= operands
[6];
23047 mod_f
= operands
[7];
23048 mode
= GET_MODE (mem
);
23050 /* Normally the succ memory model must be stronger than fail, but in the
23051 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
23052 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
23053 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f
)))
23054 && is_mm_release (memmodel_from_int (INTVAL (mod_s
))))
23055 mod_s
= GEN_INT (MEMMODEL_ACQ_REL
);
23058 if (mode
== QImode
|| mode
== HImode
)
23061 rval
= gen_reg_rtx (r_mode
);
23066 /* The CAS insn requires oldval and rval overlap, but we need to
23067 have a copy of oldval saved across the operation to tell if
23068 the operation is successful. */
23069 if (reg_overlap_mentioned_p (rval
, oldval
))
23070 rval
= copy_to_mode_reg (r_mode
, oldval
);
23072 emit_move_insn (rval
, gen_lowpart (r_mode
, oldval
));
23074 emit_insn (gen_aarch64_compare_and_swap_lse (mode
, rval
, mem
,
23076 cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
23078 else if (TARGET_OUTLINE_ATOMICS
)
23080 /* Oldval must satisfy compare afterward. */
23081 if (!aarch64_plus_operand (oldval
, mode
))
23082 oldval
= force_reg (mode
, oldval
);
23083 rtx func
= aarch64_atomic_ool_func (mode
, mod_s
, &aarch64_ool_cas_names
);
23084 rval
= emit_library_call_value (func
, NULL_RTX
, LCT_NORMAL
, r_mode
,
23085 oldval
, mode
, newval
, mode
,
23086 XEXP (mem
, 0), Pmode
);
23087 cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
23091 /* The oldval predicate varies by mode. Test it and force to reg. */
23092 insn_code code
= code_for_aarch64_compare_and_swap (mode
);
23093 if (!insn_data
[code
].operand
[2].predicate (oldval
, mode
))
23094 oldval
= force_reg (mode
, oldval
);
23096 emit_insn (GEN_FCN (code
) (rval
, mem
, oldval
, newval
,
23097 is_weak
, mod_s
, mod_f
));
23098 cc_reg
= gen_rtx_REG (CCmode
, CC_REGNUM
);
23101 if (r_mode
!= mode
)
23102 rval
= gen_lowpart (mode
, rval
);
23103 emit_move_insn (operands
[1], rval
);
23105 x
= gen_rtx_EQ (SImode
, cc_reg
, const0_rtx
);
23106 emit_insn (gen_rtx_SET (bval
, x
));
23109 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
23110 sequence implementing an atomic operation. */
23113 aarch64_emit_post_barrier (enum memmodel model
)
23115 const enum memmodel base_model
= memmodel_base (model
);
23117 if (is_mm_sync (model
)
23118 && (base_model
== MEMMODEL_ACQUIRE
23119 || base_model
== MEMMODEL_ACQ_REL
23120 || base_model
== MEMMODEL_SEQ_CST
))
23122 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST
)));
23126 /* Split a compare and swap pattern. */
23129 aarch64_split_compare_and_swap (rtx operands
[])
23131 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
23132 gcc_assert (epilogue_completed
);
23134 rtx rval
, mem
, oldval
, newval
, scratch
, x
, model_rtx
;
23137 rtx_code_label
*label1
, *label2
;
23138 enum memmodel model
;
23140 rval
= operands
[0];
23142 oldval
= operands
[2];
23143 newval
= operands
[3];
23144 is_weak
= (operands
[4] != const0_rtx
);
23145 model_rtx
= operands
[5];
23146 scratch
= operands
[7];
23147 mode
= GET_MODE (mem
);
23148 model
= memmodel_from_int (INTVAL (model_rtx
));
23150 /* When OLDVAL is zero and we want the strong version we can emit a tighter
23153 LD[A]XR rval, [mem]
23155 ST[L]XR scratch, newval, [mem]
23156 CBNZ scratch, .label1
23159 bool strong_zero_p
= (!is_weak
&& !aarch64_track_speculation
&&
23160 oldval
== const0_rtx
&& mode
!= TImode
);
23165 label1
= gen_label_rtx ();
23166 emit_label (label1
);
23168 label2
= gen_label_rtx ();
23170 /* The initial load can be relaxed for a __sync operation since a final
23171 barrier will be emitted to stop code hoisting. */
23172 if (is_mm_sync (model
))
23173 aarch64_emit_load_exclusive (mode
, rval
, mem
, GEN_INT (MEMMODEL_RELAXED
));
23175 aarch64_emit_load_exclusive (mode
, rval
, mem
, model_rtx
);
23178 x
= gen_rtx_NE (VOIDmode
, rval
, const0_rtx
);
23181 rtx cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
23182 x
= gen_rtx_NE (VOIDmode
, cc_reg
, const0_rtx
);
23184 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
23185 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
23186 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
23188 aarch64_emit_store_exclusive (mode
, scratch
, mem
, newval
, model_rtx
);
23192 if (aarch64_track_speculation
)
23194 /* Emit an explicit compare instruction, so that we can correctly
23195 track the condition codes. */
23196 rtx cc_reg
= aarch64_gen_compare_reg (NE
, scratch
, const0_rtx
);
23197 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
23200 x
= gen_rtx_NE (VOIDmode
, scratch
, const0_rtx
);
23202 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
23203 gen_rtx_LABEL_REF (Pmode
, label1
), pc_rtx
);
23204 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
23207 aarch64_gen_compare_reg (NE
, scratch
, const0_rtx
);
23209 emit_label (label2
);
23211 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
23212 to set the condition flags. If this is not used it will be removed by
23215 aarch64_gen_compare_reg (NE
, rval
, const0_rtx
);
23217 /* Emit any final barrier needed for a __sync operation. */
23218 if (is_mm_sync (model
))
23219 aarch64_emit_post_barrier (model
);
23222 /* Split an atomic operation. */
23225 aarch64_split_atomic_op (enum rtx_code code
, rtx old_out
, rtx new_out
, rtx mem
,
23226 rtx value
, rtx model_rtx
, rtx cond
)
23228 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
23229 gcc_assert (epilogue_completed
);
23231 machine_mode mode
= GET_MODE (mem
);
23232 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
23233 const enum memmodel model
= memmodel_from_int (INTVAL (model_rtx
));
23234 const bool is_sync
= is_mm_sync (model
);
23235 rtx_code_label
*label
;
23238 /* Split the atomic operation into a sequence. */
23239 label
= gen_label_rtx ();
23240 emit_label (label
);
23243 new_out
= gen_lowpart (wmode
, new_out
);
23245 old_out
= gen_lowpart (wmode
, old_out
);
23248 value
= simplify_gen_subreg (wmode
, value
, mode
, 0);
23250 /* The initial load can be relaxed for a __sync operation since a final
23251 barrier will be emitted to stop code hoisting. */
23253 aarch64_emit_load_exclusive (mode
, old_out
, mem
,
23254 GEN_INT (MEMMODEL_RELAXED
));
23256 aarch64_emit_load_exclusive (mode
, old_out
, mem
, model_rtx
);
23265 x
= gen_rtx_AND (wmode
, old_out
, value
);
23266 emit_insn (gen_rtx_SET (new_out
, x
));
23267 x
= gen_rtx_NOT (wmode
, new_out
);
23268 emit_insn (gen_rtx_SET (new_out
, x
));
23272 if (CONST_INT_P (value
))
23274 value
= GEN_INT (-UINTVAL (value
));
23277 /* Fall through. */
23280 x
= gen_rtx_fmt_ee (code
, wmode
, old_out
, value
);
23281 emit_insn (gen_rtx_SET (new_out
, x
));
23285 aarch64_emit_store_exclusive (mode
, cond
, mem
,
23286 gen_lowpart (mode
, new_out
), model_rtx
);
23288 if (aarch64_track_speculation
)
23290 /* Emit an explicit compare instruction, so that we can correctly
23291 track the condition codes. */
23292 rtx cc_reg
= aarch64_gen_compare_reg (NE
, cond
, const0_rtx
);
23293 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
23296 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
23298 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
23299 gen_rtx_LABEL_REF (Pmode
, label
), pc_rtx
);
23300 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
23302 /* Emit any final barrier needed for a __sync operation. */
23304 aarch64_emit_post_barrier (model
);
23308 aarch64_init_libfuncs (void)
23310 /* Half-precision float operations. The compiler handles all operations
23311 with NULL libfuncs by converting to SFmode. */
23314 set_conv_libfunc (trunc_optab
, HFmode
, SFmode
, "__gnu_f2h_ieee");
23315 set_conv_libfunc (sext_optab
, SFmode
, HFmode
, "__gnu_h2f_ieee");
23318 set_optab_libfunc (add_optab
, HFmode
, NULL
);
23319 set_optab_libfunc (sdiv_optab
, HFmode
, NULL
);
23320 set_optab_libfunc (smul_optab
, HFmode
, NULL
);
23321 set_optab_libfunc (neg_optab
, HFmode
, NULL
);
23322 set_optab_libfunc (sub_optab
, HFmode
, NULL
);
23325 set_optab_libfunc (eq_optab
, HFmode
, NULL
);
23326 set_optab_libfunc (ne_optab
, HFmode
, NULL
);
23327 set_optab_libfunc (lt_optab
, HFmode
, NULL
);
23328 set_optab_libfunc (le_optab
, HFmode
, NULL
);
23329 set_optab_libfunc (ge_optab
, HFmode
, NULL
);
23330 set_optab_libfunc (gt_optab
, HFmode
, NULL
);
23331 set_optab_libfunc (unord_optab
, HFmode
, NULL
);
23334 /* Target hook for c_mode_for_suffix. */
23335 static machine_mode
23336 aarch64_c_mode_for_suffix (char suffix
)
/* We can only represent floating point constants which will fit in
   "quarter-precision" values.  These values are characterised by
   a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
   by:

   (-1)^s * (n/16) * 2^r

   where:
     's' is the sign bit,
     'n' is an integer in the range 16 <= n <= 31, and
     'r' is an integer in the range -3 <= r <= 4.  */
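/* Worked examples (editorial illustration, not part of the original source):
   s = 0, n = 16, r = 0 gives (16/16) * 2^0 = 1.0 and s = 0, n = 20, r = 0
   gives (20/16) * 2^0 = 1.25; the smallest positive representable value is
   (16/16) * 2^-3 = 0.125 and the largest is (31/16) * 2^4 = 31.0.  */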
/* Return true iff X can be represented as a quarter-precision
   floating point immediate operand.  Note, we cannot represent 0.0.  */
23359 aarch64_float_const_representable_p (rtx x
)
23361 /* This represents our current view of how many bits
23362 make up the mantissa. */
23363 int point_pos
= 2 * HOST_BITS_PER_WIDE_INT
- 1;
23365 unsigned HOST_WIDE_INT mantissa
, mask
;
23366 REAL_VALUE_TYPE r
, m
;
23369 x
= unwrap_const_vec_duplicate (x
);
23370 if (!CONST_DOUBLE_P (x
))
23373 if (GET_MODE (x
) == VOIDmode
23374 || (GET_MODE (x
) == HFmode
&& !TARGET_FP_F16INST
))
23377 r
= *CONST_DOUBLE_REAL_VALUE (x
);
23379 /* We cannot represent infinities, NaNs or +/-zero. We won't
23380 know if we have +zero until we analyse the mantissa, but we
23381 can reject the other invalid values. */
23382 if (REAL_VALUE_ISINF (r
) || REAL_VALUE_ISNAN (r
)
23383 || REAL_VALUE_MINUS_ZERO (r
))
23386 /* Extract exponent. */
23387 r
= real_value_abs (&r
);
23388 exponent
= REAL_EXP (&r
);
23390 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
23391 highest (sign) bit, with a fixed binary point at bit point_pos.
23392 m1 holds the low part of the mantissa, m2 the high part.
23393 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
23394 bits for the mantissa, this can fail (low bits will be lost). */
23395 real_ldexp (&m
, &r
, point_pos
- exponent
);
23396 wide_int w
= real_to_integer (&m
, &fail
, HOST_BITS_PER_WIDE_INT
* 2);
23398 /* If the low part of the mantissa has bits set we cannot represent
23400 if (w
.ulow () != 0)
23402 /* We have rejected the lower HOST_WIDE_INT, so update our
23403 understanding of how many bits lie in the mantissa and
23404 look only at the high HOST_WIDE_INT. */
23405 mantissa
= w
.elt (1);
23406 point_pos
-= HOST_BITS_PER_WIDE_INT
;
23408 /* We can only represent values with a mantissa of the form 1.xxxx. */
23409 mask
= ((unsigned HOST_WIDE_INT
)1 << (point_pos
- 5)) - 1;
23410 if ((mantissa
& mask
) != 0)
23413 /* Having filtered unrepresentable values, we may now remove all
23414 but the highest 5 bits. */
23415 mantissa
>>= point_pos
- 5;
23417 /* We cannot represent the value 0.0, so reject it. This is handled
23422 /* Then, as bit 4 is always set, we can mask it off, leaving
23423 the mantissa in the range [0, 15]. */
23424 mantissa
&= ~(1 << 4);
23425 gcc_assert (mantissa
<= 15);
23427 /* GCC internally does not use IEEE754-like encoding (where normalized
23428 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.cc).
23429 Our mantissa values are shifted 4 places to the left relative to
23430 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
23431 by 5 places to correct for GCC's representation. */
23432 exponent
= 5 - exponent
;
23434 return (exponent
>= 0 && exponent
<= 7);
23437 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
23438 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
23439 output MOVI/MVNI, ORR or BIC immediate. */
23441 aarch64_output_simd_mov_immediate (rtx const_vector
, unsigned width
,
23442 enum simd_immediate_check which
)
23445 static char templ
[40];
23446 const char *mnemonic
;
23447 const char *shift_op
;
23448 unsigned int lane_count
= 0;
23451 struct simd_immediate_info info
;
23453 /* This will return true to show const_vector is legal for use as either
23454 a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
23455 It will also update INFO to show how the immediate should be generated.
23456 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
23457 is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
, which
);
23458 gcc_assert (is_valid
);
23460 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
23461 lane_count
= width
/ GET_MODE_BITSIZE (info
.elt_mode
);
23463 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
23465 gcc_assert (info
.insn
== simd_immediate_info::MOV
23466 && info
.u
.mov
.shift
== 0);
23467 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
23468 move immediate path. */
23469 if (aarch64_float_const_zero_rtx_p (info
.u
.mov
.value
))
23470 info
.u
.mov
.value
= GEN_INT (0);
23473 const unsigned int buf_size
= 20;
23474 char float_buf
[buf_size
] = {'\0'};
23475 real_to_decimal_for_mode (float_buf
,
23476 CONST_DOUBLE_REAL_VALUE (info
.u
.mov
.value
),
23477 buf_size
, buf_size
, 1, info
.elt_mode
);
23479 if (lane_count
== 1)
23480 snprintf (templ
, sizeof (templ
), "fmov\t%%d0, %s", float_buf
);
23482 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%d%c, %s",
23483 lane_count
, element_char
, float_buf
);
23488 gcc_assert (CONST_INT_P (info
.u
.mov
.value
));
23490 if (which
== AARCH64_CHECK_MOV
)
23492 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "mvni" : "movi";
23493 shift_op
= (info
.u
.mov
.modifier
== simd_immediate_info::MSL
23495 if (lane_count
== 1)
23496 snprintf (templ
, sizeof (templ
), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX
,
23497 mnemonic
, UINTVAL (info
.u
.mov
.value
));
23498 else if (info
.u
.mov
.shift
)
23499 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
23500 HOST_WIDE_INT_PRINT_HEX
", %s %d", mnemonic
, lane_count
,
23501 element_char
, UINTVAL (info
.u
.mov
.value
), shift_op
,
23504 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
23505 HOST_WIDE_INT_PRINT_HEX
, mnemonic
, lane_count
,
23506 element_char
, UINTVAL (info
.u
.mov
.value
));
23510 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
23511 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "bic" : "orr";
23512 if (info
.u
.mov
.shift
)
23513 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
23514 HOST_WIDE_INT_PRINT_DEC
", %s #%d", mnemonic
, lane_count
,
23515 element_char
, UINTVAL (info
.u
.mov
.value
), "lsl",
23518 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
23519 HOST_WIDE_INT_PRINT_DEC
, mnemonic
, lane_count
,
23520 element_char
, UINTVAL (info
.u
.mov
.value
));
23526 aarch64_output_scalar_simd_mov_immediate (rtx immediate
, scalar_int_mode mode
)
23529 /* If a floating point number was passed and we desire to use it in an
23530 integer mode do the conversion to integer. */
23531 if (CONST_DOUBLE_P (immediate
) && GET_MODE_CLASS (mode
) == MODE_INT
)
23533 unsigned HOST_WIDE_INT ival
;
23534 if (!aarch64_reinterpret_float_as_int (immediate
, &ival
))
23535 gcc_unreachable ();
23536 immediate
= gen_int_mode (ival
, mode
);
23539 machine_mode vmode
;
23540 /* use a 64 bit mode for everything except for DI/DF/DD mode, where we use
23541 a 128 bit vector mode. */
23542 int width
= GET_MODE_BITSIZE (mode
) == 64 ? 128 : 64;
23544 vmode
= aarch64_simd_container_mode (mode
, width
);
23545 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (immediate
));
23546 return aarch64_output_simd_mov_immediate (v_op
, width
);
/* Return the output string to use for moving immediate CONST_VECTOR
   into an SVE register.  */

char *
aarch64_output_sve_mov_immediate (rtx const_vector)
{
  static char templ[40];
  struct simd_immediate_info info;
  char element_char;

  bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
  gcc_assert (is_valid);

  element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));

  machine_mode vec_mode = GET_MODE (const_vector);
  if (aarch64_sve_pred_mode_p (vec_mode))
    {
      static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
      if (info.insn == simd_immediate_info::MOV)
	{
	  gcc_assert (info.u.mov.value == const0_rtx);
	  snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
	}
      else
	{
	  gcc_assert (info.insn == simd_immediate_info::PTRUE);
	  unsigned int total_bytes;
	  if (info.u.pattern == AARCH64_SV_ALL
	      && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
	    snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
		      total_bytes / GET_MODE_SIZE (info.elt_mode));
	  else
	    snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
		      svpattern_token (info.u.pattern));
	}
      return buf;
    }

  if (info.insn == simd_immediate_info::INDEX)
    {
      snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
		HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
		element_char, INTVAL (info.u.index.base),
		INTVAL (info.u.index.step));
      return templ;
    }

  if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
    {
      if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
	info.u.mov.value = GEN_INT (0);
      else
	{
	  const int buf_size = 20;
	  char float_buf[buf_size] = {};
	  real_to_decimal_for_mode (float_buf,
				    CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
				    buf_size, buf_size, 1, info.elt_mode);
	  snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
		    element_char, float_buf);
	  return templ;
	}
    }

  snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
	    element_char, INTVAL (info.u.mov.value));
  return templ;
}
/* Return the asm template for a PTRUES.  CONST_UNSPEC is the
   aarch64_sve_ptrue_svpattern_immediate that describes the predicate
   that it is a PTRUE for.  */

char *
aarch64_output_sve_ptrues (rtx const_unspec)
{
  static char templ[40];

  struct simd_immediate_info info;
  bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
  gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);

  char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
  snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
	    svpattern_token (info.u.pattern));
  return templ;
}
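/* An AARCH64_SV_ALL pattern on .s elements would typically be printed as
   "ptrues\tp0.s, all", with the pattern name supplied by
   svpattern_token.  */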
/* Split operands into moves from op[1] + op[2] into op[0].  */

void
aarch64_split_combinev16qi (rtx operands[3])
{
  unsigned int dest = REGNO (operands[0]);
  unsigned int src1 = REGNO (operands[1]);
  unsigned int src2 = REGNO (operands[2]);
  machine_mode halfmode = GET_MODE (operands[1]);
  unsigned int halfregs = REG_NREGS (operands[1]);
  rtx destlo, desthi;

  gcc_assert (halfmode == V16QImode);

  if (src1 == dest && src2 == dest + halfregs)
    {
      /* No-op move.  Can't split to nothing; emit something.  */
      emit_note (NOTE_INSN_DELETED);
      return;
    }

  /* Preserve register attributes for variable tracking.  */
  destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
  desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
			       GET_MODE_SIZE (halfmode));

  /* Special case of reversed high/low parts.  */
  if (reg_overlap_mentioned_p (operands[2], destlo)
      && reg_overlap_mentioned_p (operands[1], desthi))
    {
      emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
      emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
      emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
    }
  else if (!reg_overlap_mentioned_p (operands[2], destlo))
    {
      /* Try to avoid unnecessary moves if part of the result
	 is in the right place already.  */
      if (src1 != dest)
	emit_move_insn (destlo, operands[1]);
      if (src2 != dest + halfregs)
	emit_move_insn (desthi, operands[2]);
    }
  else
    {
      if (src2 != dest + halfregs)
	emit_move_insn (desthi, operands[2]);
      if (src1 != dest)
	emit_move_insn (destlo, operands[1]);
    }
}
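/* Note: the three XORs in the reversed case above exchange the two source
   halves in place, which avoids the need for a scratch register when both
   inputs overlap the destination.  */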
/* vec_perm support.  */

struct expand_vec_perm_d
{
  rtx target, op0, op1;
  vec_perm_indices perm;
  machine_mode vmode;
  machine_mode op_mode;
  unsigned int vec_flags;
  unsigned int op_vec_flags;
  bool one_vector_p;
  bool testing_p;
};

static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
/* Generate a variable permutation.  */

static void
aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
{
  machine_mode vmode = GET_MODE (target);
  bool one_vector_p = rtx_equal_p (op0, op1);

  gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
  gcc_checking_assert (GET_MODE (op0) == vmode);
  gcc_checking_assert (GET_MODE (op1) == vmode);
  gcc_checking_assert (GET_MODE (sel) == vmode);
  gcc_checking_assert (TARGET_SIMD);

  if (one_vector_p)
    {
      if (vmode == V8QImode)
	{
	  /* Expand the argument to a V16QI mode by duplicating it.  */
	  rtx pair = gen_reg_rtx (V16QImode);
	  emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
	  emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
	}
      else
	emit_insn (gen_aarch64_qtbl1v16qi (target, op0, sel));
    }
  else
    {
      rtx pair;

      if (vmode == V8QImode)
	{
	  pair = gen_reg_rtx (V16QImode);
	  emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
	  emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
	}
      else
	{
	  pair = gen_reg_rtx (V2x16QImode);
	  emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
	  emit_insn (gen_aarch64_qtbl2v16qi (target, pair, sel));
	}
    }
}
/* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
   NELT is the number of elements in the vector.  */

void
aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
			 unsigned int nelt)
{
  machine_mode vmode = GET_MODE (target);
  bool one_vector_p = rtx_equal_p (op0, op1);
  rtx mask;

  /* The TBL instruction does not use a modulo index, so we must take care
     of that ourselves.  */
  mask = aarch64_simd_gen_const_vector_dup (vmode,
					    one_vector_p ? nelt - 1 : 2 * nelt - 1);
  sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);

  /* For big-endian, we also need to reverse the index within the vector
     (but not which vector).  */
  if (BYTES_BIG_ENDIAN)
    {
      /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
      if (!one_vector_p)
	mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
      sel = expand_simple_binop (vmode, XOR, sel, mask,
				 NULL, 0, OPTAB_LIB_WIDEN);
    }
  aarch64_expand_vec_perm_1 (target, op0, op1, sel);
}
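/* Example of the big-endian correction above: with nelt == 8 (V8QI), a
   selector index of 3 becomes 3 ^ 7 == 4, i.e. the index is mirrored
   within its vector while the choice of vector (bit 3 of the index) is
   left untouched.  */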
/* Generate (set TARGET (unspec [OP0 OP1] CODE)).  */

static void
emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
{
  emit_insn (gen_rtx_SET (target,
			  gen_rtx_UNSPEC (GET_MODE (target),
					  gen_rtvec (2, op0, op1), code)));
}
/* Expand an SVE vec_perm with the given operands.  */

void
aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
{
  machine_mode data_mode = GET_MODE (target);
  machine_mode sel_mode = GET_MODE (sel);
  /* Enforced by the pattern condition.  */
  int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();

  /* Note: vec_perm indices are supposed to wrap when they go beyond the
     size of the two value vectors, i.e. the upper bits of the indices
     are effectively ignored.  SVE TBL instead produces 0 for any
     out-of-range indices, so we need to modulo all the vec_perm indices
     to ensure they are all in range.  */
  rtx sel_reg = force_reg (sel_mode, sel);

  /* Check if the sel only references the first values vector.  */
  if (CONST_VECTOR_P (sel)
      && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
    {
      emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
      return;
    }

  /* Check if the two values vectors are the same.  */
  if (rtx_equal_p (op0, op1))
    {
      rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
      rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
					 NULL, 0, OPTAB_DIRECT);
      emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
      return;
    }

  /* Run TBL on each value vector and combine the results.  */

  rtx res0 = gen_reg_rtx (data_mode);
  rtx res1 = gen_reg_rtx (data_mode);
  rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
  if (!CONST_VECTOR_P (sel)
      || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
    {
      rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
						       2 * nunits - 1);
      sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
				     NULL, 0, OPTAB_DIRECT);
    }
  emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
  rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
				     NULL, 0, OPTAB_DIRECT);
  emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
  if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
    emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
  else
    emit_unspec2 (target, UNSPEC_IORF, res0, res1);
}
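/* Worked example with nunits == 4: a selector value of 6 is first reduced
   modulo 8 into the range [0, 7].  The first TBL sees index 6, which is out
   of range for op0 and therefore yields 0; after adding -nunits the second
   TBL sees index 2 and selects element 2 of op1.  ORing the two partial
   results gives the expected element.  */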
23852 /* Recognize patterns suitable for the TRN instructions. */
23854 aarch64_evpc_trn (struct expand_vec_perm_d
*d
)
23857 poly_uint64 nelt
= d
->perm
.length ();
23859 machine_mode vmode
= d
->vmode
;
23861 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
23864 /* Note that these are little-endian tests.
23865 We correct for big-endian later. */
23866 if (!d
->perm
[0].is_constant (&odd
)
23867 || (odd
!= 0 && odd
!= 1)
23868 || !d
->perm
.series_p (0, 2, odd
, 2)
23869 || !d
->perm
.series_p (1, 2, nelt
+ odd
, 2))
23878 /* We don't need a big-endian lane correction for SVE; see the comment
23879 at the head of aarch64-sve.md for details. */
23880 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
23882 std::swap (in0
, in1
);
23887 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
23888 odd
? UNSPEC_TRN2
: UNSPEC_TRN1
));
23892 /* Try to re-encode the PERM constant so it combines odd and even elements.
23893 This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI.
23894 We retry with this new constant with the full suite of patterns. */
23896 aarch64_evpc_reencode (struct expand_vec_perm_d
*d
)
23898 expand_vec_perm_d newd
;
23899 unsigned HOST_WIDE_INT nelt
;
23901 if (d
->vec_flags
!= VEC_ADVSIMD
)
23904 /* Get the new mode. Always twice the size of the inner
23905 and half the elements. */
23906 poly_uint64 vec_bits
= GET_MODE_BITSIZE (d
->vmode
);
23907 unsigned int new_elt_bits
= GET_MODE_UNIT_BITSIZE (d
->vmode
) * 2;
23908 auto new_elt_mode
= int_mode_for_size (new_elt_bits
, false).require ();
23909 machine_mode new_mode
= aarch64_simd_container_mode (new_elt_mode
, vec_bits
);
23911 if (new_mode
== word_mode
)
23914 /* to_constant is safe since this routine is specific to Advanced SIMD
23916 nelt
= d
->perm
.length ().to_constant ();
23918 vec_perm_builder newpermconst
;
23919 newpermconst
.new_vector (nelt
/ 2, nelt
/ 2, 1);
23921 /* Convert the perm constant if we can. Require even, odd as the pairs. */
23922 for (unsigned int i
= 0; i
< nelt
; i
+= 2)
23924 poly_int64 elt0
= d
->perm
[i
];
23925 poly_int64 elt1
= d
->perm
[i
+ 1];
23927 if (!multiple_p (elt0
, 2, &newelt
) || maybe_ne (elt0
+ 1, elt1
))
23929 newpermconst
.quick_push (newelt
.to_constant ());
23931 newpermconst
.finalize ();
23933 newd
.vmode
= new_mode
;
23934 newd
.vec_flags
= VEC_ADVSIMD
;
23935 newd
.op_mode
= newd
.vmode
;
23936 newd
.op_vec_flags
= newd
.vec_flags
;
23937 newd
.target
= d
->target
? gen_lowpart (new_mode
, d
->target
) : NULL
;
23938 newd
.op0
= d
->op0
? gen_lowpart (new_mode
, d
->op0
) : NULL
;
23939 newd
.op1
= d
->op1
? gen_lowpart (new_mode
, d
->op1
) : NULL
;
23940 newd
.testing_p
= d
->testing_p
;
23941 newd
.one_vector_p
= d
->one_vector_p
;
23943 newd
.perm
.new_vector (newpermconst
, newd
.one_vector_p
? 1 : 2, nelt
/ 2);
23944 return aarch64_expand_vec_perm_const_1 (&newd
);
23947 /* Recognize patterns suitable for the UZP instructions. */
23949 aarch64_evpc_uzp (struct expand_vec_perm_d
*d
)
23953 machine_mode vmode
= d
->vmode
;
23955 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
23958 /* Note that these are little-endian tests.
23959 We correct for big-endian later. */
23960 if (!d
->perm
[0].is_constant (&odd
)
23961 || (odd
!= 0 && odd
!= 1)
23962 || !d
->perm
.series_p (0, 1, odd
, 2))
23971 /* We don't need a big-endian lane correction for SVE; see the comment
23972 at the head of aarch64-sve.md for details. */
23973 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
23975 std::swap (in0
, in1
);
23980 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
23981 odd
? UNSPEC_UZP2
: UNSPEC_UZP1
));
23985 /* Recognize patterns suitable for the ZIP instructions. */
23987 aarch64_evpc_zip (struct expand_vec_perm_d
*d
)
23990 poly_uint64 nelt
= d
->perm
.length ();
23992 machine_mode vmode
= d
->vmode
;
23994 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
23997 /* Note that these are little-endian tests.
23998 We correct for big-endian later. */
23999 poly_uint64 first
= d
->perm
[0];
24000 if ((maybe_ne (first
, 0U) && maybe_ne (first
* 2, nelt
))
24001 || !d
->perm
.series_p (0, 2, first
, 1)
24002 || !d
->perm
.series_p (1, 2, first
+ nelt
, 1))
24004 high
= maybe_ne (first
, 0U);
24012 /* We don't need a big-endian lane correction for SVE; see the comment
24013 at the head of aarch64-sve.md for details. */
24014 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
24016 std::swap (in0
, in1
);
24021 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
24022 high
? UNSPEC_ZIP2
: UNSPEC_ZIP1
));
24026 /* Recognize patterns for the EXT insn. */
24029 aarch64_evpc_ext (struct expand_vec_perm_d
*d
)
24031 HOST_WIDE_INT location
;
24034 /* The first element always refers to the first vector.
24035 Check if the extracted indices are increasing by one. */
24036 if (d
->vec_flags
== VEC_SVE_PRED
24037 || !d
->perm
[0].is_constant (&location
)
24038 || !d
->perm
.series_p (0, 1, location
, 1))
24045 /* The case where (location == 0) is a no-op for both big- and little-endian,
24046 and is removed by the mid-end at optimization levels -O1 and higher.
24048 We don't need a big-endian lane correction for SVE; see the comment
24049 at the head of aarch64-sve.md for details. */
24050 if (BYTES_BIG_ENDIAN
&& location
!= 0 && d
->vec_flags
== VEC_ADVSIMD
)
24052 /* After setup, we want the high elements of the first vector (stored
24053 at the LSB end of the register), and the low elements of the second
24054 vector (stored at the MSB end of the register). So swap. */
24055 std::swap (d
->op0
, d
->op1
);
24056 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
24057 to_constant () is safe since this is restricted to Advanced SIMD
24059 location
= d
->perm
.length ().to_constant () - location
;
24062 offset
= GEN_INT (location
);
24063 emit_set_insn (d
->target
,
24064 gen_rtx_UNSPEC (d
->vmode
,
24065 gen_rtvec (3, d
->op0
, d
->op1
, offset
),
24070 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
24071 within each 64-bit, 32-bit or 16-bit granule. */
24074 aarch64_evpc_rev_local (struct expand_vec_perm_d
*d
)
24076 HOST_WIDE_INT diff
;
24077 unsigned int i
, size
, unspec
;
24078 machine_mode pred_mode
;
24080 if (d
->vec_flags
== VEC_SVE_PRED
24081 || !d
->one_vector_p
24082 || !d
->perm
[0].is_constant (&diff
)
24086 if (d
->vec_flags
& VEC_SVE_DATA
)
24087 size
= (diff
+ 1) * aarch64_sve_container_bits (d
->vmode
);
24089 size
= (diff
+ 1) * GET_MODE_UNIT_BITSIZE (d
->vmode
);
24092 unspec
= UNSPEC_REV64
;
24093 pred_mode
= VNx2BImode
;
24095 else if (size
== 32)
24097 unspec
= UNSPEC_REV32
;
24098 pred_mode
= VNx4BImode
;
24100 else if (size
== 16)
24102 unspec
= UNSPEC_REV16
;
24103 pred_mode
= VNx8BImode
;
24108 unsigned int step
= diff
+ 1;
24109 for (i
= 0; i
< step
; ++i
)
24110 if (!d
->perm
.series_p (i
, step
, diff
- i
, step
))
24117 if (d
->vec_flags
& VEC_SVE_DATA
)
24119 rtx pred
= aarch64_ptrue_reg (pred_mode
);
24120 emit_insn (gen_aarch64_sve_revbhw (d
->vmode
, pred_mode
,
24121 d
->target
, pred
, d
->op0
));
24124 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), unspec
);
24125 emit_set_insn (d
->target
, src
);
/* Recognize patterns for the REV insn, which reverses elements within
   a full vector.  */

static bool
aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
{
  poly_uint64 nelt = d->perm.length ();

  if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
    return false;

  if (!d->perm.series_p (0, 1, nelt - 1, -1))
    return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
  emit_set_insn (d->target, src);
  return true;
}
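/* For example, the constant permutation {nelt-1, ..., 2, 1, 0} on an SVE
   vector satisfies the series_p test above and is emitted as a single
   REV instruction.  */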
24153 aarch64_evpc_dup (struct expand_vec_perm_d
*d
)
24155 rtx out
= d
->target
;
24158 machine_mode vmode
= d
->vmode
;
24161 if (d
->vec_flags
== VEC_SVE_PRED
24162 || d
->perm
.encoding ().encoded_nelts () != 1
24163 || !d
->perm
[0].is_constant (&elt
))
24166 if ((d
->vec_flags
& VEC_SVE_DATA
)
24167 && elt
* (aarch64_sve_container_bits (vmode
) / 8) >= 64)
24174 /* The generic preparation in aarch64_expand_vec_perm_const_1
24175 swaps the operand order and the permute indices if it finds
24176 d->perm[0] to be in the second operand. Thus, we can always
24177 use d->op0 and need not do any extra arithmetic to get the
24178 correct lane number. */
24180 lane
= GEN_INT (elt
); /* The pattern corrects for big-endian. */
24182 rtx parallel
= gen_rtx_PARALLEL (vmode
, gen_rtvec (1, lane
));
24183 rtx select
= gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode
), in0
, parallel
);
24184 emit_set_insn (out
, gen_rtx_VEC_DUPLICATE (vmode
, select
));
24189 aarch64_evpc_tbl (struct expand_vec_perm_d
*d
)
24191 rtx rperm
[MAX_COMPILE_TIME_VEC_BYTES
], sel
;
24192 machine_mode vmode
= d
->vmode
;
24194 /* Make sure that the indices are constant. */
24195 unsigned int encoded_nelts
= d
->perm
.encoding ().encoded_nelts ();
24196 for (unsigned int i
= 0; i
< encoded_nelts
; ++i
)
24197 if (!d
->perm
[i
].is_constant ())
24203 /* Generic code will try constant permutation twice. Once with the
24204 original mode and again with the elements lowered to QImode.
24205 So wait and don't do the selector expansion ourselves. */
24206 if (vmode
!= V8QImode
&& vmode
!= V16QImode
)
24209 /* to_constant is safe since this routine is specific to Advanced SIMD
24211 unsigned int nelt
= d
->perm
.length ().to_constant ();
24212 for (unsigned int i
= 0; i
< nelt
; ++i
)
24213 /* If big-endian and two vectors we end up with a weird mixed-endian
24214 mode on NEON. Reverse the index within each word but not the word
24215 itself. to_constant is safe because we checked is_constant above. */
24216 rperm
[i
] = GEN_INT (BYTES_BIG_ENDIAN
24217 ? d
->perm
[i
].to_constant () ^ (nelt
- 1)
24218 : d
->perm
[i
].to_constant ());
24220 sel
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
24221 sel
= force_reg (vmode
, sel
);
24223 aarch64_expand_vec_perm_1 (d
->target
, d
->op0
, d
->op1
, sel
);
24227 /* Try to implement D using an SVE TBL instruction. */
24230 aarch64_evpc_sve_tbl (struct expand_vec_perm_d
*d
)
24232 unsigned HOST_WIDE_INT nelt
;
24234 /* Permuting two variable-length vectors could overflow the
24236 if (!d
->one_vector_p
&& !d
->perm
.length ().is_constant (&nelt
))
24242 machine_mode sel_mode
= related_int_vector_mode (d
->vmode
).require ();
24243 rtx sel
= vec_perm_indices_to_rtx (sel_mode
, d
->perm
);
24244 if (d
->one_vector_p
)
24245 emit_unspec2 (d
->target
, UNSPEC_TBL
, d
->op0
, force_reg (sel_mode
, sel
));
24247 aarch64_expand_sve_vec_perm (d
->target
, d
->op0
, d
->op1
, sel
);
24251 /* Try to implement D using SVE dup instruction. */
24254 aarch64_evpc_sve_dup (struct expand_vec_perm_d
*d
)
24256 if (BYTES_BIG_ENDIAN
24257 || !d
->one_vector_p
24258 || d
->vec_flags
!= VEC_SVE_DATA
24259 || d
->op_vec_flags
!= VEC_ADVSIMD
24260 || d
->perm
.encoding ().nelts_per_pattern () != 1
24261 || !known_eq (d
->perm
.encoding ().npatterns (),
24262 GET_MODE_NUNITS (d
->op_mode
))
24263 || !known_eq (GET_MODE_BITSIZE (d
->op_mode
), 128))
24266 int npatterns
= d
->perm
.encoding ().npatterns ();
24267 for (int i
= 0; i
< npatterns
; i
++)
24268 if (!known_eq (d
->perm
[i
], i
))
24274 aarch64_expand_sve_dupq (d
->target
, GET_MODE (d
->target
), d
->op0
);
24278 /* Try to implement D using SVE SEL instruction. */
24281 aarch64_evpc_sel (struct expand_vec_perm_d
*d
)
24283 machine_mode vmode
= d
->vmode
;
24284 int unit_size
= GET_MODE_UNIT_SIZE (vmode
);
24286 if (d
->vec_flags
!= VEC_SVE_DATA
24290 int n_patterns
= d
->perm
.encoding ().npatterns ();
24291 poly_int64 vec_len
= d
->perm
.length ();
24293 for (int i
= 0; i
< n_patterns
; ++i
)
24294 if (!known_eq (d
->perm
[i
], i
)
24295 && !known_eq (d
->perm
[i
], vec_len
+ i
))
24298 for (int i
= n_patterns
; i
< n_patterns
* 2; i
++)
24299 if (!d
->perm
.series_p (i
, n_patterns
, i
, n_patterns
)
24300 && !d
->perm
.series_p (i
, n_patterns
, vec_len
+ i
, n_patterns
))
24306 machine_mode pred_mode
= aarch64_sve_pred_mode (vmode
);
24308 /* Build a predicate that is true when op0 elements should be used. */
24309 rtx_vector_builder
builder (pred_mode
, n_patterns
, 2);
24310 for (int i
= 0; i
< n_patterns
* 2; i
++)
24312 rtx elem
= known_eq (d
->perm
[i
], i
) ? CONST1_RTX (BImode
)
24313 : CONST0_RTX (BImode
);
24314 builder
.quick_push (elem
);
24317 rtx const_vec
= builder
.build ();
24318 rtx pred
= force_reg (pred_mode
, const_vec
);
24319 /* TARGET = PRED ? OP0 : OP1. */
24320 emit_insn (gen_vcond_mask (vmode
, vmode
, d
->target
, d
->op0
, d
->op1
, pred
));
24324 /* Recognize patterns suitable for the INS instructions. */
24326 aarch64_evpc_ins (struct expand_vec_perm_d
*d
)
24328 machine_mode mode
= d
->vmode
;
24329 unsigned HOST_WIDE_INT nelt
;
24331 if (d
->vec_flags
!= VEC_ADVSIMD
)
24334 /* to_constant is safe since this routine is specific to Advanced SIMD
24336 nelt
= d
->perm
.length ().to_constant ();
24339 HOST_WIDE_INT idx
= -1;
24341 for (unsigned HOST_WIDE_INT i
= 0; i
< nelt
; i
++)
24344 if (!d
->perm
[i
].is_constant (&elt
))
24346 if (elt
== (HOST_WIDE_INT
) i
)
24359 for (unsigned HOST_WIDE_INT i
= 0; i
< nelt
; i
++)
24361 if (d
->perm
[i
].to_constant () == (HOST_WIDE_INT
) (i
+ nelt
))
24375 gcc_assert (idx
!= -1);
24377 unsigned extractindex
= d
->perm
[idx
].to_constant ();
24378 rtx extractv
= d
->op0
;
24379 if (extractindex
>= nelt
)
24382 extractindex
-= nelt
;
24384 gcc_assert (extractindex
< nelt
);
24386 insn_code icode
= code_for_aarch64_simd_vec_copy_lane (mode
);
24387 expand_operand ops
[5];
24388 create_output_operand (&ops
[0], d
->target
, mode
);
24389 create_input_operand (&ops
[1], insv
, mode
);
24390 create_integer_operand (&ops
[2], 1 << idx
);
24391 create_input_operand (&ops
[3], extractv
, mode
);
24392 create_integer_operand (&ops
[4], extractindex
);
24393 expand_insn (icode
, 5, ops
);
24399 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d
*d
)
24401 gcc_assert (d
->op_mode
!= E_VOIDmode
);
24403 /* The pattern matching functions above are written to look for a small
24404 number to begin the sequence (0, 1, N/2). If we begin with an index
24405 from the second operand, we can swap the operands. */
24406 poly_int64 nelt
= d
->perm
.length ();
24407 if (known_ge (d
->perm
[0], nelt
))
24409 d
->perm
.rotate_inputs (1);
24410 std::swap (d
->op0
, d
->op1
);
24413 if (((d
->vec_flags
== VEC_ADVSIMD
&& TARGET_SIMD
)
24414 || d
->vec_flags
== VEC_SVE_DATA
24415 || d
->vec_flags
== (VEC_SVE_DATA
| VEC_PARTIAL
)
24416 || d
->vec_flags
== VEC_SVE_PRED
)
24417 && known_gt (nelt
, 1))
24419 if (d
->vmode
== d
->op_mode
)
24421 if (aarch64_evpc_rev_local (d
))
24423 else if (aarch64_evpc_rev_global (d
))
24425 else if (aarch64_evpc_ext (d
))
24427 else if (aarch64_evpc_dup (d
))
24429 else if (aarch64_evpc_zip (d
))
24431 else if (aarch64_evpc_uzp (d
))
24433 else if (aarch64_evpc_trn (d
))
24435 else if (aarch64_evpc_sel (d
))
24437 else if (aarch64_evpc_ins (d
))
24439 else if (aarch64_evpc_reencode (d
))
24442 if (d
->vec_flags
== VEC_SVE_DATA
)
24443 return aarch64_evpc_sve_tbl (d
);
24444 else if (d
->vec_flags
== VEC_ADVSIMD
)
24445 return aarch64_evpc_tbl (d
);
24449 if (aarch64_evpc_sve_dup (d
))
24456 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
24459 aarch64_vectorize_vec_perm_const (machine_mode vmode
, machine_mode op_mode
,
24460 rtx target
, rtx op0
, rtx op1
,
24461 const vec_perm_indices
&sel
)
24463 struct expand_vec_perm_d d
;
24465 /* Check whether the mask can be applied to a single vector. */
24466 if (sel
.ninputs () == 1
24467 || (op0
&& rtx_equal_p (op0
, op1
)))
24468 d
.one_vector_p
= true;
24469 else if (sel
.all_from_input_p (0))
24471 d
.one_vector_p
= true;
24474 else if (sel
.all_from_input_p (1))
24476 d
.one_vector_p
= true;
24480 d
.one_vector_p
= false;
24482 d
.perm
.new_vector (sel
.encoding (), d
.one_vector_p
? 1 : 2,
24483 sel
.nelts_per_input ());
24485 d
.vec_flags
= aarch64_classify_vector_mode (d
.vmode
);
24486 d
.op_mode
= op_mode
;
24487 d
.op_vec_flags
= aarch64_classify_vector_mode (d
.op_mode
);
24489 d
.op0
= op0
? force_reg (op_mode
, op0
) : NULL_RTX
;
24493 d
.op1
= op1
? force_reg (op_mode
, op1
) : NULL_RTX
;
24494 d
.testing_p
= !target
;
24497 return aarch64_expand_vec_perm_const_1 (&d
);
24499 rtx_insn
*last
= get_last_insn ();
24500 bool ret
= aarch64_expand_vec_perm_const_1 (&d
);
24501 gcc_assert (last
== get_last_insn ());
24506 /* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST. */
24509 aarch64_vectorize_can_special_div_by_constant (enum tree_code code
,
24510 tree vectype
, wide_int cst
,
24511 rtx
*output
, rtx in0
, rtx in1
)
24513 if (code
!= TRUNC_DIV_EXPR
24514 || !TYPE_UNSIGNED (vectype
))
24517 machine_mode mode
= TYPE_MODE (vectype
);
24518 unsigned int flags
= aarch64_classify_vector_mode (mode
);
24519 if ((flags
& VEC_ANY_SVE
) && !TARGET_SVE2
)
24522 int pow
= wi::exact_log2 (cst
+ 1);
24523 auto insn_code
= maybe_code_for_aarch64_bitmask_udiv3 (TYPE_MODE (vectype
));
24524 /* SVE actually has a div operator, we may have gotten here through
24526 if (pow
!= (int) (element_precision (vectype
) / 2)
24527 || insn_code
== CODE_FOR_nothing
)
24530 /* We can use the optimized pattern. */
24531 if (in0
== NULL_RTX
&& in1
== NULL_RTX
)
24534 gcc_assert (output
);
24536 expand_operand ops
[3];
24537 create_output_operand (&ops
[0], *output
, mode
);
24538 create_input_operand (&ops
[1], in0
, mode
);
24539 create_fixed_operand (&ops
[2], in1
);
24540 expand_insn (insn_code
, 3, ops
);
24541 *output
= ops
[0].value
;
/* Generate a byte permute mask for a register of mode MODE,
   which has NUNITS units.  */

rtx
aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
{
  /* We have to reverse each vector because we don't have
     a permuted load that can reverse-load according to ABI rules.  */
  rtx mask;
  rtvec v = rtvec_alloc (16);
  unsigned int i, j;
  unsigned int usize = GET_MODE_UNIT_SIZE (mode);

  gcc_assert (BYTES_BIG_ENDIAN);
  gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));

  for (i = 0; i < nunits; i++)
    for (j = 0; j < usize; j++)
      RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
  mask = gen_rtx_CONST_VECTOR (V16QImode, v);
  return force_reg (V16QImode, mask);
}
/* Expand an SVE integer comparison using the SVE equivalent of:

     (set TARGET (CODE OP0 OP1)).  */

void
aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
{
  machine_mode pred_mode = GET_MODE (target);
  machine_mode data_mode = GET_MODE (op0);
  rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
				      op0, op1);
  if (!rtx_equal_p (target, res))
    emit_move_insn (target, res);
}
/* Return the UNSPEC_COND_* code for comparison CODE.  */

static unsigned int
aarch64_unspec_cond_code (rtx_code code)
{
  switch (code)
    {
    case NE:
      return UNSPEC_COND_FCMNE;
    case EQ:
      return UNSPEC_COND_FCMEQ;
    case LT:
      return UNSPEC_COND_FCMLT;
    case GT:
      return UNSPEC_COND_FCMGT;
    case LE:
      return UNSPEC_COND_FCMLE;
    case GE:
      return UNSPEC_COND_FCMGE;
    case UNORDERED:
      return UNSPEC_COND_FCMUO;
    default:
      gcc_unreachable ();
    }
}
/* Emit the SVE equivalent of:

     (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))

   where <X> is the operation associated with comparison CODE.
   KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */

static void
aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
			  bool known_ptrue_p, rtx op0, rtx op1)
{
  rtx flag = gen_int_mode (known_ptrue_p, SImode);
  rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
			       gen_rtvec (4, pred, flag, op0, op1),
			       aarch64_unspec_cond_code (code));
  emit_set_insn (target, unspec);
}
/* Emit the SVE equivalent of:

     (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
     (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
     (set TARGET (ior:PRED_MODE TMP1 TMP2))

   where <Xi> is the operation associated with comparison CODEi.
   KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */

static void
aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
			      rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
{
  machine_mode pred_mode = GET_MODE (pred);
  rtx tmp1 = gen_reg_rtx (pred_mode);
  aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
  rtx tmp2 = gen_reg_rtx (pred_mode);
  aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
  aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
}
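/* This is how composite conditions are built further down: for example
   LTGT is handled as (FCMLT || FCMGT) and UNEQ, when trapping math is
   disabled, as (FCMUO || FCMEQ), both via this helper.  */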
/* Emit the SVE equivalent of:

     (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
     (set TARGET (not TMP))

   where <X> is the operation associated with comparison CODE.
   KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */

static void
aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
				 bool known_ptrue_p, rtx op0, rtx op1)
{
  machine_mode pred_mode = GET_MODE (pred);
  rtx tmp = gen_reg_rtx (pred_mode);
  aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
  aarch64_emit_unop (target, one_cmpl_optab, tmp);
}
24666 /* Expand an SVE floating-point comparison using the SVE equivalent of:
24668 (set TARGET (CODE OP0 OP1))
24670 If CAN_INVERT_P is true, the caller can also handle inverted results;
24671 return true if the result is in fact inverted. */
24674 aarch64_expand_sve_vec_cmp_float (rtx target
, rtx_code code
,
24675 rtx op0
, rtx op1
, bool can_invert_p
)
24677 machine_mode pred_mode
= GET_MODE (target
);
24678 machine_mode data_mode
= GET_MODE (op0
);
24680 rtx ptrue
= aarch64_ptrue_reg (pred_mode
);
24684 /* UNORDERED has no immediate form. */
24685 op1
= force_reg (data_mode
, op1
);
24694 /* There is native support for the comparison. */
24695 aarch64_emit_sve_fp_cond (target
, code
, ptrue
, true, op0
, op1
);
24700 /* This is a trapping operation (LT or GT). */
24701 aarch64_emit_sve_or_fp_conds (target
, LT
, GT
, ptrue
, true, op0
, op1
);
24705 if (!flag_trapping_math
)
24707 /* This would trap for signaling NaNs. */
24708 op1
= force_reg (data_mode
, op1
);
24709 aarch64_emit_sve_or_fp_conds (target
, UNORDERED
, EQ
,
24710 ptrue
, true, op0
, op1
);
24718 if (flag_trapping_math
)
24720 /* Work out which elements are ordered. */
24721 rtx ordered
= gen_reg_rtx (pred_mode
);
24722 op1
= force_reg (data_mode
, op1
);
24723 aarch64_emit_sve_invert_fp_cond (ordered
, UNORDERED
,
24724 ptrue
, true, op0
, op1
);
24726 /* Test the opposite condition for the ordered elements,
24727 then invert the result. */
24731 code
= reverse_condition_maybe_unordered (code
);
24734 aarch64_emit_sve_fp_cond (target
, code
,
24735 ordered
, false, op0
, op1
);
24738 aarch64_emit_sve_invert_fp_cond (target
, code
,
24739 ordered
, false, op0
, op1
);
24745 /* ORDERED has no immediate form. */
24746 op1
= force_reg (data_mode
, op1
);
24750 gcc_unreachable ();
24753 /* There is native support for the inverse comparison. */
24754 code
= reverse_condition_maybe_unordered (code
);
24757 aarch64_emit_sve_fp_cond (target
, code
, ptrue
, true, op0
, op1
);
24760 aarch64_emit_sve_invert_fp_cond (target
, code
, ptrue
, true, op0
, op1
);
24764 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
24765 of the data being selected and CMP_MODE is the mode of the values being
24769 aarch64_expand_sve_vcond (machine_mode data_mode
, machine_mode cmp_mode
,
24772 machine_mode pred_mode
= aarch64_get_mask_mode (cmp_mode
).require ();
24773 rtx pred
= gen_reg_rtx (pred_mode
);
24774 if (FLOAT_MODE_P (cmp_mode
))
24776 if (aarch64_expand_sve_vec_cmp_float (pred
, GET_CODE (ops
[3]),
24777 ops
[4], ops
[5], true))
24778 std::swap (ops
[1], ops
[2]);
24781 aarch64_expand_sve_vec_cmp_int (pred
, GET_CODE (ops
[3]), ops
[4], ops
[5]);
24783 if (!aarch64_sve_reg_or_dup_imm (ops
[1], data_mode
))
24784 ops
[1] = force_reg (data_mode
, ops
[1]);
24785 /* The "false" value can only be zero if the "true" value is a constant. */
24786 if (register_operand (ops
[1], data_mode
)
24787 || !aarch64_simd_reg_or_zero (ops
[2], data_mode
))
24788 ops
[2] = force_reg (data_mode
, ops
[2]);
24790 rtvec vec
= gen_rtvec (3, pred
, ops
[1], ops
[2]);
24791 emit_set_insn (ops
[0], gen_rtx_UNSPEC (data_mode
, vec
, UNSPEC_SEL
));
/* Implement TARGET_MODES_TIEABLE_P.  In principle we should always return
   true.  However due to issues with register allocation it is preferable
   to avoid tying integer scalar and FP scalar modes.  Executing integer
   operations in general registers is better than treating them as scalar
   vector operations.  This reduces latency and avoids redundant int<->FP
   moves.  So tie modes if they are either the same class, or vector modes
   with other vector modes, vector structs or any scalar mode.  */

static bool
aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
{
  if ((aarch64_advsimd_partial_struct_mode_p (mode1)
       != aarch64_advsimd_partial_struct_mode_p (mode2))
      && maybe_gt (GET_MODE_SIZE (mode1), 8)
      && maybe_gt (GET_MODE_SIZE (mode2), 8))
    return false;

  if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
    return true;

  /* We specifically want to allow elements of "structure" modes to
     be tieable to the structure.  This more general condition allows
     other rarer situations too.  The reason we don't extend this to
     predicate modes is that there are no predicate structure modes
     nor any specific instructions for extracting part of a predicate
     register.  */
  if (aarch64_vector_data_mode_p (mode1)
      && aarch64_vector_data_mode_p (mode2))
    return true;

  /* Also allow any scalar modes with vectors.  */
  if (aarch64_vector_mode_supported_p (mode1)
      || aarch64_vector_mode_supported_p (mode2))
    return true;

  return false;
}
/* Return a new RTX holding the result of moving POINTER forward by
   AMOUNT bytes.  */

static rtx
aarch64_move_pointer (rtx pointer, poly_int64 amount)
{
  rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);

  return adjust_automodify_address (pointer, GET_MODE (pointer),
				    next, amount);
}

/* Return a new RTX holding the result of moving POINTER forward by the
   size of the mode it points to.  */

static rtx
aarch64_progress_pointer (rtx pointer)
{
  return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
}
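/* E.g. progressing a DImode MEM moves its address forward by 8 bytes,
   while a V4SImode MEM is advanced by 16 bytes.  */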
24853 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
24857 aarch64_copy_one_block_and_progress_pointers (rtx
*src
, rtx
*dst
,
24860 /* Handle 256-bit memcpy separately. We do this by making 2 adjacent memory
24861 address copies using V4SImode so that we can use Q registers. */
24862 if (known_eq (GET_MODE_BITSIZE (mode
), 256))
24865 rtx reg1
= gen_reg_rtx (mode
);
24866 rtx reg2
= gen_reg_rtx (mode
);
24867 /* "Cast" the pointers to the correct mode. */
24868 *src
= adjust_address (*src
, mode
, 0);
24869 *dst
= adjust_address (*dst
, mode
, 0);
24870 /* Emit the memcpy. */
24871 emit_insn (aarch64_gen_load_pair (mode
, reg1
, *src
, reg2
,
24872 aarch64_progress_pointer (*src
)));
24873 emit_insn (aarch64_gen_store_pair (mode
, *dst
, reg1
,
24874 aarch64_progress_pointer (*dst
), reg2
));
24875 /* Move the pointers forward. */
24876 *src
= aarch64_move_pointer (*src
, 32);
24877 *dst
= aarch64_move_pointer (*dst
, 32);
24881 rtx reg
= gen_reg_rtx (mode
);
24883 /* "Cast" the pointers to the correct mode. */
24884 *src
= adjust_address (*src
, mode
, 0);
24885 *dst
= adjust_address (*dst
, mode
, 0);
24886 /* Emit the memcpy. */
24887 emit_move_insn (reg
, *src
);
24888 emit_move_insn (*dst
, reg
);
24889 /* Move the pointers forward. */
24890 *src
= aarch64_progress_pointer (*src
);
24891 *dst
= aarch64_progress_pointer (*dst
);
/* Expand a cpymem using the MOPS extension.  OPERANDS are taken
   from the cpymem pattern.  Return true iff we succeeded.  */
static bool
aarch64_expand_cpymem_mops (rtx *operands)
{
  if (!TARGET_MOPS)
    return false;

  /* All three registers are changed by the instruction, so each one
     must be a fresh pseudo.  */
  rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
  rtx src_addr = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
  rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
  rtx src_mem = replace_equiv_address (operands[1], src_addr);
  rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);
  emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));

  return true;
}
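/* The cpymemdi pattern expands to the MOPS CPYFP/CPYFM/CPYFE sequence,
   which is why the cost model below counts a MOPS copy as three
   instructions plus one move of the size into a register.  */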
24914 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
24915 we succeed, otherwise return false, indicating that a libcall to
24916 memcpy should be emitted. */
24919 aarch64_expand_cpymem (rtx
*operands
)
24922 rtx dst
= operands
[0];
24923 rtx src
= operands
[1];
24925 machine_mode cur_mode
= BLKmode
;
24927 /* Variable-sized memcpy can go through the MOPS expansion if available. */
24928 if (!CONST_INT_P (operands
[2]))
24929 return aarch64_expand_cpymem_mops (operands
);
24931 unsigned HOST_WIDE_INT size
= INTVAL (operands
[2]);
24933 /* Try to inline up to 256 bytes or use the MOPS threshold if available. */
24934 unsigned HOST_WIDE_INT max_copy_size
24935 = TARGET_MOPS
? aarch64_mops_memcpy_size_threshold
: 256;
24937 bool size_p
= optimize_function_for_size_p (cfun
);
24939 /* Large constant-sized cpymem should go through MOPS when possible.
24940 It should be a win even for size optimization in the general case.
24941 For speed optimization the choice between MOPS and the SIMD sequence
24942 depends on the size of the copy, rather than number of instructions,
24944 if (size
> max_copy_size
)
24945 return aarch64_expand_cpymem_mops (operands
);
24947 int copy_bits
= 256;
24949 /* Default to 256-bit LDP/STP on large copies, however small copies, no SIMD
24950 support or slow 256-bit LDP/STP fall back to 128-bit chunks. */
24953 || (aarch64_tune_params
.extra_tuning_flags
24954 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
))
24957 /* Emit an inline load+store sequence and count the number of operations
24958 involved. We use a simple count of just the loads and stores emitted
24959 rather than rtx_insn count as all the pointer adjustments and reg copying
24960 in this function will get optimized away later in the pipeline. */
24964 base
= copy_to_mode_reg (Pmode
, XEXP (dst
, 0));
24965 dst
= adjust_automodify_address (dst
, VOIDmode
, base
, 0);
24967 base
= copy_to_mode_reg (Pmode
, XEXP (src
, 0));
24968 src
= adjust_automodify_address (src
, VOIDmode
, base
, 0);
24970 /* Convert size to bits to make the rest of the code simpler. */
24971 int n
= size
* BITS_PER_UNIT
;
24975 /* Find the largest mode in which to do the copy in without over reading
24977 opt_scalar_int_mode mode_iter
;
24978 FOR_EACH_MODE_IN_CLASS (mode_iter
, MODE_INT
)
24979 if (GET_MODE_BITSIZE (mode_iter
.require ()) <= MIN (n
, copy_bits
))
24980 cur_mode
= mode_iter
.require ();
24982 gcc_assert (cur_mode
!= BLKmode
);
24984 mode_bits
= GET_MODE_BITSIZE (cur_mode
).to_constant ();
24986 /* Prefer Q-register accesses for the last bytes. */
24987 if (mode_bits
== 128 && copy_bits
== 256)
24988 cur_mode
= V4SImode
;
24990 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, cur_mode
);
24991 /* A single block copy is 1 load + 1 store. */
24995 /* Emit trailing copies using overlapping unaligned accesses
24996 (when !STRICT_ALIGNMENT) - this is smaller and faster. */
24997 if (n
> 0 && n
< copy_bits
/ 2 && !STRICT_ALIGNMENT
)
24999 machine_mode next_mode
= smallest_mode_for_size (n
, MODE_INT
);
25000 int n_bits
= GET_MODE_BITSIZE (next_mode
).to_constant ();
25001 gcc_assert (n_bits
<= mode_bits
);
25002 src
= aarch64_move_pointer (src
, (n
- n_bits
) / BITS_PER_UNIT
);
25003 dst
= aarch64_move_pointer (dst
, (n
- n_bits
) / BITS_PER_UNIT
);
25007 rtx_insn
*seq
= get_insns ();
25009 /* MOPS sequence requires 3 instructions for the memory copying + 1 to move
25010 the constant size into a register. */
25011 unsigned mops_cost
= 3 + 1;
25013 /* If MOPS is available at this point we don't consider the libcall as it's
25014 not a win even on code size. At this point only consider MOPS if
25015 optimizing for size. For speed optimizations we will have chosen between
25016 the two based on copy size already. */
25019 if (size_p
&& mops_cost
< nops
)
25020 return aarch64_expand_cpymem_mops (operands
);
25025 /* A memcpy libcall in the worst case takes 3 instructions to prepare the
25026 arguments + 1 for the call. When MOPS is not available and we're
25027 optimizing for size a libcall may be preferable. */
25028 unsigned libcall_cost
= 4;
25029 if (size_p
&& libcall_cost
< nops
)
25036 /* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
25037 SRC is a register we have created with the duplicated value to be set. */
25039 aarch64_set_one_block_and_progress_pointer (rtx src
, rtx
*dst
,
25042 /* If we are copying 128bits or 256bits, we can do that straight from
25043 the SIMD register we prepared. */
25044 if (known_eq (GET_MODE_BITSIZE (mode
), 256))
25046 mode
= GET_MODE (src
);
25047 /* "Cast" the *dst to the correct mode. */
25048 *dst
= adjust_address (*dst
, mode
, 0);
25049 /* Emit the memset. */
25050 emit_insn (aarch64_gen_store_pair (mode
, *dst
, src
,
25051 aarch64_progress_pointer (*dst
), src
));
25053 /* Move the pointers forward. */
25054 *dst
= aarch64_move_pointer (*dst
, 32);
25057 if (known_eq (GET_MODE_BITSIZE (mode
), 128))
25059 /* "Cast" the *dst to the correct mode. */
25060 *dst
= adjust_address (*dst
, GET_MODE (src
), 0);
25061 /* Emit the memset. */
25062 emit_move_insn (*dst
, src
);
25063 /* Move the pointers forward. */
25064 *dst
= aarch64_move_pointer (*dst
, 16);
25067 /* For copying less, we have to extract the right amount from src. */
25068 rtx reg
= lowpart_subreg (mode
, src
, GET_MODE (src
));
25070 /* "Cast" the *dst to the correct mode. */
25071 *dst
= adjust_address (*dst
, mode
, 0);
25072 /* Emit the memset. */
25073 emit_move_insn (*dst
, reg
);
25074 /* Move the pointer forward. */
25075 *dst
= aarch64_progress_pointer (*dst
);
/* Expand a setmem using the MOPS instructions.  OPERANDS are the same
   as for the setmem pattern.  Return true iff we succeed.  */
static bool
aarch64_expand_setmem_mops (rtx *operands)
{
  if (!TARGET_MOPS)
    return false;

  /* The first two registers are changed by the instruction, so both
     of them must be a fresh pseudo.  */
  rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
  rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
  rtx sz_reg = copy_to_mode_reg (DImode, operands[1]);
  rtx val = operands[2];
  if (val != CONST0_RTX (QImode))
    val = force_reg (QImode, val);
  emit_insn (gen_aarch64_setmemdi (dst_mem, val, sz_reg));
  return true;
}
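/* As with the cpymem case, the setmemdi pattern expands to the MOPS
   SETP/SETM/SETE sequence; a zero value can use XZR directly, which is
   why only non-zero values are forced into a register above.  */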
25098 /* Expand setmem, as if from a __builtin_memset. Return true if
25099 we succeed, otherwise return false. */
25102 aarch64_expand_setmem (rtx
*operands
)
25105 unsigned HOST_WIDE_INT len
;
25106 rtx dst
= operands
[0];
25107 rtx val
= operands
[2], src
;
25109 machine_mode cur_mode
= BLKmode
, next_mode
;
25111 /* If we don't have SIMD registers or the size is variable use the MOPS
25112 inlined sequence if possible. */
25113 if (!CONST_INT_P (operands
[1]) || !TARGET_SIMD
)
25114 return aarch64_expand_setmem_mops (operands
);
25116 bool size_p
= optimize_function_for_size_p (cfun
);
25118 /* Default the maximum to 256-bytes when considering only libcall vs
25119 SIMD broadcast sequence. */
25120 unsigned max_set_size
= 256;
25122 len
= INTVAL (operands
[1]);
25123 if (len
> max_set_size
&& !TARGET_MOPS
)
25126 int cst_val
= !!(CONST_INT_P (val
) && (INTVAL (val
) != 0));
25127 /* The MOPS sequence takes:
25128 3 instructions for the memory storing
25129 + 1 to move the constant size into a reg
25130 + 1 if VAL is a non-zero constant to move into a reg
25131 (zero constants can use XZR directly). */
25132 unsigned mops_cost
= 3 + 1 + cst_val
;
25133 /* A libcall to memset in the worst case takes 3 instructions to prepare
25134 the arguments + 1 for the call. */
25135 unsigned libcall_cost
= 4;
25137 /* Upper bound check. For large constant-sized setmem use the MOPS sequence
25140 && len
>= (unsigned HOST_WIDE_INT
) aarch64_mops_memset_size_threshold
)
25141 return aarch64_expand_setmem_mops (operands
);
25143 /* Attempt a sequence with a vector broadcast followed by stores.
25144 Count the number of operations involved to see if it's worth it
25145 against the alternatives. A simple counter simd_ops on the
25146 algorithmically-relevant operations is used rather than an rtx_insn count
25147 as all the pointer adjusmtents and mode reinterprets will be optimized
25150 unsigned simd_ops
= 0;
25152 base
= copy_to_mode_reg (Pmode
, XEXP (dst
, 0));
25153 dst
= adjust_automodify_address (dst
, VOIDmode
, base
, 0);
25155 /* Prepare the val using a DUP/MOVI v0.16B, val. */
25156 src
= expand_vector_broadcast (V16QImode
, val
);
25157 src
= force_reg (V16QImode
, src
);
25159 /* Convert len to bits to make the rest of the code simpler. */
25160 n
= len
* BITS_PER_UNIT
;
25162 /* Maximum amount to copy in one go. We allow 256-bit chunks based on the
25163 AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter. */
25164 const int copy_limit
= (aarch64_tune_params
.extra_tuning_flags
25165 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
)
25166 ? GET_MODE_BITSIZE (TImode
) : 256;
25170 /* Find the largest mode in which to do the copy without
25172 opt_scalar_int_mode mode_iter
;
25173 FOR_EACH_MODE_IN_CLASS (mode_iter
, MODE_INT
)
25174 if (GET_MODE_BITSIZE (mode_iter
.require ()) <= MIN (n
, copy_limit
))
25175 cur_mode
= mode_iter
.require ();
25177 gcc_assert (cur_mode
!= BLKmode
);
25179 mode_bits
= GET_MODE_BITSIZE (cur_mode
).to_constant ();
25180 aarch64_set_one_block_and_progress_pointer (src
, &dst
, cur_mode
);
25184 /* Do certain trailing copies as overlapping if it's going to be
25185 cheaper. i.e. less instructions to do so. For instance doing a 15
25186 byte copy it's more efficient to do two overlapping 8 byte copies than
25187 8 + 4 + 2 + 1. Only do this when -mstrict-align is not supplied. */
25188 if (n
> 0 && n
< copy_limit
/ 2 && !STRICT_ALIGNMENT
)
25190 next_mode
= smallest_mode_for_size (n
, MODE_INT
);
25191 int n_bits
= GET_MODE_BITSIZE (next_mode
).to_constant ();
25192 gcc_assert (n_bits
<= mode_bits
);
25193 dst
= aarch64_move_pointer (dst
, (n
- n_bits
) / BITS_PER_UNIT
);
25197 rtx_insn
*seq
= get_insns ();
25202 /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
25203 call to memset or the MOPS expansion. */
25205 && mops_cost
<= libcall_cost
25206 && mops_cost
<= simd_ops
)
25207 return aarch64_expand_setmem_mops (operands
);
25208 /* If MOPS is not available or not shorter pick a libcall if the SIMD
25209 sequence is too long. */
25210 else if (libcall_cost
< simd_ops
)
25216 /* At this point the SIMD broadcast sequence is the best choice when
25217 optimizing for speed. */
/* Split a DImode store of a CONST_INT SRC to MEM DST as two
   SImode stores.  Handle the case when the constant has identical
   bottom and top halves.  This is beneficial when the two stores can be
   merged into an STP and we avoid synthesising potentially expensive
   immediates twice.  Return true if such a split is possible.  */

bool
aarch64_split_dimode_const_store (rtx dst, rtx src)
{
  rtx lo = gen_lowpart (SImode, src);
  rtx hi = gen_highpart_mode (SImode, DImode, src);

  bool size_p = optimize_function_for_size_p (cfun);

  if (!rtx_equal_p (lo, hi))
    return false;

  unsigned int orig_cost
    = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
  unsigned int lo_cost
    = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);

  /* We want to transform:
     MOV	x1, 49370
     MOVK	x1, 0x140, lsl 16
     MOVK	x1, 0xc0da, lsl 32
     MOVK	x1, 0x140, lsl 48
     STR	x1, [x0]
   into:
     MOV	w1, 49370
     MOVK	w1, 0x140, lsl 16
     STP	w1, w1, [x0]
   So we want to perform this only when we save two instructions
   or more.  When optimizing for size, however, accept any code size
   savings we can.  */
  if (size_p && orig_cost <= lo_cost)
    return false;

  if (!size_p
      && (orig_cost <= lo_cost + 1))
    return false;

  rtx mem_lo = adjust_address (dst, SImode, 0);
  if (!aarch64_mem_pair_operand (mem_lo, SImode))
    return false;

  rtx tmp_reg = gen_reg_rtx (SImode);
  aarch64_expand_mov_immediate (tmp_reg, lo);
  rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
  /* Don't emit an explicit store pair as this may not be always profitable.
     Let the sched-fusion logic decide whether to merge them.  */
  emit_move_insn (mem_lo, tmp_reg);
  emit_move_insn (mem_hi, tmp_reg);

  return true;
}
/* Generate RTL for a conditional branch with rtx comparison CODE in
   mode CC_MODE.  The destination of the unlikely conditional branch
   is LABEL_REF.  */

void
aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
			      rtx label_ref)
{
  rtx x;
  x = gen_rtx_fmt_ee (code, VOIDmode,
		      gen_rtx_REG (cc_mode, CC_REGNUM),
		      const0_rtx);

  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
			    gen_rtx_LABEL_REF (VOIDmode, label_ref),
			    pc_rtx);
  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
}
25299 /* Generate DImode scratch registers for 128-bit (TImode) addition.
25301 OP1 represents the TImode destination operand 1
25302 OP2 represents the TImode destination operand 2
25303 LOW_DEST represents the low half (DImode) of TImode operand 0
25304 LOW_IN1 represents the low half (DImode) of TImode operand 1
25305 LOW_IN2 represents the low half (DImode) of TImode operand 2
25306 HIGH_DEST represents the high half (DImode) of TImode operand 0
25307 HIGH_IN1 represents the high half (DImode) of TImode operand 1
25308 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
25311 aarch64_addti_scratch_regs (rtx op1
, rtx op2
, rtx
*low_dest
,
25312 rtx
*low_in1
, rtx
*low_in2
,
25313 rtx
*high_dest
, rtx
*high_in1
,
25316 *low_dest
= gen_reg_rtx (DImode
);
25317 *low_in1
= gen_lowpart (DImode
, op1
);
25318 *low_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
25319 subreg_lowpart_offset (DImode
, TImode
));
25320 *high_dest
= gen_reg_rtx (DImode
);
25321 *high_in1
= gen_highpart (DImode
, op1
);
25322 *high_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
25323 subreg_highpart_offset (DImode
, TImode
));
25326 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
25328 This function differs from 'arch64_addti_scratch_regs' in that
25329 OP1 can be an immediate constant (zero). We must call
25330 subreg_highpart_offset with DImode and TImode arguments, otherwise
25331 VOIDmode will be used for the const_int which generates an internal
25332 error from subreg_size_highpart_offset which does not expect a size of zero.
25334 OP1 represents the TImode destination operand 1
25335 OP2 represents the TImode destination operand 2
25336 LOW_DEST represents the low half (DImode) of TImode operand 0
25337 LOW_IN1 represents the low half (DImode) of TImode operand 1
25338 LOW_IN2 represents the low half (DImode) of TImode operand 2
25339 HIGH_DEST represents the high half (DImode) of TImode operand 0
25340 HIGH_IN1 represents the high half (DImode) of TImode operand 1
25341 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
25345 aarch64_subvti_scratch_regs (rtx op1
, rtx op2
, rtx
*low_dest
,
25346 rtx
*low_in1
, rtx
*low_in2
,
25347 rtx
*high_dest
, rtx
*high_in1
,
25350 *low_dest
= gen_reg_rtx (DImode
);
25351 *low_in1
= simplify_gen_subreg (DImode
, op1
, TImode
,
25352 subreg_lowpart_offset (DImode
, TImode
));
25354 *low_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
25355 subreg_lowpart_offset (DImode
, TImode
));
25356 *high_dest
= gen_reg_rtx (DImode
);
25358 *high_in1
= simplify_gen_subreg (DImode
, op1
, TImode
,
25359 subreg_highpart_offset (DImode
, TImode
));
25360 *high_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
25361 subreg_highpart_offset (DImode
, TImode
));
25364 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
25366 OP0 represents the TImode destination operand 0
25367 LOW_DEST represents the low half (DImode) of TImode operand 0
25368 LOW_IN1 represents the low half (DImode) of TImode operand 1
25369 LOW_IN2 represents the low half (DImode) of TImode operand 2
25370 HIGH_DEST represents the high half (DImode) of TImode operand 0
25371 HIGH_IN1 represents the high half (DImode) of TImode operand 1
25372 HIGH_IN2 represents the high half (DImode) of TImode operand 2
25373 UNSIGNED_P is true if the operation is being performed on unsigned
25376 aarch64_expand_subvti (rtx op0
, rtx low_dest
, rtx low_in1
,
25377 rtx low_in2
, rtx high_dest
, rtx high_in1
,
25378 rtx high_in2
, bool unsigned_p
)
25380 if (low_in2
== const0_rtx
)
25382 low_dest
= low_in1
;
25383 high_in2
= force_reg (DImode
, high_in2
);
25385 emit_insn (gen_subdi3_compare1 (high_dest
, high_in1
, high_in2
));
25387 emit_insn (gen_subvdi_insn (high_dest
, high_in1
, high_in2
));
25391 if (aarch64_plus_immediate (low_in2
, DImode
))
25392 emit_insn (gen_subdi3_compare1_imm (low_dest
, low_in1
, low_in2
,
25393 GEN_INT (-UINTVAL (low_in2
))));
25396 low_in2
= force_reg (DImode
, low_in2
);
25397 emit_insn (gen_subdi3_compare1 (low_dest
, low_in1
, low_in2
));
25399 high_in2
= force_reg (DImode
, high_in2
);
25402 emit_insn (gen_usubdi3_carryinC (high_dest
, high_in1
, high_in2
));
25404 emit_insn (gen_subdi3_carryinV (high_dest
, high_in1
, high_in2
));
25407 emit_move_insn (gen_lowpart (DImode
, op0
), low_dest
);
25408 emit_move_insn (gen_highpart (DImode
, op0
), high_dest
);
/* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */

static unsigned HOST_WIDE_INT
aarch64_asan_shadow_offset (void)
{
  if (TARGET_ILP32)
    return (HOST_WIDE_INT_1 << 29);
  else
    return (HOST_WIDE_INT_1 << 36);
}
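/* With this offset the ASan runtime maps an address A to the shadow byte
   at (A >> 3) + (1 << 36) for LP64, or (A >> 3) + (1 << 29) for ILP32,
   matching the values expected by libsanitizer.  */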
25424 aarch64_gen_ccmp_first (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
,
25425 int code
, tree treeop0
, tree treeop1
)
25427 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
25429 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
25431 struct expand_operand ops
[4];
25434 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
25436 op_mode
= GET_MODE (op0
);
25437 if (op_mode
== VOIDmode
)
25438 op_mode
= GET_MODE (op1
);
25446 icode
= CODE_FOR_cmpsi
;
25451 icode
= CODE_FOR_cmpdi
;
25456 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
25457 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpesf
: CODE_FOR_fcmpsf
;
25462 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
25463 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpedf
: CODE_FOR_fcmpdf
;
25471 op0
= prepare_operand (icode
, op0
, 0, op_mode
, cmp_mode
, unsignedp
);
25472 op1
= prepare_operand (icode
, op1
, 1, op_mode
, cmp_mode
, unsignedp
);
25478 *prep_seq
= get_insns ();
25481 create_fixed_operand (&ops
[0], op0
);
25482 create_fixed_operand (&ops
[1], op1
);
25485 if (!maybe_expand_insn (icode
, 2, ops
))
25490 *gen_seq
= get_insns ();
25493 return gen_rtx_fmt_ee ((rtx_code
) code
, cc_mode
,
25494 gen_rtx_REG (cc_mode
, CC_REGNUM
), const0_rtx
);
static rtx
aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
                       int cmp_code, tree treeop0, tree treeop1, int bit_code)
{
  rtx op0, op1, target;
  machine_mode op_mode, cmp_mode, cc_mode = CCmode;
  int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
  insn_code icode;
  struct expand_operand ops[6];
  int aarch64_cond;

  push_to_sequence (*prep_seq);
  expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);

  op_mode = GET_MODE (op0);
  if (op_mode == VOIDmode)
    op_mode = GET_MODE (op1);

  switch (op_mode)
    {
    case E_SImode:
      cmp_mode = SImode;
      break;

    case E_DImode:
      cmp_mode = DImode;
      break;

    case E_SFmode:
      cmp_mode = SFmode;
      cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
      break;

    case E_DFmode:
      cmp_mode = DFmode;
      cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
      break;

    default:
      end_sequence ();
      return NULL_RTX;
    }

  icode = code_for_ccmp (cc_mode, cmp_mode);

  op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
  op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
  if (!op0 || !op1)
    {
      end_sequence ();
      return NULL_RTX;
    }
  *prep_seq = get_insns ();
  end_sequence ();

  target = gen_rtx_REG (cc_mode, CC_REGNUM);
  aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);

  if (bit_code != AND)
    {
      /* Treat the ccmp patterns as canonical and use them where possible,
         but fall back to ccmp_rev patterns if there's no other option.  */
      rtx_code prev_code = GET_CODE (prev);
      machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
      if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
          && !(prev_code == EQ
               || prev_code == NE
               || prev_code == ORDERED
               || prev_code == UNORDERED))
        icode = code_for_ccmp_rev (cc_mode, cmp_mode);
      else
        {
          rtx_code code = reverse_condition (prev_code);
          prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
        }
      aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
    }

  create_fixed_operand (&ops[0], XEXP (prev, 0));
  create_fixed_operand (&ops[1], target);
  create_fixed_operand (&ops[2], op0);
  create_fixed_operand (&ops[3], op1);
  create_fixed_operand (&ops[4], prev);
  create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));

  push_to_sequence (*gen_seq);
  if (!maybe_expand_insn (icode, 6, ops))
    {
      end_sequence ();
      return NULL_RTX;
    }

  *gen_seq = get_insns ();
  end_sequence ();

  return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
}
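/* Illustrative example (not from the original source): for C code such as
       if (a == 0 && b > 5) ...
   the two hooks below allow the middle end to build a conditional-compare
   sequence along the lines of
       cmp   w0, #0
       ccmp  w1, #5, #4, eq
       b.gt  .L2
   where the #4 (Z set) failure flags make the final GT test fail whenever
   the first comparison already failed.  The exact instructions depend on
   the modes and conditions involved.  */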
#undef TARGET_GEN_CCMP_FIRST
#define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first

#undef TARGET_GEN_CCMP_NEXT
#define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next

/* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
   instruction fusion of some sort.  */

static bool
aarch64_macro_fusion_p (void)
{
  return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
}
/* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
   should be kept together during scheduling.  */

static bool
aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
{
  rtx prev_set = single_set (prev);
  rtx curr_set = single_set (curr);
  /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
  bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);

  if (!aarch64_macro_fusion_p ())
    return false;

  if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
    {
      /* We are trying to match:
         prev (mov)  == (set (reg r0) (const_int imm16))
         curr (movk) == (set (zero_extract (reg r0)
                                           (const_int 16)
                                           (const_int 16))
                             (const_int imm16_1))  */
      rtx set_dest = SET_DEST (curr_set);

      if (GET_CODE (set_dest) == ZERO_EXTRACT
          && CONST_INT_P (SET_SRC (curr_set))
          && CONST_INT_P (SET_SRC (prev_set))
          && CONST_INT_P (XEXP (set_dest, 2))
          && INTVAL (XEXP (set_dest, 2)) == 16
          && REG_P (XEXP (set_dest, 0))
          && REG_P (SET_DEST (prev_set))
          && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
        return true;
    }

  if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
    {
      /* We're trying to match:
         prev (adrp) == (set (reg r1)
                             (high (symbol_ref ("SYM"))))
         curr (add)  == (set (reg r0)
                             (lo_sum (reg r1)
                                     (symbol_ref ("SYM"))))
         Note that r0 need not necessarily be the same as r1, especially
         during pre-regalloc scheduling.  */
      if (satisfies_constraint_Ush (SET_SRC (prev_set))
          && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
        {
          if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
              && REG_P (XEXP (SET_SRC (curr_set), 0))
              && REGNO (XEXP (SET_SRC (curr_set), 0))
                 == REGNO (SET_DEST (prev_set))
              && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
                              XEXP (SET_SRC (curr_set), 1)))
            return true;
        }
    }

  if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
    {
      /* We're trying to match:
         prev (movk) == (set (zero_extract (reg r0)
                                           (const_int 16)
                                           (const_int 32))
                             (const_int imm16_1))
         curr (movk) == (set (zero_extract (reg r0)
                                           (const_int 16)
                                           (const_int 48))
                             (const_int imm16_2))  */
      if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
          && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
          && REG_P (XEXP (SET_DEST (prev_set), 0))
          && REG_P (XEXP (SET_DEST (curr_set), 0))
          && REGNO (XEXP (SET_DEST (prev_set), 0))
             == REGNO (XEXP (SET_DEST (curr_set), 0))
          && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
          && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
          && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
          && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
          && CONST_INT_P (SET_SRC (prev_set))
          && CONST_INT_P (SET_SRC (curr_set)))
        return true;
    }

  if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
    {
      /* We're trying to match:
         prev (adrp) == (set (reg r0)
                             (high (symbol_ref ("SYM"))))
         curr (ldr)  == (set (reg r1)
                             (mem (lo_sum (reg r0)
                                          (symbol_ref ("SYM")))))
         or
         curr (ldr)  == (set (reg r1)
                             (zero_extend (mem (lo_sum (reg r0)
                                                       (symbol_ref ("SYM"))))))  */
      if (satisfies_constraint_Ush (SET_SRC (prev_set))
          && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
        {
          rtx curr_src = SET_SRC (curr_set);

          if (GET_CODE (curr_src) == ZERO_EXTEND)
            curr_src = XEXP (curr_src, 0);

          if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
              && REG_P (XEXP (XEXP (curr_src, 0), 0))
              && REGNO (XEXP (XEXP (curr_src, 0), 0))
                 == REGNO (SET_DEST (prev_set))
              && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
                              XEXP (SET_SRC (prev_set), 0)))
            return true;
        }
    }

  /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch.  */
  if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
      && prev_set && curr_set && any_condjump_p (curr)
      && GET_CODE (SET_SRC (prev_set)) == COMPARE
      && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
      && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
    return true;

  /* Fuse flag-setting ALU instructions and conditional branch.  */
  if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
      && any_condjump_p (curr))
    {
      unsigned int condreg1, condreg2;
      rtx cc_reg_1;
      aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
      cc_reg_1 = gen_rtx_REG (CCmode, condreg1);

      if (reg_referenced_p (cc_reg_1, PATTERN (curr))
          && modified_in_p (cc_reg_1, prev))
        {
          enum attr_type prev_type = get_attr_type (prev);

          /* FIXME: this misses some instructions that ThunderX considers
             simple arithmetic.  Simple shifts are missed here.  */
          if (prev_type == TYPE_ALUS_SREG
              || prev_type == TYPE_ALUS_IMM
              || prev_type == TYPE_LOGICS_REG
              || prev_type == TYPE_LOGICS_IMM)
            return true;
        }
    }

  /* Fuse ALU instructions and CBZ/CBNZ.  */
  if (prev_set && curr_set
      && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
      && any_condjump_p (curr))
    {
      /* We're trying to match:
         prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
         curr (cbz)      == (set (pc) (if_then_else (eq/ne) (r0)
                                                    (label_ref ("SYM"))
                                                    (pc)))  */
      if (SET_DEST (curr_set) == (pc_rtx)
          && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
          && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
          && REG_P (SET_DEST (prev_set))
          && REGNO (SET_DEST (prev_set))
             == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
        {
          /* Fuse ALU operations followed by conditional branch instruction.  */
          switch (get_attr_type (prev))
            {
            case TYPE_ALU_SREG:
            case TYPE_ADCS_REG:
            case TYPE_ADCS_IMM:
            case TYPE_LOGIC_REG:
            case TYPE_LOGIC_IMM:
            case TYPE_SHIFT_REG:
            case TYPE_SHIFT_IMM:
              return true;

            default:
              break;
            }
        }
    }

  /* Fuse A+B+1 and A-B-1.  */
  if (simple_sets_p
      && aarch64_fusion_enabled_p (AARCH64_FUSE_ADDSUB_2REG_CONST1))
    {
      /* We're trying to match:
         prev == (set (r0) (plus (r0) (r1)))
         curr == (set (r0) (plus (r0) (const_int 1)))
         or:
         prev == (set (r0) (minus (r0) (r1)))
         curr == (set (r0) (plus (r0) (const_int -1)))  */
      rtx prev_src = SET_SRC (prev_set);
      rtx curr_src = SET_SRC (curr_set);

      int polarity = 1;
      if (GET_CODE (prev_src) == MINUS)
        polarity = -1;

      if (GET_CODE (curr_src) == PLUS
          && (GET_CODE (prev_src) == PLUS || GET_CODE (prev_src) == MINUS)
          && CONST_INT_P (XEXP (curr_src, 1))
          && INTVAL (XEXP (curr_src, 1)) == polarity
          && REG_P (XEXP (curr_src, 0))
          && REGNO (SET_DEST (prev_set)) == REGNO (XEXP (curr_src, 0)))
        return true;
    }

  return false;
}

/* Return true iff the instruction fusion described by OP is enabled.  */

bool
aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
{
  return (aarch64_tune_params.fusible_ops & op) != 0;
}
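/* Illustrative examples (not from the original source) of pairs the checks
   above try to keep adjacent so that cores which fuse them can do so:
     AARCH64_FUSE_MOV_MOVK:   mov  x0, #0x1234   +  movk x0, #0x5678, lsl #16
     AARCH64_FUSE_ADRP_ADD:   adrp x1, sym       +  add  x0, x1, :lo12:sym
     AARCH64_FUSE_ADRP_LDR:   adrp x0, sym       +  ldr  x1, [x0, :lo12:sym]
     AARCH64_FUSE_CMP_BRANCH: cmp  x0, #3        +  b.eq .L5  */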
/* If MEM is in the form of [base+offset], extract the two parts
   of address and set to BASE and OFFSET, otherwise return false
   after clearing BASE and OFFSET.  */

bool
extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
{
  rtx addr;

  gcc_assert (MEM_P (mem));

  addr = XEXP (mem, 0);

  if (REG_P (addr))
    {
      *base = addr;
      *offset = const0_rtx;
      return true;
    }

  if (GET_CODE (addr) == PLUS
      && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
    {
      *base = XEXP (addr, 0);
      *offset = XEXP (addr, 1);
      return true;
    }

  *base = NULL_RTX;
  *offset = NULL_RTX;

  return false;
}
/* Types for scheduling fusion.  */
enum sched_fusion_type
{
  SCHED_FUSION_NONE = 0,
  SCHED_FUSION_LD_SIGN_EXTEND,
  SCHED_FUSION_LD_ZERO_EXTEND,
  SCHED_FUSION_LD,
  SCHED_FUSION_ST,
  SCHED_FUSION_NUM
};
/* If INSN is a load or store of address in the form of [base+offset],
   extract the two parts and set to BASE and OFFSET.  Return scheduling
   fusion type this INSN is.  */

static enum sched_fusion_type
fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
{
  rtx x, dest, src;
  enum sched_fusion_type fusion = SCHED_FUSION_LD;

  gcc_assert (INSN_P (insn));
  x = PATTERN (insn);
  if (GET_CODE (x) != SET)
    return SCHED_FUSION_NONE;

  src = SET_SRC (x);
  dest = SET_DEST (x);

  machine_mode dest_mode = GET_MODE (dest);

  if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
    return SCHED_FUSION_NONE;

  if (GET_CODE (src) == SIGN_EXTEND)
    {
      fusion = SCHED_FUSION_LD_SIGN_EXTEND;
      src = XEXP (src, 0);
      if (!MEM_P (src) || GET_MODE (src) != SImode)
        return SCHED_FUSION_NONE;
    }
  else if (GET_CODE (src) == ZERO_EXTEND)
    {
      fusion = SCHED_FUSION_LD_ZERO_EXTEND;
      src = XEXP (src, 0);
      if (!MEM_P (src) || GET_MODE (src) != SImode)
        return SCHED_FUSION_NONE;
    }

  if (MEM_P (src) && REG_P (dest))
    extract_base_offset_in_addr (src, base, offset);
  else if (MEM_P (dest) && (REG_P (src) || src == const0_rtx))
    {
      fusion = SCHED_FUSION_ST;
      extract_base_offset_in_addr (dest, base, offset);
    }
  else
    return SCHED_FUSION_NONE;

  if (*base == NULL_RTX || *offset == NULL_RTX)
    fusion = SCHED_FUSION_NONE;

  return fusion;
}
/* Implement the TARGET_SCHED_FUSION_PRIORITY hook.

   Currently we only support to fuse ldr or str instructions, so FUSION_PRI
   and PRI are only calculated for these instructions.  For other
   instructions, FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the
   future, other kinds of instruction fusion can be added by returning
   different priorities.

   It's important that irrelevant instructions get the largest FUSION_PRI.  */

static void
aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
                               int *fusion_pri, int *pri)
{
  int tmp, off_val;
  rtx base, offset;
  enum sched_fusion_type fusion;

  gcc_assert (INSN_P (insn));

  tmp = max_pri - 1;
  fusion = fusion_load_store (insn, &base, &offset);
  if (fusion == SCHED_FUSION_NONE)
    {
      *pri = tmp;
      *fusion_pri = tmp;
      return;
    }

  /* Set FUSION_PRI according to fusion type and base register.  */
  *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);

  /* Calculate PRI.  */
  tmp /= 2;

  /* INSN with smaller offset goes first.  */
  off_val = (int)(INTVAL (offset));
  if (off_val >= 0)
    tmp -= (off_val & 0xfffff);
  else
    tmp += ((- off_val) & 0xfffff);

  *pri = tmp;
  return;
}
/* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
   Adjust priority of sha1h instructions so they are scheduled before
   other SHA1 instructions.  */

static int
aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
{
  rtx x = PATTERN (insn);

  if (GET_CODE (x) == SET)
    {
      x = SET_SRC (x);
      if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
        return priority + 10;
    }

  return priority;
}
/* If REVERSED is null, return true if memory reference *MEM2 comes
   immediately after memory reference *MEM1.  Do not change the references
   in either case.

   Otherwise, check if *MEM1 and *MEM2 are consecutive memory references and,
   if they are, try to make them use constant offsets from the same base
   register.  Return true on success.  When returning true, set *REVERSED
   to true if *MEM1 comes after *MEM2, false if *MEM1 comes before *MEM2.  */
static bool
aarch64_check_consecutive_mems (rtx *mem1, rtx *mem2, bool *reversed)
{
  if (reversed)
    *reversed = false;

  if (GET_RTX_CLASS (GET_CODE (XEXP (*mem1, 0))) == RTX_AUTOINC
      || GET_RTX_CLASS (GET_CODE (XEXP (*mem2, 0))) == RTX_AUTOINC)
    return false;

  if (!MEM_SIZE_KNOWN_P (*mem1) || !MEM_SIZE_KNOWN_P (*mem2))
    return false;

  auto size1 = MEM_SIZE (*mem1);
  auto size2 = MEM_SIZE (*mem2);

  rtx base1, base2, offset1, offset2;
  extract_base_offset_in_addr (*mem1, &base1, &offset1);
  extract_base_offset_in_addr (*mem2, &base2, &offset2);

  /* Make sure at least one memory is in base+offset form.  */
  if (!(base1 && offset1) && !(base2 && offset2))
    return false;

  /* If both mems already use the same base register, just check the
     offsets.  */
  if (base1 && base2 && rtx_equal_p (base1, base2))
    {
      if (!offset1 || !offset2)
        return false;

      if (known_eq (UINTVAL (offset1) + size1, UINTVAL (offset2)))
        return true;

      if (known_eq (UINTVAL (offset2) + size2, UINTVAL (offset1)) && reversed)
        {
          *reversed = true;
          return true;
        }

      return false;
    }

  /* Otherwise, check whether the MEM_EXPRs and MEM_OFFSETs together
     guarantee that the values are consecutive.  */
  if (MEM_EXPR (*mem1)
      && MEM_EXPR (*mem2)
      && MEM_OFFSET_KNOWN_P (*mem1)
      && MEM_OFFSET_KNOWN_P (*mem2))
    {
      poly_int64 expr_offset1;
      poly_int64 expr_offset2;
      tree expr_base1 = get_addr_base_and_unit_offset (MEM_EXPR (*mem1),
                                                       &expr_offset1);
      tree expr_base2 = get_addr_base_and_unit_offset (MEM_EXPR (*mem2),
                                                       &expr_offset2);
      if (!expr_base1
          || !expr_base2
          || !DECL_P (expr_base1)
          || !operand_equal_p (expr_base1, expr_base2, OEP_ADDRESS_OF))
        return false;

      expr_offset1 += MEM_OFFSET (*mem1);
      expr_offset2 += MEM_OFFSET (*mem2);

      if (known_eq (expr_offset1 + size1, expr_offset2))
        ;
      else if (known_eq (expr_offset2 + size2, expr_offset1) && reversed)
        *reversed = true;
      else
        return false;

      if (base2 && offset2)
        {
          rtx addr1 = plus_constant (Pmode, XEXP (*mem2, 0),
                                     expr_offset1 - expr_offset2);
          *mem1 = replace_equiv_address_nv (*mem1, addr1);
        }
      else
        {
          rtx addr2 = plus_constant (Pmode, XEXP (*mem1, 0),
                                     expr_offset2 - expr_offset1);
          *mem2 = replace_equiv_address_nv (*mem2, addr2);
        }
      return true;
    }

  return false;
}
/* Return true if MEM1 and MEM2 can be combined into a single access
   of mode MODE, with the combined access having the same address as MEM1.  */

bool
aarch64_mergeable_load_pair_p (machine_mode mode, rtx mem1, rtx mem2)
{
  if (STRICT_ALIGNMENT && MEM_ALIGN (mem1) < GET_MODE_ALIGNMENT (mode))
    return false;
  return aarch64_check_consecutive_mems (&mem1, &mem2, nullptr);
}
/* Given OPERANDS of consecutive load/store, check if we can merge
   them into ldp/stp.  LOAD is true if they are load instructions.
   MODE is the mode of memory operands.  */

bool
aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
                                machine_mode mode)
{
  enum reg_class rclass_1, rclass_2;
  rtx mem_1, mem_2, reg_1, reg_2;

  if (load)
    {
      mem_1 = operands[1];
      mem_2 = operands[3];
      reg_1 = operands[0];
      reg_2 = operands[2];
      gcc_assert (REG_P (reg_1) && REG_P (reg_2));
      if (REGNO (reg_1) == REGNO (reg_2))
        return false;
      if (reg_overlap_mentioned_p (reg_1, mem_2))
        return false;
    }
  else
    {
      mem_1 = operands[0];
      mem_2 = operands[2];
      reg_1 = operands[1];
      reg_2 = operands[3];
    }

  /* The mems cannot be volatile.  */
  if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
    return false;

  /* If we have SImode and slow unaligned ldp,
     check the alignment to be at least 8 byte.  */
  if (mode == SImode
      && (aarch64_tune_params.extra_tuning_flags
          & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
      && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
    return false;

  /* Check if the addresses are in the form of [base+offset].  */
  bool reversed = false;
  if (!aarch64_check_consecutive_mems (&mem_1, &mem_2, &reversed))
    return false;

  /* The operands must be of the same size.  */
  gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
                        GET_MODE_SIZE (GET_MODE (mem_2))));

  /* One of the memory accesses must be a mempair operand.
     If it is not the first one, they need to be swapped by the
     peephole.  */
  if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
      && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
    return false;

  if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
    rclass_1 = FP_REGS;
  else
    rclass_1 = GENERAL_REGS;

  if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
    rclass_2 = FP_REGS;
  else
    rclass_2 = GENERAL_REGS;

  /* Check if the registers are of same class.  */
  if (rclass_1 != rclass_2)
    return false;

  return true;
}
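/* Illustrative example (not from the original source): the ldp/stp peepholes
   that call the function above turn, e.g.,
       ldr  x0, [x3]
       ldr  x1, [x3, 8]
   into
       ldp  x0, x1, [x3]
   provided the checks on volatility, register classes, offsets and
   alignment all pass.  */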
/* Given OPERANDS of consecutive load/store that can be merged,
   swap them if they are not in ascending order.  */
void
aarch64_swap_ldrstr_operands (rtx *operands, bool load)
{
  int mem_op = load ? 1 : 0;
  bool reversed = false;
  if (!aarch64_check_consecutive_mems (operands + mem_op,
                                       operands + mem_op + 2, &reversed))
    gcc_unreachable ();

  if (reversed)
    {
      /* Irrespective of whether this is a load or a store,
         we do the same swap.  */
      std::swap (operands[0], operands[2]);
      std::swap (operands[1], operands[3]);
    }
}
/* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
   comparison between the two.  */

static int
aarch64_host_wide_int_compare (const void *x, const void *y)
{
  return wi::cmps (* ((const HOST_WIDE_INT *) x),
                   * ((const HOST_WIDE_INT *) y));
}

/* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
   other pointing to a REG rtx containing an offset, compare the offsets
   of the two pairs.

   Return:

        1 iff offset (X) > offset (Y)
        0 iff offset (X) == offset (Y)
        -1 iff offset (X) < offset (Y)  */

static int
aarch64_ldrstr_offset_compare (const void *x, const void *y)
{
  const rtx * operands_1 = (const rtx *) x;
  const rtx * operands_2 = (const rtx *) y;
  rtx mem_1, mem_2, base, offset_1, offset_2;

  if (MEM_P (operands_1[0]))
    mem_1 = operands_1[0];
  else
    mem_1 = operands_1[1];

  if (MEM_P (operands_2[0]))
    mem_2 = operands_2[0];
  else
    mem_2 = operands_2[1];

  /* Extract the offsets.  */
  extract_base_offset_in_addr (mem_1, &base, &offset_1);
  extract_base_offset_in_addr (mem_2, &base, &offset_2);

  gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);

  return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
}
/* Given OPERANDS of consecutive load/store, check if we can merge
   them into ldp/stp by adjusting the offset.  LOAD is true if they
   are load instructions.  MODE is the mode of memory operands.

   Given below consecutive stores:

     str  w1, [xb, 0x100]
     str  w1, [xb, 0x104]
     str  w1, [xb, 0x108]
     str  w1, [xb, 0x10c]

   Though the offsets are out of the range supported by stp, we can
   still pair them after adjusting the offset, like:

     add  scratch, xb, 0x100
     stp  w1, w1, [scratch]
     stp  w1, w1, [scratch, 0x8]

   The peephole patterns detecting this opportunity should guarantee
   the scratch register is available.  */

bool
aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
                                       machine_mode mode)
{
  const int num_insns = 4;
  enum reg_class rclass;
  HOST_WIDE_INT offvals[num_insns], msize;
  rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];

  if (load)
    {
      for (int i = 0; i < num_insns; i++)
        {
          reg[i] = operands[2 * i];
          mem[i] = operands[2 * i + 1];

          gcc_assert (REG_P (reg[i]));
        }

      /* Do not attempt to merge the loads if the loads clobber each other.  */
      for (int i = 0; i < 8; i += 2)
        for (int j = i + 2; j < 8; j += 2)
          if (reg_overlap_mentioned_p (operands[i], operands[j]))
            return false;
    }
  else
    for (int i = 0; i < num_insns; i++)
      {
        mem[i] = operands[2 * i];
        reg[i] = operands[2 * i + 1];
      }

  /* Skip if memory operand is by itself valid for ldp/stp.  */
  if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
    return false;

  for (int i = 0; i < num_insns; i++)
    {
      /* The mems cannot be volatile.  */
      if (MEM_VOLATILE_P (mem[i]))
        return false;

      /* Check if the addresses are in the form of [base+offset].  */
      extract_base_offset_in_addr (mem[i], base + i, offset + i);
      if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
        return false;
    }

  /* Check if the registers are of same class.  */
  rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
    ? FP_REGS : GENERAL_REGS;

  for (int i = 1; i < num_insns; i++)
    if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
      {
        if (rclass != FP_REGS)
          return false;
      }
    else
      {
        if (rclass != GENERAL_REGS)
          return false;
      }

  /* Only the last register in the order in which they occur
     may be clobbered by the load.  */
  if (rclass == GENERAL_REGS && load)
    for (int i = 0; i < num_insns - 1; i++)
      if (reg_mentioned_p (reg[i], mem[i]))
        return false;

  /* Check if the bases are same.  */
  for (int i = 0; i < num_insns - 1; i++)
    if (!rtx_equal_p (base[i], base[i + 1]))
      return false;

  for (int i = 0; i < num_insns; i++)
    offvals[i] = INTVAL (offset[i]);

  msize = GET_MODE_SIZE (mode).to_constant ();

  /* Check if the offsets can be put in the right order to do a ldp/stp.  */
  qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
         aarch64_host_wide_int_compare);

  if (!(offvals[1] == offvals[0] + msize
        && offvals[3] == offvals[2] + msize))
    return false;

  /* Check that offsets are within range of each other.  The ldp/stp
     instructions have 7 bit immediate offsets, so use 0x80.  */
  if (offvals[2] - offvals[0] >= msize * 0x80)
    return false;

  /* The offsets must be aligned with respect to each other.  */
  if (offvals[0] % msize != offvals[2] % msize)
    return false;

  /* If we have SImode and slow unaligned ldp,
     check the alignment to be at least 8 byte.  */
  if (mode == SImode
      && (aarch64_tune_params.extra_tuning_flags
          & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
      && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
    return false;

  return true;
}
/* Given OPERANDS of consecutive load/store, this function pairs them
   into LDP/STP after adjusting the offset.  It depends on the fact
   that the operands can be sorted so the offsets are correct for STP.
   MODE is the mode of memory operands.  CODE is the rtl operator
   which should be applied to all memory operands, it's SIGN_EXTEND,
   ZERO_EXTEND or UNKNOWN.  */

bool
aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
                             machine_mode mode, RTX_CODE code)
{
  rtx base, offset_1, offset_3, t1, t2;
  rtx mem_1, mem_2, mem_3, mem_4;
  rtx temp_operands[8];
  HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
                stp_off_upper_limit, stp_off_lower_limit, msize;

  /* We make changes on a copy as we may still bail out.  */
  for (int i = 0; i < 8; i ++)
    temp_operands[i] = operands[i];

  /* Sort the operands.  Two stores to the same offset must keep their
     original order, so we need stable sorting; otherwise the wrong data
     may end up at that offset.  Also note such a dead store should be
     optimized away, but there are no guarantees here.  */
  gcc_stablesort (temp_operands, 4, 2 * sizeof (rtx *),
                  aarch64_ldrstr_offset_compare);

  /* Copy the memory operands so that if we have to bail for some
     reason the original addresses are unchanged.  */
  if (load)
    {
      mem_1 = copy_rtx (temp_operands[1]);
      mem_2 = copy_rtx (temp_operands[3]);
      mem_3 = copy_rtx (temp_operands[5]);
      mem_4 = copy_rtx (temp_operands[7]);
    }
  else
    {
      mem_1 = copy_rtx (temp_operands[0]);
      mem_2 = copy_rtx (temp_operands[2]);
      mem_3 = copy_rtx (temp_operands[4]);
      mem_4 = copy_rtx (temp_operands[6]);
      gcc_assert (code == UNKNOWN);
    }

  extract_base_offset_in_addr (mem_1, &base, &offset_1);
  extract_base_offset_in_addr (mem_3, &base, &offset_3);
  gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
              && offset_3 != NULL_RTX);

  /* Adjust offset so it can fit in LDP/STP instruction.  */
  msize = GET_MODE_SIZE (mode).to_constant();
  stp_off_upper_limit = msize * (0x40 - 1);
  stp_off_lower_limit = - msize * 0x40;

  off_val_1 = INTVAL (offset_1);
  off_val_3 = INTVAL (offset_3);

  /* The base offset is optimally half way between the two STP/LDP offsets.  */
  if (msize <= 4)
    base_off = (off_val_1 + off_val_3) / 2;
  else
    /* However, due to issues with negative LDP/STP offset generation for
       larger modes, for DF, DD, DI and vector modes, we must not use negative
       addresses smaller than 9 signed unadjusted bits can store.  This
       provides the most range in this case.  */
    base_off = off_val_1;

  /* Adjust the base so that it is aligned with the addresses but still
     optimal.  */
  if (base_off % msize != off_val_1 % msize)
    /* Fix the offset, bearing in mind we want to make it bigger not
       smaller.  */
    base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
  else if (msize <= 4)
    /* The negative range of LDP/STP is one larger than the positive range.  */
    base_off += msize;

  /* Check if base offset is too big or too small.  We can attempt to resolve
     this issue by setting it to the maximum value and seeing if the offsets
     still fit.  */
  if (base_off >= 0x1000)
    {
      base_off = 0x1000 - 1;
      /* We must still make sure that the base offset is aligned with respect
         to the address.  But it may not be made any bigger.  */
      base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
    }

  /* Likewise for the case where the base is too small.  */
  if (base_off <= -0x1000)
    {
      base_off = -0x1000 + 1;
      base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
    }

  /* Offset of the first STP/LDP.  */
  new_off_1 = off_val_1 - base_off;

  /* Offset of the second STP/LDP.  */
  new_off_3 = off_val_3 - base_off;

  /* The offsets must be within the range of the LDP/STP instructions.  */
  if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
      || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
    return false;

  replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
                                                  new_off_1), true);
  replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
                                                  new_off_1 + msize), true);
  replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
                                                  new_off_3), true);
  replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
                                                  new_off_3 + msize), true);

  if (!aarch64_mem_pair_operand (mem_1, mode)
      || !aarch64_mem_pair_operand (mem_3, mode))
    return false;

  if (code == ZERO_EXTEND)
    {
      mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
      mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
      mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
      mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
    }
  else if (code == SIGN_EXTEND)
    {
      mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
      mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
      mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
      mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
    }

  if (load)
    {
      operands[0] = temp_operands[0];
      operands[1] = mem_1;
      operands[2] = temp_operands[2];
      operands[3] = mem_2;
      operands[4] = temp_operands[4];
      operands[5] = mem_3;
      operands[6] = temp_operands[6];
      operands[7] = mem_4;
    }
  else
    {
      operands[0] = mem_1;
      operands[1] = temp_operands[1];
      operands[2] = mem_2;
      operands[3] = temp_operands[3];
      operands[4] = mem_3;
      operands[5] = temp_operands[5];
      operands[6] = mem_4;
      operands[7] = temp_operands[7];
    }

  /* Emit adjusting instruction.  */
  emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
  /* Emit ldp/stp instructions.  */
  t1 = gen_rtx_SET (operands[0], operands[1]);
  t2 = gen_rtx_SET (operands[2], operands[3]);
  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
  t1 = gen_rtx_SET (operands[4], operands[5]);
  t2 = gen_rtx_SET (operands[6], operands[7]);
  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));

  return true;
}
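/* Illustrative walk-through (not from the original source), reusing the
   SImode example from the comment above aarch64_operands_adjust_ok_for_ldpstp
   (offsets 0x100, 0x104, 0x108, 0x10c, msize == 4):
     off_val_1 = 0x100, off_val_3 = 0x108
     base_off  = (0x100 + 0x108) / 2 = 0x104, then bumped by msize to 0x108
     new_off_1 = -8, new_off_3 = 0, both within [-0x100, 0xfc]
   so the emitted sequence is roughly
       add  scratch, xb, #0x108
       stp  w1, w1, [scratch, #-8]
       stp  w1, w1, [scratch]
   which is equivalent to the add/stp form shown in that comment.  */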
/* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.  Assume for now that
   it isn't worth branching around empty masked ops (including masked
   stores).  */

static bool
aarch64_empty_mask_is_expensive (unsigned)
{
  return false;
}

/* Return 1 if pseudo register should be created and used to hold
   GOT address for PIC code.  */

bool
aarch64_use_pseudo_pic_reg (void)
{
  return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
}
/* Implement TARGET_UNSPEC_MAY_TRAP_P.  */

static int
aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
{
  switch (XINT (x, 1))
    {
    case UNSPEC_GOTSMALLPIC:
    case UNSPEC_GOTSMALLPIC28K:
    case UNSPEC_GOTTINYPIC:
      return 0;
    default:
      break;
    }

  return default_unspec_may_trap_p (x, flags);
}
/* If X is a positive CONST_DOUBLE with a value that is a power of 2
   return the log2 of that value.  Otherwise return -1.  */

int
aarch64_fpconst_pow_of_2 (rtx x)
{
  const REAL_VALUE_TYPE *r;

  if (!CONST_DOUBLE_P (x))
    return -1;

  r = CONST_DOUBLE_REAL_VALUE (x);

  if (REAL_VALUE_NEGATIVE (*r)
      || REAL_VALUE_ISNAN (*r)
      || REAL_VALUE_ISINF (*r)
      || !real_isinteger (r, DFmode))
    return -1;

  return exact_log2 (real_to_integer (r));
}
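/* Illustrative examples (not from the original source):
   aarch64_fpconst_pow_of_2 returns 3 for a CONST_DOUBLE of 8.0 and -1 for
   values such as 3.0, -4.0 or NaN; aarch64_fpconst_pow2_recip below returns
   4 for 0.0625 (i.e. 1/2^4) and -1 otherwise.  */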
/* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
   power of 2 (i.e. 1/2^n) return the number of float bits, e.g. for
   x == (1/2^n) return n.  Otherwise return -1.  */

int
aarch64_fpconst_pow2_recip (rtx x)
{
  REAL_VALUE_TYPE r0;

  if (!CONST_DOUBLE_P (x))
    return -1;

  r0 = *CONST_DOUBLE_REAL_VALUE (x);
  if (exact_real_inverse (DFmode, &r0)
      && !REAL_VALUE_NEGATIVE (r0))
    {
      int ret = exact_log2 (real_to_integer (&r0));
      if (ret >= 1 && ret <= 32)
        return ret;
    }
  return -1;
}
/* If X is a vector of equal CONST_DOUBLE values and that value is
   Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */

int
aarch64_vec_fpconst_pow_of_2 (rtx x)
{
  int nelts;
  if (!CONST_VECTOR_P (x)
      || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
    return -1;

  if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
    return -1;

  int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
  if (firstval <= 0)
    return -1;

  for (int i = 1; i < nelts; i++)
    if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
      return -1;

  return firstval;
}
/* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
   to float.

   __fp16 always promotes through this hook.
   _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
   through the generic excess precision logic rather than here.  */

static tree
aarch64_promoted_type (const_tree t)
{
  if (SCALAR_FLOAT_TYPE_P (t)
      && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
    return float_type_node;

  return NULL_TREE;
}

/* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */

static bool
aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
                           optimization_type opt_type)
{
  switch (op)
    {
    case rsqrt_optab:
      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);

    default:
      return true;
    }
}
/* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook.  */

static unsigned int
aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
                                        int *offset)
{
  /* Polynomial invariant 1 == (VG / 2) - 1.  */
  gcc_assert (i == 1);
  *factor = 2;
  *offset = 1;
  return AARCH64_DWARF_VG;
}

/* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
   if MODE is HFmode, and punt to the generic implementation otherwise.  */

static bool
aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
{
  return (mode == HFmode
          ? true
          : default_libgcc_floating_mode_supported_p (mode));
}

/* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
   if MODE is HFmode, and punt to the generic implementation otherwise.  */

static bool
aarch64_scalar_mode_supported_p (scalar_mode mode)
{
  if (DECIMAL_FLOAT_MODE_P (mode))
    return default_decimal_float_supported_p ();

  return (mode == HFmode
          ? true
          : default_scalar_mode_supported_p (mode));
}
/* Set the value of FLT_EVAL_METHOD.
   ISO/IEC TS 18661-3 defines two values that we'd like to make use of:

    0: evaluate all operations and constants, whose semantic type has at
       most the range and precision of type float, to the range and
       precision of float; evaluate all other operations and constants to
       the range and precision of the semantic type;

    N, where _FloatN is a supported interchange floating type:
       evaluate all operations and constants, whose semantic type has at
       most the range and precision of _FloatN type, to the range and
       precision of the _FloatN type; evaluate all other operations and
       constants to the range and precision of the semantic type;

   If we have the ARMv8.2-A extensions then we support _Float16 in native
   precision, so we should set this to 16.  Otherwise, we support the type,
   but want to evaluate expressions in float precision, so set this to
   0.  */

static enum flt_eval_method
aarch64_excess_precision (enum excess_precision_type type)
{
  switch (type)
    {
    case EXCESS_PRECISION_TYPE_FAST:
    case EXCESS_PRECISION_TYPE_STANDARD:
      /* We can calculate either in 16-bit range and precision or
         32-bit range and precision.  Make that decision based on whether
         we have native support for the ARMv8.2-A 16-bit floating-point
         instructions or not.  */
      return (TARGET_FP_F16INST
              ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
              : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
    case EXCESS_PRECISION_TYPE_IMPLICIT:
    case EXCESS_PRECISION_TYPE_FLOAT16:
      return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
    default:
      gcc_unreachable ();
    }
  return FLT_EVAL_METHOD_UNPREDICTABLE;
}
/* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
   scheduled for speculative execution.  Reject the long-running division
   and square-root instructions.  */

static bool
aarch64_sched_can_speculate_insn (rtx_insn *insn)
{
  switch (get_attr_type (insn))
    {
    case TYPE_NEON_FP_SQRT_S:
    case TYPE_NEON_FP_SQRT_D:
    case TYPE_NEON_FP_SQRT_S_Q:
    case TYPE_NEON_FP_SQRT_D_Q:
    case TYPE_NEON_FP_DIV_S:
    case TYPE_NEON_FP_DIV_D:
    case TYPE_NEON_FP_DIV_S_Q:
    case TYPE_NEON_FP_DIV_D_Q:
      return false;
    default:
      return true;
    }
}
/* Implement TARGET_COMPUTE_PRESSURE_CLASSES.  */

static int
aarch64_compute_pressure_classes (reg_class *classes)
{
  int i = 0;
  classes[i++] = GENERAL_REGS;
  classes[i++] = FP_REGS;
  /* PR_REGS isn't a useful pressure class because many predicate pseudo
     registers need to go in PR_LO_REGS at some point during their
     lifetime.  Splitting it into two halves has the effect of making
     all predicates count against PR_LO_REGS, so that we try whenever
     possible to restrict the number of live predicates to 8.  This
     greatly reduces the amount of spilling in certain loops.  */
  classes[i++] = PR_LO_REGS;
  classes[i++] = PR_HI_REGS;
  return i;
}
/* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */

static bool
aarch64_can_change_mode_class (machine_mode from,
                               machine_mode to, reg_class_t)
{
  unsigned int from_flags = aarch64_classify_vector_mode (from);
  unsigned int to_flags = aarch64_classify_vector_mode (to);

  bool from_sve_p = (from_flags & VEC_ANY_SVE);
  bool to_sve_p = (to_flags & VEC_ANY_SVE);

  bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
  bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);

  bool from_pred_p = (from_flags & VEC_SVE_PRED);
  bool to_pred_p = (to_flags & VEC_SVE_PRED);

  bool to_partial_advsimd_struct_p = (to_flags == (VEC_ADVSIMD | VEC_STRUCT
                                                   | VEC_PARTIAL));
  bool from_partial_advsimd_struct_p = (from_flags == (VEC_ADVSIMD | VEC_STRUCT
                                                       | VEC_PARTIAL));

  /* Don't allow changes between predicate modes and other modes.
     Only predicate registers can hold predicate modes and only
     non-predicate registers can hold non-predicate modes, so any
     attempt to mix them would require a round trip through memory.  */
  if (from_pred_p != to_pred_p)
    return false;

  /* Don't allow changes between partial SVE modes and other modes.
     The contents of partial SVE modes are distributed evenly across
     the register, whereas GCC expects them to be clustered together.  */
  if (from_partial_sve_p != to_partial_sve_p)
    return false;

  /* Similarly reject changes between partial SVE modes that have
     different patterns of significant and insignificant bits.  */
  if (from_partial_sve_p
      && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
          || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
    return false;

  /* Don't allow changes between partial Advanced SIMD structure modes
     and other modes if either mode is wider than 64 bits.  */
  if ((to_partial_advsimd_struct_p ^ from_partial_advsimd_struct_p)
      && (known_gt (GET_MODE_SIZE (to), 8)
          || known_gt (GET_MODE_SIZE (from), 8)))
    return false;

  if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
    {
      /* Don't allow changes between SVE modes and other modes that might
         be bigger than 128 bits.  In particular, OImode, CImode and XImode
         divide into 128-bit quantities while SVE modes divide into
         BITS_PER_SVE_VECTOR quantities.  */
      if (from_sve_p && !to_sve_p && maybe_gt (GET_MODE_BITSIZE (to), 128))
        return false;
      if (to_sve_p && !from_sve_p && maybe_gt (GET_MODE_BITSIZE (from), 128))
        return false;
    }

  if (BYTES_BIG_ENDIAN)
    {
      /* Don't allow changes between SVE data modes and non-SVE modes.
         See the comment at the head of aarch64-sve.md for details.  */
      if (from_sve_p != to_sve_p)
        return false;

      /* Don't allow changes in element size: lane 0 of the new vector
         would not then be lane 0 of the old vector.  See the comment
         above aarch64_maybe_expand_sve_subreg_move for a more detailed
         description.

         In the worst case, this forces a register to be spilled in
         one mode and reloaded in the other, which handles the
         endianness correctly.  */
      if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
        return false;
    }
  return true;
}
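/* Illustrative example (not from the original source): on a big-endian
   target the checks above reject an SVE mode change such as VNx4SI ->
   VNx8HI, because it alters the element size, forcing the value through
   memory instead; a comparable Advanced SIMD change such as V4SI -> V8HI
   is still allowed.  */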
/* Implement TARGET_EARLY_REMAT_MODES.  */

static void
aarch64_select_early_remat_modes (sbitmap modes)
{
  /* SVE values are not normally live across a call, so it should be
     worth doing early rematerialization even in VL-specific mode.  */
  for (int i = 0; i < NUM_MACHINE_MODES; ++i)
    if (aarch64_sve_mode_p ((machine_mode) i))
      bitmap_set_bit (modes, i);
}
/* Override the default target speculation_safe_value.  */
static rtx
aarch64_speculation_safe_value (machine_mode mode,
                                rtx result, rtx val, rtx failval)
{
  /* Maybe we should warn if falling back to hard barriers.  They are
     likely to be noticeably more expensive than the alternative below.  */
  if (!aarch64_track_speculation)
    return default_speculation_safe_value (mode, result, val, failval);

  if (!register_operand (val, mode))
    val = copy_to_mode_reg (mode, val);

  if (!aarch64_reg_or_zero (failval, mode))
    failval = copy_to_mode_reg (mode, failval);

  emit_insn (gen_despeculate_copy (mode, result, val, failval));
  return result;
}
/* Implement TARGET_ESTIMATED_POLY_VALUE.
   Look into the tuning structure for an estimate.
   KIND specifies the type of requested estimate: min, max or likely.
   For cores with a known SVE width all three estimates are the same.
   For generic SVE tuning we want to distinguish the maximum estimate from
   the minimum and likely ones.
   The likely estimate is the same as the minimum in that case to give a
   conservative behavior of auto-vectorizing with SVE when it is a win
   even for 128-bit SVE.
   When SVE width information is available VAL.coeffs[1] is multiplied by
   the number of VQ chunks over the initial Advanced SIMD 128 bits.  */

static HOST_WIDE_INT
aarch64_estimated_poly_value (poly_int64 val,
                              poly_value_estimate_kind kind
                                = POLY_VALUE_LIKELY)
{
  unsigned int width_source = aarch64_tune_params.sve_width;

  /* If there is no core-specific information then the minimum and likely
     values are based on 128-bit vectors and the maximum is based on
     the architectural maximum of 2048 bits.  */
  if (width_source == SVE_SCALABLE)
    switch (kind)
      {
      case POLY_VALUE_MIN:
      case POLY_VALUE_LIKELY:
        return val.coeffs[0];
      case POLY_VALUE_MAX:
        return val.coeffs[0] + val.coeffs[1] * 15;
      }

  /* Allow sve_width to be a bitmask of different VL, treating the lowest
     as likely.  This could be made more general if future -mtune options
     need it to be.  */
  if (kind == POLY_VALUE_MAX)
    width_source = 1 << floor_log2 (width_source);
  else
    width_source = least_bit_hwi (width_source);

  /* If the core provides width information, use that.  */
  HOST_WIDE_INT over_128 = width_source - 128;
  return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
}
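/* Illustrative example (not from the original source): the number of bytes
   in an SVE vector is the poly_int 16 + 16x.  With sve_width == SVE_SCALABLE
   the code above estimates 16 for the minimum and likely values and
   16 + 16 * 15 = 256 (i.e. 2048 bits) for the maximum; with a core-specific
   sve_width of 256 it estimates 16 + 16 * (256 - 128) / 128 = 32 bytes for
   all three kinds.  */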
/* Return true for types that could be supported as SIMD return or
   argument types.  */

static bool
supported_simd_type (tree t)
{
  if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
    {
      HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
      return s == 1 || s == 2 || s == 4 || s == 8;
    }
  return false;
}

/* Return true for types that currently are supported as SIMD return
   or argument types.  */

static bool
currently_supported_simd_type (tree t, tree b)
{
  if (COMPLEX_FLOAT_TYPE_P (t))
    return false;

  if (TYPE_SIZE (t) != TYPE_SIZE (b))
    return false;

  return supported_simd_type (t);
}
/* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN.  */

static int
aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
                                                struct cgraph_simd_clone *clonei,
                                                tree base_type, int num,
                                                bool explicit_p)
{
  tree t, ret_type;
  unsigned int elt_bits, count;
  unsigned HOST_WIDE_INT const_simdlen;
  poly_uint64 vec_bits;

  /* For now, SVE simdclones won't produce illegal simdlen, so only check
     const simdlens here.  */
  if (maybe_ne (clonei->simdlen, 0U)
      && clonei->simdlen.is_constant (&const_simdlen)
      && (const_simdlen < 2
          || const_simdlen > 1024
          || (const_simdlen & (const_simdlen - 1)) != 0))
    {
      if (explicit_p)
        warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
                    "unsupported simdlen %wd", const_simdlen);
      return 0;
    }

  ret_type = TREE_TYPE (TREE_TYPE (node->decl));
  if (TREE_CODE (ret_type) != VOID_TYPE
      && !currently_supported_simd_type (ret_type, base_type))
    {
      if (!explicit_p)
        ;
      else if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
        warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
                    "GCC does not currently support mixed size types "
                    "for %<simd%> functions");
      else if (supported_simd_type (ret_type))
        warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
                    "GCC does not currently support return type %qT "
                    "for %<simd%> functions", ret_type);
      else
        warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
                    "unsupported return type %qT for %<simd%> functions",
                    ret_type);
      return 0;
    }

  int i;
  tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
  bool decl_arg_p = (node->definition || type_arg_types == NULL_TREE);

  for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
       t && t != void_list_node; t = TREE_CHAIN (t), i++)
    {
      tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);

      if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
          && !currently_supported_simd_type (arg_type, base_type))
        {
          if (!explicit_p)
            ;
          else if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
            warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
                        "GCC does not currently support mixed size types "
                        "for %<simd%> functions");
          else
            warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
                        "GCC does not currently support argument type %qT "
                        "for %<simd%> functions", arg_type);
          return 0;
        }
    }

  clonei->vecsize_mangle = 'n';
  clonei->mask_mode = VOIDmode;
  elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
  if (known_eq (clonei->simdlen, 0U))
    {
      count = 2;
      vec_bits = (num == 0 ? 64 : 128);
      clonei->simdlen = exact_div (vec_bits, elt_bits);
    }
  else
    {
      count = 1;
      vec_bits = clonei->simdlen * elt_bits;
      /* For now, SVE simdclones won't produce illegal simdlen, so only check
         const simdlens here.  */
      if (clonei->simdlen.is_constant (&const_simdlen)
          && maybe_ne (vec_bits, 64U) && maybe_ne (vec_bits, 128U))
        {
          if (explicit_p)
            warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
                        "GCC does not currently support simdlen %wd for "
                        "type %qT",
                        const_simdlen, base_type);
          return 0;
        }
    }
  clonei->vecsize_int = vec_bits;
  clonei->vecsize_float = vec_bits;
  return count;
}
/* Implement TARGET_SIMD_CLONE_ADJUST.  */

static void
aarch64_simd_clone_adjust (struct cgraph_node *node)
{
  /* Add aarch64_vector_pcs target attribute to SIMD clones so they
     use the correct ABI.  */

  tree t = TREE_TYPE (node->decl);
  TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
                                        TYPE_ATTRIBUTES (t));
}

/* Implement TARGET_SIMD_CLONE_USABLE.  */

static int
aarch64_simd_clone_usable (struct cgraph_node *node)
{
  switch (node->simdclone->vecsize_mangle)
    {
    case 'n':
      if (!TARGET_SIMD)
        return -1;
      return 0;
    default:
      gcc_unreachable ();
    }
}
/* Implement TARGET_COMP_TYPE_ATTRIBUTES.  */

static int
aarch64_comp_type_attributes (const_tree type1, const_tree type2)
{
  auto check_attr = [&](const char *name) {
    tree attr1 = lookup_attribute (name, TYPE_ATTRIBUTES (type1));
    tree attr2 = lookup_attribute (name, TYPE_ATTRIBUTES (type2));
    if (!attr1 && !attr2)
      return true;

    return attr1 && attr2 && attribute_value_equal (attr1, attr2);
  };

  if (!check_attr ("aarch64_vector_pcs"))
    return 0;
  if (!check_attr ("Advanced SIMD type"))
    return 0;
  if (!check_attr ("SVE type"))
    return 0;
  if (!check_attr ("SVE sizeless type"))
    return 0;
  return 1;
}
/* Implement TARGET_GET_MULTILIB_ABI_NAME.  */

static const char *
aarch64_get_multilib_abi_name (void)
{
  if (TARGET_BIG_END)
    return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
  return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
}

/* Implement TARGET_STACK_PROTECT_GUARD.  In case of a
   global variable based guard use the default else
   return a null tree.  */
static tree
aarch64_stack_protect_guard (void)
{
  if (aarch64_stack_protector_guard == SSP_GLOBAL)
    return default_stack_protect_guard ();

  return NULL_TREE;
}
/* Return the diagnostic message string if conversion from FROMTYPE to
   TOTYPE is not allowed, NULL otherwise.  */

static const char *
aarch64_invalid_conversion (const_tree fromtype, const_tree totype)
{
  if (element_mode (fromtype) != element_mode (totype))
    {
      /* Do not allow conversions to/from BFmode scalar types.  */
      if (TYPE_MODE (fromtype) == BFmode)
        return N_("invalid conversion from type %<bfloat16_t%>");
      if (TYPE_MODE (totype) == BFmode)
        return N_("invalid conversion to type %<bfloat16_t%>");
    }

  /* Conversion allowed.  */
  return NULL;
}

/* Return the diagnostic message string if the unary operation OP is
   not permitted on TYPE, NULL otherwise.  */

static const char *
aarch64_invalid_unary_op (int op, const_tree type)
{
  /* Reject all single-operand operations on BFmode except for &.  */
  if (element_mode (type) == BFmode && op != ADDR_EXPR)
    return N_("operation not permitted on type %<bfloat16_t%>");

  /* Operation allowed.  */
  return NULL;
}

/* Return the diagnostic message string if the binary operation OP is
   not permitted on TYPE1 and TYPE2, NULL otherwise.  */

static const char *
aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
                           const_tree type2)
{
  /* Reject all 2-operand operations on BFmode.  */
  if (element_mode (type1) == BFmode
      || element_mode (type2) == BFmode)
    return N_("operation not permitted on type %<bfloat16_t%>");

  if (VECTOR_TYPE_P (type1)
      && VECTOR_TYPE_P (type2)
      && !TYPE_INDIVISIBLE_P (type1)
      && !TYPE_INDIVISIBLE_P (type2)
      && (aarch64_sve::builtin_type_p (type1)
          != aarch64_sve::builtin_type_p (type2)))
    return N_("cannot combine GNU and SVE vectors in a binary operation");

  /* Operation allowed.  */
  return NULL;
}

/* Implement TARGET_MEMTAG_CAN_TAG_ADDRESSES.  Here we tell the rest of the
   compiler that we automatically ignore the top byte of our pointers, which
   allows using -fsanitize=hwaddress.  */
static bool
aarch64_can_tag_addresses ()
{
  return !TARGET_ILP32;
}
/* Implement TARGET_ASM_FILE_END for AArch64.  This adds the AArch64 GNU NOTE
   section at the end if needed.  */
#define GNU_PROPERTY_AARCH64_FEATURE_1_AND  0xc0000000
#define GNU_PROPERTY_AARCH64_FEATURE_1_BTI  (1U << 0)
#define GNU_PROPERTY_AARCH64_FEATURE_1_PAC  (1U << 1)
void
aarch64_file_end_indicate_exec_stack ()
{
  file_end_indicate_exec_stack ();

  unsigned feature_1_and = 0;
  if (aarch64_bti_enabled ())
    feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;

  if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
    feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;

  if (feature_1_and)
    {
      /* Generate .note.gnu.property section.  */
      switch_to_section (get_section (".note.gnu.property",
                                      SECTION_NOTYPE, NULL));

      /* PT_NOTE header: namesz, descsz, type.
         namesz = 4 ("GNU\0")
         descsz = 16 (Size of the program property array)
                  [(12 + padding) * Number of array elements]
         type   = 5 (NT_GNU_PROPERTY_TYPE_0).  */
      assemble_align (POINTER_SIZE);
      assemble_integer (GEN_INT (4), 4, 32, 1);
      assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
      assemble_integer (GEN_INT (5), 4, 32, 1);

      /* PT_NOTE name.  */
      assemble_string ("GNU", 4);

      /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
         type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
         data = feature_1_and.  */
      assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
      assemble_integer (GEN_INT (4), 4, 32, 1);
      assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);

      /* Pad the size of the note to the required alignment.  */
      assemble_align (POINTER_SIZE);
    }
}
#undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
#undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
#undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
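/* Illustrative example (not from the original source): with both BTI and
   PAC-RET enabled on an LP64 ELF target the code above emits roughly
       .section .note.gnu.property,"a"
       .align  3
       .word   4            // namesz ("GNU\0")
       .word   16           // descsz
       .word   5            // NT_GNU_PROPERTY_TYPE_0
       .string "GNU"
       .word   0xc0000000   // GNU_PROPERTY_AARCH64_FEATURE_1_AND
       .word   4            // datasz
       .word   3            // BTI | PAC
       .align  3
   where the exact directives depend on the assembler in use.  */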
/* Helper function for straight line speculation.
   Return what barrier should be emitted for straight line speculation
   mitigation.
   When not mitigating against straight line speculation this function returns
   an empty string.
   When mitigating against straight line speculation, use:
   * SB when the v8.5-A SB extension is enabled.
   * DSB+ISB otherwise.  */
const char *
aarch64_sls_barrier (int mitigation_required)
{
  return mitigation_required
    ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
    : "";
}

static GTY (()) tree aarch64_sls_shared_thunks[30];
static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
const char *indirect_symbol_names[30] = {
    "__call_indirect_x0",
    "__call_indirect_x1",
    "__call_indirect_x2",
    "__call_indirect_x3",
    "__call_indirect_x4",
    "__call_indirect_x5",
    "__call_indirect_x6",
    "__call_indirect_x7",
    "__call_indirect_x8",
    "__call_indirect_x9",
    "__call_indirect_x10",
    "__call_indirect_x11",
    "__call_indirect_x12",
    "__call_indirect_x13",
    "__call_indirect_x14",
    "__call_indirect_x15",
    "", /* "__call_indirect_x16", */
    "", /* "__call_indirect_x17", */
    "__call_indirect_x18",
    "__call_indirect_x19",
    "__call_indirect_x20",
    "__call_indirect_x21",
    "__call_indirect_x22",
    "__call_indirect_x23",
    "__call_indirect_x24",
    "__call_indirect_x25",
    "__call_indirect_x26",
    "__call_indirect_x27",
    "__call_indirect_x28",
    "__call_indirect_x29",
};
/* Function to create a BLR thunk.  This thunk is used to mitigate straight
   line speculation.  Instead of a simple BLR that can be speculated past,
   we emit a BL to this thunk, and this thunk contains a BR to the relevant
   register.  These thunks have the relevant speculation barriers put after
   their indirect branch so that speculation is blocked.

   We use such a thunk so the speculation barriers are kept off the
   architecturally executed path in order to reduce the performance overhead.

   When optimizing for size we use stubs shared by the linked object.
   When optimizing for performance we emit stubs for each function in the hope
   that the branch predictor can better train on jumps specific for a given
   function.  */
rtx
aarch64_sls_create_blr_label (int regnum)
{
  gcc_assert (STUB_REGNUM_P (regnum));
  if (optimize_function_for_size_p (cfun))
    {
      /* For the thunks shared between different functions in this compilation
         unit we use a named symbol -- this is just for users to more easily
         understand the generated assembly.  */
      aarch64_sls_shared_thunks_needed = true;
      const char *thunk_name = indirect_symbol_names[regnum];
      if (aarch64_sls_shared_thunks[regnum] == NULL)
        {
          /* Build a decl representing this function stub and record it for
             later.  We build a decl here so we can use the GCC machinery for
             handling sections automatically (through `get_named_section` and
             `make_decl_one_only`).  That saves us a lot of trouble handling
             the specifics of different output file formats.  */
          tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
                                  get_identifier (thunk_name),
                                  build_function_type_list (void_type_node,
                                                            NULL_TREE));
          DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
                                           NULL_TREE, void_type_node);
          TREE_PUBLIC (decl) = 1;
          TREE_STATIC (decl) = 1;
          DECL_IGNORED_P (decl) = 1;
          DECL_ARTIFICIAL (decl) = 1;
          make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
          resolve_unique_section (decl, 0, false);
          aarch64_sls_shared_thunks[regnum] = decl;
        }

      return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
    }

  if (cfun->machine->call_via[regnum] == NULL)
    cfun->machine->call_via[regnum]
      = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
  return cfun->machine->call_via[regnum];
}
/* Helper function for aarch64_sls_emit_blr_function_thunks and
   aarch64_sls_emit_shared_blr_thunks below.  */
static void
aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
{
  /* Save in x16 and branch to that function so this transformation does
     not prevent jumping to `BTI c` instructions.  */
  asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
  asm_fprintf (out_file, "\tbr\tx16\n");
}

/* Emit all BLR stubs for this particular function.
   Here we emit all the BLR stubs needed for the current function.  Since we
   emit these stubs in a consecutive block we know there will be no speculation
   gadgets between each stub, and hence we only emit a speculation barrier at
   the end of the stub sequences.

   This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook.  */
void
aarch64_sls_emit_blr_function_thunks (FILE *out_file)
{
  if (! aarch64_harden_sls_blr_p ())
    return;

  bool any_functions_emitted = false;
  /* We must save and restore the current function section since this assembly
     is emitted at the end of the function.  This means it can be emitted *just
     after* the cold section of a function.  That cold part would be emitted in
     a different section.  That switch would trigger a `.cfi_endproc` directive
     to be emitted in the original section and a `.cfi_startproc` directive to
     be emitted in the new section.  Switching to the original section without
     restoring would mean that the `.cfi_endproc` emitted as a function ends
     would happen in a different section -- leaving an unmatched
     `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
     in the standard text section.  */
  section *save_text_section = in_section;
  switch_to_section (function_section (current_function_decl));
  for (int regnum = 0; regnum < 30; ++regnum)
    {
      rtx specu_label = cfun->machine->call_via[regnum];
      if (specu_label == NULL)
        continue;

      targetm.asm_out.print_operand (out_file, specu_label, 0);
      asm_fprintf (out_file, ":\n");
      aarch64_sls_emit_function_stub (out_file, regnum);
      any_functions_emitted = true;
    }
  if (any_functions_emitted)
    /* Can use the SB if needs be here, since this stub will only be used
       by the current function, and hence for the current target.  */
    asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
  switch_to_section (save_text_section);
}

/* Emit shared BLR stubs for the current compilation unit.
   Over the course of compiling this unit we may have converted some BLR
   instructions to a BL to a shared stub function.  This is where we emit those
   stub functions.
   This function is for the stubs shared between different functions in this
   compilation unit.  We share when optimizing for size instead of speed.

   This function is called through the TARGET_ASM_FILE_END hook.  */
void
aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
{
  if (! aarch64_sls_shared_thunks_needed)
    return;

  for (int regnum = 0; regnum < 30; ++regnum)
    {
      tree decl = aarch64_sls_shared_thunks[regnum];
      if (!decl)
	continue;

      const char *name = indirect_symbol_names[regnum];
      switch_to_section (get_named_section (decl, NULL, 0));
      ASM_OUTPUT_ALIGN (out_file, 2);
      targetm.asm_out.globalize_label (out_file, name);
      /* Only emits if the compiler is configured for an assembler that can
	 handle visibility directives.  */
      targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
      ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
      ASM_OUTPUT_LABEL (out_file, name);
      aarch64_sls_emit_function_stub (out_file, regnum);
      /* Use the most conservative target to ensure it can always be used by
	 any function in the translation unit.  */
      asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
      ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
    }
}
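
/* Schematic view of one shared stub as it appears in the output file.  The
   directives shown correspond to the macros used above; the exact spellings
   depend on the configured assembler, and the symbol name comes from
   indirect_symbol_names[regnum]:

	.section <one-only section chosen for the stub's decl>
	.align	2
	.global	<name>
	.hidden	<name>
	.type	<name>, %function
   <name>:
	mov	x16, x<regnum>
	br	x16
	dsb	sy
	isb
	.size	<name>, .-<name>

   DSB SY followed by ISB is used instead of SB so that the stub is valid for
   every function in the translation unit, whatever its target attributes.  */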

/* Implement TARGET_ASM_FILE_END.  */
void
aarch64_asm_file_end ()
{
  aarch64_sls_emit_shared_blr_thunks (asm_out_file);
  /* Since this function will be called for the ASM_FILE_END hook, we ensure
     that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
     for FreeBSD) still gets called.  */
#ifdef TARGET_ASM_FILE_END
  TARGET_ASM_FILE_END ();
#endif
}

const char *
aarch64_indirect_call_asm (rtx addr)
{
  gcc_assert (REG_P (addr));
  if (aarch64_harden_sls_blr_p ())
    {
      rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
      output_asm_insn ("bl\t%0", &stub_label);
    }
  else
    output_asm_insn ("blr\t%0", &addr);
  return "";
}
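
/* Illustrative only: when BLR hardening is enabled (-mharden-sls=blr or
   =all), an indirect call that would normally be emitted as

	blr	x2

   is instead emitted as

	bl	<stub label or shared stub symbol for x2>

   so the only remaining register-indirect branch is the `br x16` inside the
   stub emitted above, which is followed by a speculation barrier.  */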

/* Target-specific selftests.  */

#if CHECKING_P

namespace selftest {

/* Selftest for the RTL loader.
   Verify that the RTL loader copes with a dump from
   print_rtx_function.  This is essentially just a test that class
   function_reader can handle a real dump, but it also verifies
   that lookup_reg_by_dump_name correctly handles hard regs.
   The presence of hard reg names in the dump means that the test is
   target-specific, hence it is in this file.  */

static void
aarch64_test_loading_full_dump ()
{
  rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));

  ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));

  rtx_insn *insn_1 = get_insn_by_uid (1);
  ASSERT_EQ (NOTE, GET_CODE (insn_1));

  rtx_insn *insn_15 = get_insn_by_uid (15);
  ASSERT_EQ (INSN, GET_CODE (insn_15));
  ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));

  /* Verify crtl->return_rtx.  */
  ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
  ASSERT_EQ (0, REGNO (crtl->return_rtx));
  ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
}

/* Test the fractional_cost class.  */

static void
aarch64_test_fractional_cost ()
{
  using cf = fractional_cost;

  ASSERT_EQ (cf (0, 20), 0);

  ASSERT_EQ (cf (4, 2), 2);
  ASSERT_EQ (3, cf (9, 3));

  ASSERT_NE (cf (5, 2), 2);
  ASSERT_NE (3, cf (8, 3));

  ASSERT_EQ (cf (7, 11) + cf (15, 11), 2);
  ASSERT_EQ (cf (2, 3) + cf (3, 5), cf (19, 15));
  ASSERT_EQ (cf (2, 3) + cf (1, 6) + cf (1, 6), 1);

  ASSERT_EQ (cf (14, 15) - cf (4, 15), cf (2, 3));
  ASSERT_EQ (cf (1, 4) - cf (1, 2), 0);
  ASSERT_EQ (cf (3, 5) - cf (1, 10), cf (1, 2));
  ASSERT_EQ (cf (11, 3) - 3, cf (2, 3));
  ASSERT_EQ (3 - cf (7, 3), cf (2, 3));
  ASSERT_EQ (3 - cf (10, 3), 0);

  ASSERT_EQ (cf (2, 3) * 5, cf (10, 3));
  ASSERT_EQ (14 * cf (11, 21), cf (22, 3));

  ASSERT_TRUE (cf (4, 15) < cf (5, 15));
  ASSERT_FALSE (cf (5, 15) < cf (5, 15));
  ASSERT_FALSE (cf (6, 15) < cf (5, 15));
  ASSERT_TRUE (cf (1, 3) < cf (2, 5));
  ASSERT_TRUE (cf (1, 12) < cf (1, 6));
  ASSERT_FALSE (cf (5, 3) < cf (5, 3));
  ASSERT_TRUE (cf (239, 240) < 1);
  ASSERT_FALSE (cf (240, 240) < 1);
  ASSERT_FALSE (cf (241, 240) < 1);
  ASSERT_FALSE (2 < cf (207, 104));
  ASSERT_FALSE (2 < cf (208, 104));
  ASSERT_TRUE (2 < cf (209, 104));

  ASSERT_TRUE (cf (4, 15) <= cf (5, 15));
  ASSERT_TRUE (cf (5, 15) <= cf (5, 15));
  ASSERT_FALSE (cf (6, 15) <= cf (5, 15));
  ASSERT_TRUE (cf (1, 3) <= cf (2, 5));
  ASSERT_TRUE (cf (1, 12) <= cf (1, 6));
  ASSERT_TRUE (cf (5, 3) <= cf (5, 3));
  ASSERT_TRUE (cf (239, 240) <= 1);
  ASSERT_TRUE (cf (240, 240) <= 1);
  ASSERT_FALSE (cf (241, 240) <= 1);
  ASSERT_FALSE (2 <= cf (207, 104));
  ASSERT_TRUE (2 <= cf (208, 104));
  ASSERT_TRUE (2 <= cf (209, 104));

  ASSERT_FALSE (cf (4, 15) >= cf (5, 15));
  ASSERT_TRUE (cf (5, 15) >= cf (5, 15));
  ASSERT_TRUE (cf (6, 15) >= cf (5, 15));
  ASSERT_FALSE (cf (1, 3) >= cf (2, 5));
  ASSERT_FALSE (cf (1, 12) >= cf (1, 6));
  ASSERT_TRUE (cf (5, 3) >= cf (5, 3));
  ASSERT_FALSE (cf (239, 240) >= 1);
  ASSERT_TRUE (cf (240, 240) >= 1);
  ASSERT_TRUE (cf (241, 240) >= 1);
  ASSERT_TRUE (2 >= cf (207, 104));
  ASSERT_TRUE (2 >= cf (208, 104));
  ASSERT_FALSE (2 >= cf (209, 104));

  ASSERT_FALSE (cf (4, 15) > cf (5, 15));
  ASSERT_FALSE (cf (5, 15) > cf (5, 15));
  ASSERT_TRUE (cf (6, 15) > cf (5, 15));
  ASSERT_FALSE (cf (1, 3) > cf (2, 5));
  ASSERT_FALSE (cf (1, 12) > cf (1, 6));
  ASSERT_FALSE (cf (5, 3) > cf (5, 3));
  ASSERT_FALSE (cf (239, 240) > 1);
  ASSERT_FALSE (cf (240, 240) > 1);
  ASSERT_TRUE (cf (241, 240) > 1);
  ASSERT_TRUE (2 > cf (207, 104));
  ASSERT_FALSE (2 > cf (208, 104));
  ASSERT_FALSE (2 > cf (209, 104));

  ASSERT_EQ (cf (1, 2).ceil (), 1);
  ASSERT_EQ (cf (11, 7).ceil (), 2);
  ASSERT_EQ (cf (20, 1).ceil (), 20);
  ASSERT_EQ ((cf (0xfffffffd) + 1).ceil (), 0xfffffffe);
  ASSERT_EQ ((cf (0xfffffffd) + 2).ceil (), 0xffffffff);
  ASSERT_EQ ((cf (0xfffffffd) + 3).ceil (), 0xffffffff);
  ASSERT_EQ ((cf (0x7fffffff) * 2).ceil (), 0xfffffffe);
  ASSERT_EQ ((cf (0x80000000) * 2).ceil (), 0xffffffff);

  ASSERT_EQ (cf (1, 2).as_double (), 0.5);
}
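
/* Worked example for a couple of the checks above (explanatory only):
   cf (2, 3) + cf (3, 5) represents 2/3 + 3/5 = 10/15 + 9/15 = 19/15, hence
   the comparison with cf (19, 15); and cf (11, 7).ceil () rounds 11/7
   (about 1.57) up to 2.  */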

/* Run all target-specific selftests.  */

static void
aarch64_run_selftests (void)
{
  aarch64_test_loading_full_dump ();
  aarch64_test_fractional_cost ();
}

} // namespace selftest

#endif /* #if CHECKING_P */

#undef TARGET_STACK_PROTECT_GUARD
#define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard

#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST aarch64_address_cost

/* This hook determines whether unnamed bitfields affect the alignment
   of the containing structure.  The hook returns true if the structure
   should inherit the alignment requirements of an unnamed bitfield's
   type.  */
#undef TARGET_ALIGN_ANON_BITFIELD
#define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true

#undef TARGET_ASM_ALIGNED_DI_OP
#define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"

#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"

#undef TARGET_ASM_ALIGNED_SI_OP
#define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"

#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
  hook_bool_const_tree_hwi_hwi_const_tree_true

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START aarch64_start_file

#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk

#undef TARGET_ASM_SELECT_RTX_SECTION
#define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section

#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
#define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template

#undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
#define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list

#undef TARGET_CALLEE_COPIES
#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE aarch64_can_eliminate

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P aarch64_can_inline_p

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem

#undef TARGET_CASE_VALUES_THRESHOLD
#define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage

#undef TARGET_MEMBER_TYPE_FORCES_BLK
#define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk

/* Only the least significant bit is used for initialization guard
   variables.  */
#undef TARGET_CXX_GUARD_MASK_BIT
#define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix

#ifdef TARGET_BIG_ENDIAN_DEFAULT
#undef TARGET_DEFAULT_TARGET_FLAGS
#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
#endif

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs

#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL aarch64_builtin_decl

#undef TARGET_BUILTIN_RECIPROCAL
#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal

#undef TARGET_C_EXCESS_PRECISION
#define TARGET_C_EXCESS_PRECISION aarch64_excess_precision

#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN aarch64_fold_builtin

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG aarch64_function_arg

#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance

#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary

#undef TARGET_FUNCTION_ARG_PADDING
#define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding

#undef TARGET_GET_RAW_RESULT_MODE
#define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
#undef TARGET_GET_RAW_ARG_MODE
#define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE aarch64_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS aarch64_init_builtins

#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
  aarch64_ira_change_pseudo_allocno_class

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p

#undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
#define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
  aarch64_legitimize_address_displacement

#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode

#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
  aarch64_libgcc_floating_mode_supported_p

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE aarch64_mangle_type

#undef TARGET_INVALID_CONVERSION
#define TARGET_INVALID_CONVERSION aarch64_invalid_conversion

#undef TARGET_INVALID_UNARY_OP
#define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op

#undef TARGET_INVALID_BINARY_OP
#define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op

#undef TARGET_VERIFY_TYPE_CONTEXT
#define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context

#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost

#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul

#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size

/* This target hook should return true if accesses to volatile bitfields
   should use the narrowest mode possible.  It should return false if these
   accesses should use the bitfield container type.  */
#undef TARGET_NARROW_VOLATILE_BITFIELD
#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE aarch64_override_options

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
  aarch64_override_options_after_change

#undef TARGET_OFFLOAD_OPTIONS
#define TARGET_OFFLOAD_OPTIONS aarch64_offload_options

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE aarch64_option_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT aarch64_option_print

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_DWARF_FRAME_REG_MODE
#define TARGET_DWARF_FRAME_REG_MODE aarch64_dwarf_frame_reg_mode

#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE aarch64_promoted_type

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SECONDARY_MEMORY_NEEDED
#define TARGET_SECONDARY_MEMORY_NEEDED aarch64_secondary_memory_needed

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_VARIABLE_ISSUE
#define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
  aarch64_get_separate_components

#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
  aarch64_components_for_bb

#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
  aarch64_disqualify_components

#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
  aarch64_emit_prologue_components

#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
  aarch64_emit_epilogue_components

#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
  aarch64_set_handled_components

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_COMPATIBLE_VECTOR_TYPES_P
#define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p

#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
  aarch64_builtin_support_vector_misalignment

#undef TARGET_ARRAY_MODE
#define TARGET_ARRAY_MODE aarch64_array_mode

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_CREATE_COSTS
#define TARGET_VECTORIZE_CREATE_COSTS aarch64_vectorize_create_costs

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
  aarch64_autovectorize_vector_modes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv

/* Section anchor support.  */

#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
#define TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST \
  aarch64_vectorize_can_special_div_by_constant

#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
  aarch64_vectorize_preferred_vector_alignment
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST \
  aarch64_vectorize_vec_perm_const

#undef TARGET_VECTORIZE_RELATED_MODE
#define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
  aarch64_empty_mask_is_expensive
#undef TARGET_PREFERRED_ELSE_VALUE
#define TARGET_PREFERRED_ELSE_VALUE \
  aarch64_preferred_else_value

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
#define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
  aarch64_dwarf_poly_indeterminate_value

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_INSN_CALLEE_ABI
#define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi

#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

#undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
#define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
  aarch64_stack_clash_protection_alloca_probe_range

#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class

#undef TARGET_SELECT_EARLY_REMAT_MODES
#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes

#undef TARGET_SPECULATION_SAFE_VALUE
#define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value

#undef TARGET_ESTIMATED_POLY_VALUE
#define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table

#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
  aarch64_simd_clone_compute_vecsize_and_simdlen

#undef TARGET_SIMD_CLONE_ADJUST
#define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust

#undef TARGET_SIMD_CLONE_USABLE
#define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable

#undef TARGET_COMP_TYPE_ATTRIBUTES
#define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes

#undef TARGET_GET_MULTILIB_ABI_NAME
#define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name

#undef TARGET_FNTYPE_ABI
#define TARGET_FNTYPE_ABI aarch64_fntype_abi

#undef TARGET_MEMTAG_CAN_TAG_ADDRESSES
#define TARGET_MEMTAG_CAN_TAG_ADDRESSES aarch64_can_tag_addresses

#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */

#undef TARGET_ASM_POST_CFI_STARTPROC
#define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc

#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true

#undef TARGET_MD_ASM_ADJUST
#define TARGET_MD_ASM_ADJUST arm_md_asm_adjust

#undef TARGET_ASM_FILE_END
#define TARGET_ASM_FILE_END aarch64_asm_file_end

#undef TARGET_ASM_FUNCTION_EPILOGUE
#define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks

#undef TARGET_HAVE_SHADOW_CALL_STACK
#define TARGET_HAVE_SHADOW_CALL_STACK true

#undef TARGET_CONST_ANCHOR
#define TARGET_CONST_ANCHOR 0x1000000

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-aarch64.h"