/* Machine description for AArch64 architecture.
   Copyright (C) 2009-2020 Free Software Foundation, Inc.
   Contributed by ARM Ltd.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#define INCLUDE_STRING
#include "coretypes.h"
#include "stringpool.h"
#include "diagnostic.h"
#include "insn-attr.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "langhooks.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "aarch64-cost-tables.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "target-globals.h"
#include "common/common-target.h"
#include "selftest-rtl.h"
#include "rtx-vector-builder.h"
#include "function-abi.h"

/* This file should be included last.  */
#include "target-def.h"
/* Defined for convenience.  */
#define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
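/* For example, POINTER_BYTES is 8 for the default LP64 ABI and 4 for
   -mabi=ilp32.  */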
/* Information about a legitimate vector immediate operand.  */
struct simd_immediate_info
{
  enum insn_type { MOV, MVN, INDEX, PTRUE };
  enum modifier_type { LSL, MSL };
  simd_immediate_info () {}
  simd_immediate_info (scalar_float_mode, rtx);
  simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
		       insn_type = MOV, modifier_type = LSL,
		       unsigned int = 0);
  simd_immediate_info (scalar_mode, rtx, rtx);
  simd_immediate_info (scalar_int_mode, aarch64_svpattern);
  /* The mode of the elements.  */
  scalar_mode elt_mode;

  /* The instruction to use to move the immediate into a vector.  */
  insn_type insn;

  union
  {
    /* For MOV and MVN.  */
    struct
    {
      /* The value of each element.  */
      rtx value;

      /* The kind of shift modifier to use, and the number of bits to shift.
	 This is (LSL, 0) if no shift is needed.  */
      modifier_type modifier;
      unsigned int shift;
    } mov;

    /* For INDEX.  */
    struct
    {
      /* The value of the first element and the step to be added for each
	 subsequent element.  */
      rtx base, step;
    } index;

    /* For PTRUE.  */
    aarch64_svpattern pattern;
  } u;
};
/* Construct a floating-point immediate in which each element has mode
   ELT_MODE_IN and value VALUE_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
  : elt_mode (elt_mode_in), insn (MOV)
{
  u.mov.value = value_in;
  u.mov.modifier = LSL;
  u.mov.shift = 0;
}
/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and value VALUE_IN.  The other parameters are as for the structure
   fields.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
		       unsigned HOST_WIDE_INT value_in,
		       insn_type insn_in, modifier_type modifier_in,
		       unsigned int shift_in)
  : elt_mode (elt_mode_in), insn (insn_in)
{
  u.mov.value = gen_int_mode (value_in, elt_mode_in);
  u.mov.modifier = modifier_in;
  u.mov.shift = shift_in;
}
/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and where element I is equal to BASE_IN + I * STEP_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
  : elt_mode (elt_mode_in), insn (INDEX)
{
  u.index.base = base_in;
  u.index.step = step_in;
}
/* Construct a predicate that controls elements of mode ELT_MODE_IN
   and has PTRUE pattern PATTERN_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
		       aarch64_svpattern pattern_in)
  : elt_mode (elt_mode_in), insn (PTRUE)
{
  u.pattern = pattern_in;
}
/* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64.  */
class pure_scalable_type_info
{
public:
  /* Represents the result of analyzing a type.  All values are nonzero,
     in the possibly forlorn hope that accidental conversions to bool
     trigger a warning.  */
  enum analysis_result
  {
    /* The type does not have an ABI identity; i.e. it doesn't contain
       at least one object whose type is a Fundamental Data Type.  */
    NO_ABI_IDENTITY = 1,

    /* The type is definitely a Pure Scalable Type.  */
    IS_PST,

    /* The type is definitely not a Pure Scalable Type.  */
    ISNT_PST,

    /* It doesn't matter for PCS purposes whether the type is a Pure
       Scalable Type or not, since the type will be handled the same
       way regardless.

       Specifically, this means that if the type is a Pure Scalable Type,
       there aren't enough argument registers to hold it, and so it will
       need to be passed or returned in memory.  If the type isn't a
       Pure Scalable Type, it's too big to be passed or returned in core
       or SIMD&FP registers, and so again will need to go in memory.  */
    DOESNT_MATTER
  };

  /* Aggregates of 17 bytes or more are normally passed and returned
     in memory, so aggregates of that size can safely be analyzed as
     DOESNT_MATTER.  We need to be able to collect enough pieces to
     represent a PST that is smaller than that.  Since predicates are
     2 bytes in size for -msve-vector-bits=128, that means we need to be
     able to store at least 8 pieces.

     We also need to be able to store enough pieces to represent
     a single vector in each vector argument register and a single
     predicate in each predicate argument register.  This means that
     we need at least 12 pieces.  */
  static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
#if __cplusplus >= 201103L
  static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
#endif
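  /* With the AAPCS64's eight SIMD&FP argument registers (V0-V7) and four
     predicate argument registers (P0-P3), MAX_PIECES evaluates to 12.  */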
  /* Describes one piece of a PST.  Each piece is one of:

     - a single Scalable Vector Type (SVT)
     - a single Scalable Predicate Type (SPT)
     - a PST containing 2, 3 or 4 SVTs, with no padding

     It either represents a single built-in type or a PST formed from
     multiple homogeneous built-in types.  */
  struct piece
  {
    rtx get_rtx (unsigned int, unsigned int) const;

    /* The number of vector and predicate registers that the piece
       occupies.  One of the two is always zero.  */
    unsigned int num_zr;
    unsigned int num_pr;

    /* The mode of the registers described above.  */
    machine_mode mode;

    /* If this piece is formed from multiple homogeneous built-in types,
       this is the mode of the built-in types, otherwise it is MODE.  */
    machine_mode orig_mode;

    /* The offset in bytes of the piece from the start of the type.  */
    poly_uint64_pod offset;
  };

  /* Divides types analyzed as IS_PST into individual pieces.  The pieces
     are in memory order.  */
  auto_vec<piece, MAX_PIECES> pieces;
  unsigned int num_zr () const;
  unsigned int num_pr () const;

  rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;

  analysis_result analyze (const_tree);
  bool analyze_registers (const_tree);

private:
  analysis_result analyze_array (const_tree);
  analysis_result analyze_record (const_tree);
  void add_piece (const piece &);
};
/* The current code model.  */
enum aarch64_code_model aarch64_cmodel;

/* The number of 64-bit elements in an SVE vector.  */
poly_uint16 aarch64_sve_vg;
#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS 1
static bool aarch64_composite_type_p (const_tree, machine_mode);
static bool aarch64_return_in_memory_1 (const_tree);
static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
						     const_tree,
						     machine_mode *, int *,
						     bool *);
static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_override_options_after_change (void);
static bool aarch64_vector_mode_supported_p (machine_mode);
static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
							 const_tree type,
							 int misalignment,
							 bool is_packed);
static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
					    aarch64_addr_query_type);
static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
/* Major revision number of the ARM Architecture implemented by the target.  */
unsigned aarch64_architecture_version;

/* The processor for which instructions should be scheduled.  */
enum aarch64_processor aarch64_tune = cortexa53;

/* Mask to specify which instruction scheduling options should be used.  */
uint64_t aarch64_tune_flags = 0;

/* Global flag for PC relative loads.  */
bool aarch64_pcrelative_literal_loads;

/* Global flag for whether frame pointer is enabled.  */
bool aarch64_use_frame_pointer;

#define BRANCH_PROTECT_STR_MAX 255
char *accepted_branch_protection_string = NULL;
static enum aarch64_parse_opt_result
aarch64_parse_branch_protection (const char*, char**);
/* Support for command line parsing of boolean flags in the tuning
   structures.  */
struct aarch64_flag_desc
{
  const char* name;
  unsigned int flag;
};

#define AARCH64_FUSION_PAIR(name, internal_name) \
  { name, AARCH64_FUSE_##internal_name },
static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
{
  { "none", AARCH64_FUSE_NOTHING },
#include "aarch64-fusion-pairs.def"
  { "all", AARCH64_FUSE_ALL },
  { NULL, AARCH64_FUSE_NOTHING }
};
#define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
  { name, AARCH64_EXTRA_TUNE_##internal_name },
static const struct aarch64_flag_desc aarch64_tuning_flags[] =
{
  { "none", AARCH64_EXTRA_TUNE_NONE },
#include "aarch64-tuning-flags.def"
  { "all", AARCH64_EXTRA_TUNE_ALL },
  { NULL, AARCH64_EXTRA_TUNE_NONE }
};
/* Tuning parameters.  */

static const struct cpu_addrcost_table generic_addrcost_table =
{
  0, /* register_offset */
  0, /* register_sextend */
  0, /* register_zextend */
};

static const struct cpu_addrcost_table exynosm1_addrcost_table =
{
  1, /* register_offset */
  1, /* register_sextend */
  2, /* register_zextend */
};

static const struct cpu_addrcost_table xgene1_addrcost_table =
{
  0, /* register_offset */
  1, /* register_sextend */
  1, /* register_zextend */
};

static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
{
  2, /* register_offset */
  3, /* register_sextend */
  3, /* register_zextend */
};

static const struct cpu_addrcost_table tsv110_addrcost_table =
{
  0, /* register_offset */
  1, /* register_sextend */
  1, /* register_zextend */
};

static const struct cpu_addrcost_table qdf24xx_addrcost_table =
{
  3, /* register_offset */
  3, /* register_sextend */
  3, /* register_zextend */
};
static const struct cpu_regmove_cost generic_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost cortexa57_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost cortexa53_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost exynosm1_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost (actual, 4 and 9).  */
};

static const struct cpu_regmove_cost thunderx_regmove_cost =
{
};

static const struct cpu_regmove_cost xgene1_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost qdf24xx_regmove_cost =
{
  /* Avoid the use of int<->fp moves for spilling.  */
};

static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
{
  /* Avoid the use of int<->fp moves for spilling.  */
};

static const struct cpu_regmove_cost tsv110_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};
/* Generic costs for vector insn classes.  */
static const struct cpu_vector_cost generic_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  1, /* scalar_load_cost */
  1, /* scalar_store_cost */
  1, /* vec_int_stmt_cost */
  1, /* vec_fp_stmt_cost */
  2, /* vec_permute_cost */
  2, /* vec_to_scalar_cost */
  1, /* scalar_to_vec_cost */
  1, /* vec_align_load_cost */
  1, /* vec_unalign_load_cost */
  1, /* vec_unalign_store_cost */
  1, /* vec_store_cost */
  3, /* cond_taken_branch_cost */
  1 /* cond_not_taken_branch_cost */
};

/* QDF24XX costs for vector insn classes.  */
static const struct cpu_vector_cost qdf24xx_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  1, /* scalar_load_cost */
  1, /* scalar_store_cost */
  1, /* vec_int_stmt_cost */
  3, /* vec_fp_stmt_cost */
  2, /* vec_permute_cost */
  1, /* vec_to_scalar_cost */
  1, /* scalar_to_vec_cost */
  1, /* vec_align_load_cost */
  1, /* vec_unalign_load_cost */
  1, /* vec_unalign_store_cost */
  1, /* vec_store_cost */
  3, /* cond_taken_branch_cost */
  1 /* cond_not_taken_branch_cost */
};

/* ThunderX costs for vector insn classes.  */
static const struct cpu_vector_cost thunderx_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  3, /* scalar_load_cost */
  1, /* scalar_store_cost */
  4, /* vec_int_stmt_cost */
  1, /* vec_fp_stmt_cost */
  4, /* vec_permute_cost */
  2, /* vec_to_scalar_cost */
  2, /* scalar_to_vec_cost */
  3, /* vec_align_load_cost */
  5, /* vec_unalign_load_cost */
  5, /* vec_unalign_store_cost */
  1, /* vec_store_cost */
  3, /* cond_taken_branch_cost */
  3 /* cond_not_taken_branch_cost */
};

static const struct cpu_vector_cost tsv110_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  5, /* scalar_load_cost */
  1, /* scalar_store_cost */
  2, /* vec_int_stmt_cost */
  2, /* vec_fp_stmt_cost */
  2, /* vec_permute_cost */
  3, /* vec_to_scalar_cost */
  2, /* scalar_to_vec_cost */
  5, /* vec_align_load_cost */
  5, /* vec_unalign_load_cost */
  1, /* vec_unalign_store_cost */
  1, /* vec_store_cost */
  1, /* cond_taken_branch_cost */
  1 /* cond_not_taken_branch_cost */
};

/* Cortex-A57 costs for vector insn classes.  */
static const struct cpu_vector_cost cortexa57_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  4, /* scalar_load_cost */
  1, /* scalar_store_cost */
  2, /* vec_int_stmt_cost */
  2, /* vec_fp_stmt_cost */
  3, /* vec_permute_cost */
  8, /* vec_to_scalar_cost */
  8, /* scalar_to_vec_cost */
  4, /* vec_align_load_cost */
  4, /* vec_unalign_load_cost */
  1, /* vec_unalign_store_cost */
  1, /* vec_store_cost */
  1, /* cond_taken_branch_cost */
  1 /* cond_not_taken_branch_cost */
};

static const struct cpu_vector_cost exynosm1_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  5, /* scalar_load_cost */
  1, /* scalar_store_cost */
  3, /* vec_int_stmt_cost */
  3, /* vec_fp_stmt_cost */
  3, /* vec_permute_cost */
  3, /* vec_to_scalar_cost */
  3, /* scalar_to_vec_cost */
  5, /* vec_align_load_cost */
  5, /* vec_unalign_load_cost */
  1, /* vec_unalign_store_cost */
  1, /* vec_store_cost */
  1, /* cond_taken_branch_cost */
  1 /* cond_not_taken_branch_cost */
};

/* X-Gene 1 costs for vector insn classes.  */
static const struct cpu_vector_cost xgene1_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  5, /* scalar_load_cost */
  1, /* scalar_store_cost */
  2, /* vec_int_stmt_cost */
  2, /* vec_fp_stmt_cost */
  2, /* vec_permute_cost */
  4, /* vec_to_scalar_cost */
  4, /* scalar_to_vec_cost */
  10, /* vec_align_load_cost */
  10, /* vec_unalign_load_cost */
  2, /* vec_unalign_store_cost */
  2, /* vec_store_cost */
  2, /* cond_taken_branch_cost */
  1 /* cond_not_taken_branch_cost */
};

/* Costs for vector insn classes for Vulcan.  */
static const struct cpu_vector_cost thunderx2t99_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  6, /* scalar_fp_stmt_cost */
  4, /* scalar_load_cost */
  1, /* scalar_store_cost */
  5, /* vec_int_stmt_cost */
  6, /* vec_fp_stmt_cost */
  10, /* vec_permute_cost */
  6, /* vec_to_scalar_cost */
  5, /* scalar_to_vec_cost */
  8, /* vec_align_load_cost */
  8, /* vec_unalign_load_cost */
  4, /* vec_unalign_store_cost */
  4, /* vec_store_cost */
  2, /* cond_taken_branch_cost */
  1 /* cond_not_taken_branch_cost */
};
/* Generic costs for branch instructions.  */
static const struct cpu_branch_cost generic_branch_cost =
{
  1,  /* Predictable.  */
  3   /* Unpredictable.  */
};
/* Generic approximation modes.  */
static const cpu_approx_modes generic_approx_modes =
{
  AARCH64_APPROX_NONE, /* division */
  AARCH64_APPROX_NONE, /* sqrt */
  AARCH64_APPROX_NONE  /* recip_sqrt */
};

/* Approximation modes for Exynos M1.  */
static const cpu_approx_modes exynosm1_approx_modes =
{
  AARCH64_APPROX_NONE, /* division */
  AARCH64_APPROX_ALL,  /* sqrt */
  AARCH64_APPROX_ALL   /* recip_sqrt */
};

/* Approximation modes for X-Gene 1.  */
static const cpu_approx_modes xgene1_approx_modes =
{
  AARCH64_APPROX_NONE, /* division */
  AARCH64_APPROX_NONE, /* sqrt */
  AARCH64_APPROX_ALL   /* recip_sqrt */
};
/* Generic prefetch settings (which disable prefetch).  */
static const cpu_prefetch_tune generic_prefetch_tune =
{
  -1,   /* l1_cache_size */
  -1,   /* l1_cache_line_size */
  -1,   /* l2_cache_size */
  true, /* prefetch_dynamic_strides */
  -1,   /* minimum_stride */
  -1    /* default_opt_level */
};

static const cpu_prefetch_tune exynosm1_prefetch_tune =
{
  -1,   /* l1_cache_size */
  64,   /* l1_cache_line_size */
  -1,   /* l2_cache_size */
  true, /* prefetch_dynamic_strides */
  -1,   /* minimum_stride */
  -1    /* default_opt_level */
};

static const cpu_prefetch_tune qdf24xx_prefetch_tune =
{
  32,    /* l1_cache_size */
  64,    /* l1_cache_line_size */
  512,   /* l2_cache_size */
  false, /* prefetch_dynamic_strides */
  2048,  /* minimum_stride */
  3      /* default_opt_level */
};

static const cpu_prefetch_tune thunderxt88_prefetch_tune =
{
  32,      /* l1_cache_size */
  128,     /* l1_cache_line_size */
  16*1024, /* l2_cache_size */
  true,    /* prefetch_dynamic_strides */
  -1,      /* minimum_stride */
  3        /* default_opt_level */
};

static const cpu_prefetch_tune thunderx_prefetch_tune =
{
  32,   /* l1_cache_size */
  128,  /* l1_cache_line_size */
  -1,   /* l2_cache_size */
  true, /* prefetch_dynamic_strides */
  -1,   /* minimum_stride */
  -1    /* default_opt_level */
};

static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
{
  32,   /* l1_cache_size */
  64,   /* l1_cache_line_size */
  256,  /* l2_cache_size */
  true, /* prefetch_dynamic_strides */
  -1,   /* minimum_stride */
  -1    /* default_opt_level */
};

static const cpu_prefetch_tune tsv110_prefetch_tune =
{
  64,   /* l1_cache_size */
  64,   /* l1_cache_line_size */
  512,  /* l2_cache_size */
  true, /* prefetch_dynamic_strides */
  -1,   /* minimum_stride */
  -1    /* default_opt_level */
};

static const cpu_prefetch_tune xgene1_prefetch_tune =
{
  32,   /* l1_cache_size */
  64,   /* l1_cache_line_size */
  256,  /* l2_cache_size */
  true, /* prefetch_dynamic_strides */
  -1,   /* minimum_stride */
  -1    /* default_opt_level */
};
static const struct tune_params generic_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
  "16:12", /* function_align.  */
  "4",     /* jump_align.  */
  "8",     /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa35_tunings =
{
  &cortexa53_extra_costs,
  &generic_addrcost_table,
  &cortexa53_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
  "16", /* function_align.  */
  "4",  /* jump_align.  */
  "8",  /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa53_tunings =
{
  &cortexa53_extra_costs,
  &generic_addrcost_table,
  &cortexa53_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
  "16", /* function_align.  */
  "4",  /* jump_align.  */
  "8",  /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &generic_prefetch_tune
};
static const struct tune_params cortexa57_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
  "16", /* function_align.  */
  "4",  /* jump_align.  */
  "8",  /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa72_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
  "16", /* function_align.  */
  "4",  /* jump_align.  */
  "8",  /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa73_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost.  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
  "16", /* function_align.  */
  "4",  /* jump_align.  */
  "8",  /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &generic_prefetch_tune
};
static const struct tune_params exynosm1_tunings =
{
  &exynosm1_extra_costs,
  &exynosm1_addrcost_table,
  &exynosm1_regmove_cost,
  &exynosm1_vector_cost,
  &generic_branch_cost,
  &exynosm1_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
  "4", /* function_align.  */
  "4", /* jump_align.  */
  "4", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  48, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &exynosm1_prefetch_tune
};

static const struct tune_params thunderxt88_tunings =
{
  &thunderx_extra_costs,
  &generic_addrcost_table,
  &thunderx_regmove_cost,
  &thunderx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  6, /* memmov_cost */
  AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
  "8", /* function_align.  */
  "8", /* jump_align.  */
  "8", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags.  */
  &thunderxt88_prefetch_tune
};

static const struct tune_params thunderx_tunings =
{
  &thunderx_extra_costs,
  &generic_addrcost_table,
  &thunderx_regmove_cost,
  &thunderx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  6, /* memmov_cost */
  AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
  "8", /* function_align.  */
  "8", /* jump_align.  */
  "8", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
   | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags.  */
  &thunderx_prefetch_tune
};

static const struct tune_params tsv110_tunings =
{
  &tsv110_extra_costs,
  &tsv110_addrcost_table,
  &tsv110_regmove_cost,
  &tsv110_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
  "16", /* function_align.  */
  "4",  /* jump_align.  */
  "8",  /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &tsv110_prefetch_tune
};
static const struct tune_params xgene1_tunings =
{
  &xgene1_extra_costs,
  &xgene1_addrcost_table,
  &xgene1_regmove_cost,
  &xgene1_vector_cost,
  &generic_branch_cost,
  &xgene1_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  6, /* memmov_cost */
  AARCH64_FUSE_NOTHING, /* fusible_ops */
  "16", /* function_align.  */
  "16", /* jump_align.  */
  "16", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  17, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags.  */
  &xgene1_prefetch_tune
};

static const struct tune_params emag_tunings =
{
  &xgene1_extra_costs,
  &xgene1_addrcost_table,
  &xgene1_regmove_cost,
  &xgene1_vector_cost,
  &generic_branch_cost,
  &xgene1_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  6, /* memmov_cost */
  AARCH64_FUSE_NOTHING, /* fusible_ops */
  "16", /* function_align.  */
  "16", /* jump_align.  */
  "16", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  17, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags.  */
  &xgene1_prefetch_tune
};

static const struct tune_params qdf24xx_tunings =
{
  &qdf24xx_extra_costs,
  &qdf24xx_addrcost_table,
  &qdf24xx_regmove_cost,
  &qdf24xx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost */
  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
  "16", /* function_align.  */
  "8",  /* jump_align.  */
  "16", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags.  */
  &qdf24xx_prefetch_tune
};

/* Tuning structure for the Qualcomm Saphira core.  Default to falkor values
   for anything missing.  */
static const struct tune_params saphira_tunings =
{
  &generic_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost */
  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
  "16", /* function_align.  */
  "8",  /* jump_align.  */
  "16", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &generic_prefetch_tune
};
static const struct tune_params thunderx2t99_tunings =
{
  &thunderx2t99_extra_costs,
  &thunderx2t99_addrcost_table,
  &thunderx2t99_regmove_cost,
  &thunderx2t99_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost.  */
  4, /* issue_rate.  */
  (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
  "16", /* function_align.  */
  "8",  /* jump_align.  */
  "16", /* loop_align.  */
  3, /* int_reassoc_width.  */
  2, /* fp_reassoc_width.  */
  2, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &thunderx2t99_prefetch_tune
};

static const struct tune_params neoversen1_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
  "32:16", /* function_align.  */
  "4",     /* jump_align.  */
  "32:16", /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  2, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &generic_prefetch_tune
};
/* Support for fine-grained override of the tuning structures.  */
struct aarch64_tuning_override_function
{
  const char* name;
  void (*parse_override)(const char*, struct tune_params*);
};

static void aarch64_parse_fuse_string (const char*, struct tune_params*);
static void aarch64_parse_tune_string (const char*, struct tune_params*);
static void aarch64_parse_sve_width_string (const char*, struct tune_params*);

static const struct aarch64_tuning_override_function
  aarch64_tuning_override_functions[] =
{
  { "fuse", aarch64_parse_fuse_string },
  { "tune", aarch64_parse_tune_string },
  { "sve_width", aarch64_parse_sve_width_string },
  { NULL, NULL }
};
/* A processor implementing AArch64.  */
struct processor
{
  const char *const name;
  enum aarch64_processor ident;
  enum aarch64_processor sched_core;
  enum aarch64_arch arch;
  unsigned architecture_version;
  const uint64_t flags;
  const struct tune_params *const tune;
};
/* Architectures implementing AArch64.  */
static const struct processor all_architectures[] =
{
#define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
  {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
#include "aarch64-arches.def"
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
};
/* Processor cores implementing AArch64.  */
static const struct processor all_cores[] =
{
#define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
  {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
   all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
   FLAGS, &COSTS##_tunings},
#include "aarch64-cores.def"
  {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
   AARCH64_FL_FOR_ARCH8, &generic_tunings},
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
};
/* Target specification.  These are populated by the -march, -mtune, -mcpu
   handling code or by target attributes.  */
static const struct processor *selected_arch;
static const struct processor *selected_cpu;
static const struct processor *selected_tune;

enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;

/* The current tuning set.  */
struct tune_params aarch64_tune_params = generic_tunings;
/* Check whether an 'aarch64_vector_pcs' attribute is valid.  */

static tree
handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
				     int, bool *no_add_attrs)
{
  /* Since we set fn_type_req to true, the caller should have checked
     this for us.  */
  gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
  switch ((arm_pcs) fntype_abi (*node).id ())
    {
    case ARM_PCS_AAPCS64:
      return NULL_TREE;

    case ARM_PCS_SVE:
      error ("the %qE attribute cannot be applied to an SVE function type",
	     name);
      *no_add_attrs = true;
      return NULL_TREE;

    case ARM_PCS_TLSDESC:
    case ARM_PCS_UNKNOWN:
      break;
    }
  gcc_unreachable ();
}
/* Table of machine attributes.  */
static const struct attribute_spec aarch64_attribute_table[] =
{
  /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
       affects_type_identity, handler, exclude } */
  { "aarch64_vector_pcs", 0, 0, false, true, true, true,
    handle_aarch64_vector_pcs_attribute, NULL },
  { "arm_sve_vector_bits", 1, 1, false, true, false, true,
    aarch64_sve::handle_arm_sve_vector_bits_attribute,
    NULL },
  { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
  { "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL },
  { NULL, 0, 0, false, false, false, false, NULL, NULL }
};
#define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
/* An ISA extension in the co-processor and main instruction set space.  */
struct aarch64_option_extension
{
  const char *const name;
  const unsigned long flags_on;
  const unsigned long flags_off;
};
typedef enum aarch64_cond_code
{
  AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
  AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
  AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
}
aarch64_cc;

#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
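/* Flipping the low bit maps each condition code to its logical inverse,
   e.g. AARCH64_EQ (0) <-> AARCH64_NE (1) and AARCH64_GE <-> AARCH64_LT.  */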
struct aarch64_branch_protect_type
{
  /* The type's name that the user passes to the branch-protection option
     string.  */
  const char* name;
  /* Function to handle the protection type and set global variables.
     First argument is the string token corresponding with this type and the
     second argument is the next token in the option string.
     Return values:
     * AARCH64_PARSE_OK: Handling was successful.
     * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
       should print an error.
     * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
       own error.  */
  enum aarch64_parse_opt_result (*handler)(char*, char*);
  /* A list of types that can follow this type in the option string.  */
  const aarch64_branch_protect_type* subtypes;
  unsigned int num_subtypes;
};
static enum aarch64_parse_opt_result
aarch64_handle_no_branch_protection (char* str, char* rest)
{
  aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
  aarch64_enable_bti = 0;
  if (rest)
    {
      error ("unexpected %<%s%> after %<%s%>", rest, str);
      return AARCH64_PARSE_INVALID_FEATURE;
    }
  return AARCH64_PARSE_OK;
}

static enum aarch64_parse_opt_result
aarch64_handle_standard_branch_protection (char* str, char* rest)
{
  aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
  aarch64_ra_sign_key = AARCH64_KEY_A;
  aarch64_enable_bti = 1;
  if (rest)
    {
      error ("unexpected %<%s%> after %<%s%>", rest, str);
      return AARCH64_PARSE_INVALID_FEATURE;
    }
  return AARCH64_PARSE_OK;
}

static enum aarch64_parse_opt_result
aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
				   char* rest ATTRIBUTE_UNUSED)
{
  aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
  aarch64_ra_sign_key = AARCH64_KEY_A;
  return AARCH64_PARSE_OK;
}

static enum aarch64_parse_opt_result
aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
			     char* rest ATTRIBUTE_UNUSED)
{
  aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
  return AARCH64_PARSE_OK;
}

static enum aarch64_parse_opt_result
aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
			      char* rest ATTRIBUTE_UNUSED)
{
  aarch64_ra_sign_key = AARCH64_KEY_B;
  return AARCH64_PARSE_OK;
}

static enum aarch64_parse_opt_result
aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
			       char* rest ATTRIBUTE_UNUSED)
{
  aarch64_enable_bti = 1;
  return AARCH64_PARSE_OK;
}
static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
  { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
  { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
  { NULL, NULL, NULL, 0 }
};

static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
  { "none", aarch64_handle_no_branch_protection, NULL, 0 },
  { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
  { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
    ARRAY_SIZE (aarch64_pac_ret_subtypes) },
  { "bti", aarch64_handle_bti_protection, NULL, 0 },
  { NULL, NULL, NULL, 0 }
};
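/* For example, -mbranch-protection=pac-ret+leaf matches the "pac-ret" entry
   above and then the "leaf" entry in aarch64_pac_ret_subtypes.  */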
/* The condition codes of the processor, and the inverse function.  */
static const char * const aarch64_condition_codes[] =
{
  "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
  "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
};

/* The preferred condition codes for SVE conditions.  */
static const char *const aarch64_sve_condition_codes[] =
{
  "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
  "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
};
/* Return the assembly token for svpattern value VALUE.  */

static const char *
svpattern_token (enum aarch64_svpattern pattern)
{
  switch (pattern)
    {
#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
    AARCH64_FOR_SVPATTERN (CASE)
#undef CASE
    case AARCH64_NUM_SVPATTERNS:
      break;
    }
  gcc_unreachable ();
}
/* Return the location of a piece that is known to be passed or returned
   in registers.  FIRST_ZR is the first unused vector argument register
   and FIRST_PR is the first unused predicate argument register.  */

rtx
pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
					 unsigned int first_pr) const
{
  gcc_assert (VECTOR_MODE_P (mode)
	      && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
	      && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);

  if (num_zr > 0 && num_pr == 0)
    return gen_rtx_REG (mode, first_zr);

  if (num_zr == 0 && num_pr == 1)
    return gen_rtx_REG (mode, first_pr);

  gcc_unreachable ();
}
/* Return the total number of vector registers required by the PST.  */

unsigned int
pure_scalable_type_info::num_zr () const
{
  unsigned int res = 0;
  for (unsigned int i = 0; i < pieces.length (); ++i)
    res += pieces[i].num_zr;
  return res;
}

/* Return the total number of predicate registers required by the PST.  */

unsigned int
pure_scalable_type_info::num_pr () const
{
  unsigned int res = 0;
  for (unsigned int i = 0; i < pieces.length (); ++i)
    res += pieces[i].num_pr;
  return res;
}
/* Return the location of a PST that is known to be passed or returned
   in registers.  FIRST_ZR is the first unused vector argument register
   and FIRST_PR is the first unused predicate argument register.  */

rtx
pure_scalable_type_info::get_rtx (machine_mode mode,
				  unsigned int first_zr,
				  unsigned int first_pr) const
{
  /* Try to return a single REG if possible.  This leads to better
     code generation; it isn't required for correctness.  */
  if (mode == pieces[0].mode)
    {
      gcc_assert (pieces.length () == 1);
      return pieces[0].get_rtx (first_zr, first_pr);
    }

  /* Build up a PARALLEL that contains the individual pieces.  */
  rtvec rtxes = rtvec_alloc (pieces.length ());
  for (unsigned int i = 0; i < pieces.length (); ++i)
    {
      rtx reg = pieces[i].get_rtx (first_zr, first_pr);
      rtx offset = gen_int_mode (pieces[i].offset, Pmode);
      RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
      first_zr += pieces[i].num_zr;
      first_pr += pieces[i].num_pr;
    }
  return gen_rtx_PARALLEL (mode, rtxes);
}
/* Analyze whether TYPE is a Pure Scalable Type according to the rules
   of the AAPCS64.  */

pure_scalable_type_info::analysis_result
pure_scalable_type_info::analyze (const_tree type)
{
  /* Prevent accidental reuse.  */
  gcc_assert (pieces.is_empty ());

  /* No code will be generated for erroneous types, so we won't establish
     an ABI mapping.  */
  if (type == error_mark_node)
    return NO_ABI_IDENTITY;

  /* Zero-sized types disappear in the language->ABI mapping.  */
  if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
    return NO_ABI_IDENTITY;

  /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs.  */
  piece p = {};
  if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
    {
      machine_mode mode = TYPE_MODE_RAW (type);
      gcc_assert (VECTOR_MODE_P (mode)
		  && (!TARGET_SVE || aarch64_sve_mode_p (mode)));

      p.mode = p.orig_mode = mode;
      add_piece (p);
      return IS_PST;
    }

  /* Check for user-defined PSTs.  */
  if (TREE_CODE (type) == ARRAY_TYPE)
    return analyze_array (type);
  if (TREE_CODE (type) == RECORD_TYPE)
    return analyze_record (type);

  return ISNT_PST;
}
/* Analyze a type that is known not to be passed or returned in memory.
   Return true if it has an ABI identity and is a Pure Scalable Type.  */

bool
pure_scalable_type_info::analyze_registers (const_tree type)
{
  analysis_result result = analyze (type);
  gcc_assert (result != DOESNT_MATTER);
  return result == IS_PST;
}
/* Subroutine of analyze for handling ARRAY_TYPEs.  */

pure_scalable_type_info::analysis_result
pure_scalable_type_info::analyze_array (const_tree type)
{
  /* Analyze the element type.  */
  pure_scalable_type_info element_info;
  analysis_result result = element_info.analyze (TREE_TYPE (type));
  if (result != IS_PST)
    return result;

  /* An array of unknown, flexible or variable length will be passed and
     returned by reference whatever we do.  */
  tree nelts_minus_one = array_type_nelts (type);
  if (!tree_fits_uhwi_p (nelts_minus_one))
    return DOESNT_MATTER;

  /* Likewise if the array is constant-sized but too big to be interesting.
     The double checks against MAX_PIECES are to protect against overflow.  */
  unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
  if (count > MAX_PIECES)
    return DOESNT_MATTER;
  count += 1;
  if (count * element_info.pieces.length () > MAX_PIECES)
    return DOESNT_MATTER;

  /* The above checks should have weeded out elements of unknown size.  */
  poly_uint64 element_bytes;
  if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
    gcc_unreachable ();

  /* Build up the list of individual vectors and predicates.  */
  gcc_assert (!element_info.pieces.is_empty ());
  for (unsigned int i = 0; i < count; ++i)
    for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
      {
	piece p = element_info.pieces[j];
	p.offset += i * element_bytes;
	add_piece (p);
      }
  return IS_PST;
}
/* Subroutine of analyze for handling RECORD_TYPEs.  */

pure_scalable_type_info::analysis_result
pure_scalable_type_info::analyze_record (const_tree type)
{
  for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
    {
      if (TREE_CODE (field) != FIELD_DECL)
	continue;

      /* Zero-sized fields disappear in the language->ABI mapping.  */
      if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
	continue;

      /* All fields with an ABI identity must be PSTs for the record as
	 a whole to be a PST.  If any individual field is too big to be
	 interesting then the record is too.  */
      pure_scalable_type_info field_info;
      analysis_result subresult = field_info.analyze (TREE_TYPE (field));
      if (subresult == NO_ABI_IDENTITY)
	continue;
      if (subresult != IS_PST)
	return subresult;

      /* Since all previous fields are PSTs, we ought to be able to track
	 the field offset using poly_ints.  */
      tree bitpos = bit_position (field);
      gcc_assert (poly_int_tree_p (bitpos));

      /* For the same reason, it shouldn't be possible to create a PST field
	 whose offset isn't byte-aligned.  */
      poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
						BITS_PER_UNIT);

      /* Punt if the record is too big to be interesting.  */
      poly_uint64 bytepos;
      if (!wide_bytepos.to_uhwi (&bytepos)
	  || pieces.length () + field_info.pieces.length () > MAX_PIECES)
	return DOESNT_MATTER;

      /* Add the individual vectors and predicates in the field to the
	 record's list.  */
      gcc_assert (!field_info.pieces.is_empty ());
      for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
	{
	  piece p = field_info.pieces[i];
	  p.offset += bytepos;
	  add_piece (p);
	}
    }

  /* Empty structures disappear in the language->ABI mapping.  */
  return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
}
/* Add P to the list of pieces in the type.  */

void
pure_scalable_type_info::add_piece (const piece &p)
{
  /* Try to fold the new piece into the previous one to form a
     single-mode PST.  For example, if we see three consecutive vectors
     of the same mode, we can represent them using the corresponding
     3-tuple mode.

     This is purely an optimization.  */
  if (!pieces.is_empty ())
    {
      piece &prev = pieces.last ();
      gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
      unsigned int nelems1, nelems2;
      if (prev.orig_mode == p.orig_mode
	  && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
	  && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
				  GET_MODE_NUNITS (p.orig_mode), &nelems1)
	  && constant_multiple_p (GET_MODE_NUNITS (p.mode),
				  GET_MODE_NUNITS (p.orig_mode), &nelems2)
	  && targetm.array_mode (p.orig_mode,
				 nelems1 + nelems2).exists (&prev.mode))
	{
	  prev.num_zr += p.num_zr;
	  prev.num_pr += p.num_pr;
	  return;
	}
    }
  pieces.quick_push (p);
}
/* Return true if at least one possible value of type TYPE includes at
   least one object of Pure Scalable Type, in the sense of the AAPCS64.

   This is a relatively expensive test for some types, so it should
   generally be made as late as possible.  */

static bool
aarch64_some_values_include_pst_objects_p (const_tree type)
{
  if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
    return false;

  if (aarch64_sve::builtin_type_p (type))
    return true;

  if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
    return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));

  if (RECORD_OR_UNION_TYPE_P (type))
    for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
      if (TREE_CODE (field) == FIELD_DECL
	  && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
	return true;

  return false;
}
/* Return the descriptor of the SIMD ABI.  */

static const predefined_function_abi &
aarch64_simd_abi (void)
{
  predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
  if (!simd_abi.initialized_p ())
    {
      HARD_REG_SET full_reg_clobbers
	= default_function_abi.full_reg_clobbers ();
      for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
	if (FP_SIMD_SAVED_REGNUM_P (regno))
	  CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
      simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
    }
  return simd_abi;
}
/* Return the descriptor of the SVE PCS.  */

static const predefined_function_abi &
aarch64_sve_abi (void)
{
  predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
  if (!sve_abi.initialized_p ())
    {
      HARD_REG_SET full_reg_clobbers
	= default_function_abi.full_reg_clobbers ();
      for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
	CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
      for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
	CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
      sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
    }
  return sve_abi;
}
/* Generate code to enable conditional branches in functions over 1 MiB.  */
const char *
aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
			const char * branch_format)
{
  rtx_code_label * tmp_label = gen_label_rtx ();
  char label_buf[256];
  char buffer[32];
  ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
			       CODE_LABEL_NUMBER (tmp_label));
  const char *label_ptr = targetm.strip_name_encoding (label_buf);
  rtx dest_label = operands[pos_label];
  operands[pos_label] = tmp_label;

  snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
  output_asm_insn (buffer, operands);

  snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
  operands[pos_label] = dest_label;
  output_asm_insn (buffer, operands);
  return "";
}
void
aarch64_err_no_fpadvsimd (machine_mode mode)
{
  if (TARGET_GENERAL_REGS_ONLY)
    {
      if (FLOAT_MODE_P (mode))
	error ("%qs is incompatible with the use of floating-point types",
	       "-mgeneral-regs-only");
      else
	error ("%qs is incompatible with the use of vector types",
	       "-mgeneral-regs-only");
    }
  else if (FLOAT_MODE_P (mode))
    error ("%qs feature modifier is incompatible with the use of"
	   " floating-point types", "+nofp");
  else
    error ("%qs feature modifier is incompatible with the use of"
	   " vector types", "+nofp");
}
/* Report when we try to do something that requires SVE when SVE is disabled.
   This is an error of last resort and isn't very high-quality.  It usually
   involves attempts to measure the vector length in some way.  */

static void
aarch64_report_sve_required (void)
{
  static bool reported_p = false;

  /* Avoid reporting a slew of messages for a single oversight.  */
  if (reported_p)
    return;

  error ("this operation requires the SVE ISA extension");
  inform (input_location, "you can enable SVE using the command-line"
	  " option %<-march%>, or by using the %<target%>"
	  " attribute or pragma");
  reported_p = true;
}
/* Return true if REGNO is P0-P15 or one of the special FFR-related
   registers.  */
inline bool
pr_or_ffr_regnum_p (unsigned int regno)
{
  return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
}
/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
   The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
   GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
   higher cost.  POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
   and GENERAL_REGS is lower than the memory cost (in this case the best class
   is the lowest cost one).  Using POINTER_AND_FP_REGS irrespectively of its
   cost results in bad allocations with many redundant int<->FP moves which
   are expensive on various cores.
   To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
   force a decision between FP_REGS and GENERAL_REGS.  We use the allocno class
   if it isn't POINTER_AND_FP_REGS.  Similarly, use the best class if it isn't
   POINTER_AND_FP_REGS.  Otherwise set the allocno class depending on the mode.
   The result of this is that it is no longer inefficient to have a higher
   memory move cost than the register move cost.  */

static reg_class_t
aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
					 reg_class_t best_class)
{
  machine_mode mode;

  if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
      || !reg_class_subset_p (FP_REGS, allocno_class))
    return allocno_class;

  if (!reg_class_subset_p (GENERAL_REGS, best_class)
      || !reg_class_subset_p (FP_REGS, best_class))
    return best_class;

  mode = PSEUDO_REGNO_MODE (regno);
  return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
}
static unsigned int
aarch64_min_divisions_for_recip_mul (machine_mode mode)
{
  if (GET_MODE_UNIT_SIZE (mode) == 4)
    return aarch64_tune_params.min_div_recip_mul_sf;
  return aarch64_tune_params.min_div_recip_mul_df;
}
/* Return the reassociation width of treeop OPC with mode MODE.  */
static int
aarch64_reassociation_width (unsigned opc, machine_mode mode)
{
  if (VECTOR_MODE_P (mode))
    return aarch64_tune_params.vec_reassoc_width;
  if (INTEGRAL_MODE_P (mode))
    return aarch64_tune_params.int_reassoc_width;
  /* Avoid reassociating floating point addition so we emit more FMAs.  */
  if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
    return aarch64_tune_params.fp_reassoc_width;
  return 1;
}
/* Provide a mapping from gcc register numbers to dwarf register numbers.  */
unsigned
aarch64_dbx_register_number (unsigned regno)
{
  if (GP_REGNUM_P (regno))
    return AARCH64_DWARF_R0 + regno - R0_REGNUM;
  else if (regno == SP_REGNUM)
    return AARCH64_DWARF_SP;
  else if (FP_REGNUM_P (regno))
    return AARCH64_DWARF_V0 + regno - V0_REGNUM;
  else if (PR_REGNUM_P (regno))
    return AARCH64_DWARF_P0 + regno - P0_REGNUM;
  else if (regno == VG_REGNUM)
    return AARCH64_DWARF_VG;

  /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
     equivalent DWARF register.  */
  return DWARF_FRAME_REGISTERS;
}
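/* For reference, the AArch64 DWARF numbering used above assigns 0-30 to
   x0-x30, 31 to sp, 48-63 to p0-p15 and 64-95 to v0-v31.  */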
/* If X is a CONST_DOUBLE, return its bit representation as a constant
   integer, otherwise return X unmodified.  */
static rtx
aarch64_bit_representation (rtx x)
{
  if (CONST_DOUBLE_P (x))
    x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
  return x;
}
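/* For example, aarch64_bit_representation of a CONST_DOUBLE holding 1.0 in
   SFmode yields the SImode constant 0x3f800000, its IEEE single-precision
   bit pattern.  */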
/* Return true if MODE is any of the Advanced SIMD structure modes.  */
static bool
aarch64_advsimd_struct_mode_p (machine_mode mode)
{
  return (TARGET_SIMD
	  && (mode == OImode || mode == CImode || mode == XImode));
}
/* Return true if MODE is an SVE predicate mode.  */
static bool
aarch64_sve_pred_mode_p (machine_mode mode)
{
  return (TARGET_SVE
	  && (mode == VNx16BImode
	      || mode == VNx8BImode
	      || mode == VNx4BImode
	      || mode == VNx2BImode));
}
/* Three mutually-exclusive flags describing a vector or predicate type.  */
const unsigned int VEC_ADVSIMD  = 1;
const unsigned int VEC_SVE_DATA = 2;
const unsigned int VEC_SVE_PRED = 4;
/* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
   a structure of 2, 3 or 4 vectors.  */
const unsigned int VEC_STRUCT   = 8;
/* Can be used in combination with VEC_SVE_DATA to indicate that the
   vector has fewer significant bytes than a full SVE vector.  */
const unsigned int VEC_PARTIAL  = 16;
/* Useful combinations of the above.  */
const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
/* Return a set of flags describing the vector properties of mode MODE.
   Ignore modes that are not supported by the current target.  */
static unsigned int
aarch64_classify_vector_mode (machine_mode mode)
{
  if (aarch64_advsimd_struct_mode_p (mode))
    return VEC_ADVSIMD | VEC_STRUCT;

  if (aarch64_sve_pred_mode_p (mode))
    return VEC_SVE_PRED;

  /* Make the decision based on the mode's enum value rather than its
     properties, so that we keep the correct classification regardless
     of -msve-vector-bits.  */
  switch (mode)
    {
    /* Partial SVE QI vectors.  */
    case E_VNx8QImode:
    case E_VNx4QImode:
    case E_VNx2QImode:
    /* Partial SVE HI vectors.  */
    case E_VNx4HImode:
    case E_VNx2HImode:
    /* Partial SVE SI vector.  */
    case E_VNx2SImode:
    /* Partial SVE HF vectors.  */
    case E_VNx4HFmode:
    case E_VNx2HFmode:
    /* Partial SVE SF vector.  */
    case E_VNx2SFmode:
      return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;

    case E_VNx16QImode:
    case E_VNx8HImode:
    case E_VNx4SImode:
    case E_VNx2DImode:
    case E_VNx8BFmode:
    case E_VNx8HFmode:
    case E_VNx4SFmode:
    case E_VNx2DFmode:
      return TARGET_SVE ? VEC_SVE_DATA : 0;

    /* x2 SVE vectors.  */
    case E_VNx32QImode:
    case E_VNx16HImode:
    case E_VNx8SImode:
    case E_VNx4DImode:
    case E_VNx16BFmode:
    case E_VNx16HFmode:
    case E_VNx8SFmode:
    case E_VNx4DFmode:
    /* x3 SVE vectors.  */
    case E_VNx48QImode:
    case E_VNx24HImode:
    case E_VNx12SImode:
    case E_VNx6DImode:
    case E_VNx24BFmode:
    case E_VNx24HFmode:
    case E_VNx12SFmode:
    case E_VNx6DFmode:
    /* x4 SVE vectors.  */
    case E_VNx64QImode:
    case E_VNx32HImode:
    case E_VNx16SImode:
    case E_VNx8DImode:
    case E_VNx32BFmode:
    case E_VNx32HFmode:
    case E_VNx16SFmode:
    case E_VNx8DFmode:
      return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;

    /* 64-bit Advanced SIMD vectors.  */
    case E_V8QImode:
    case E_V4HImode:
    case E_V2SImode:
    /* ...E_V1DImode doesn't exist.  */
    case E_V4HFmode:
    case E_V4BFmode:
    case E_V2SFmode:
    case E_V1DFmode:
    /* 128-bit Advanced SIMD vectors.  */
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V8HFmode:
    case E_V8BFmode:
    case E_V4SFmode:
    case E_V2DFmode:
      return TARGET_SIMD ? VEC_ADVSIMD : 0;

    default:
      return 0;
    }
}
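/* Illustrative examples of the classification above: V4SImode is
   VEC_ADVSIMD, OImode (an Advanced SIMD 2-vector tuple) is
   VEC_ADVSIMD | VEC_STRUCT, VNx4SImode is VEC_SVE_DATA, VNx2SImode is
   VEC_SVE_DATA | VEC_PARTIAL, VNx8SImode is VEC_SVE_DATA | VEC_STRUCT
   and VNx4BImode is VEC_SVE_PRED, assuming the relevant target features
   are enabled.  */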
/* Return true if MODE is any of the data vector modes, including
   structure modes.  */
static bool
aarch64_vector_data_mode_p (machine_mode mode)
{
  return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
}

/* Return true if MODE is any form of SVE mode, including predicates,
   vectors and structures.  */
bool
aarch64_sve_mode_p (machine_mode mode)
{
  return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
}

/* Return true if MODE is an SVE data vector mode; either a single vector
   or a structure of vectors.  */
static bool
aarch64_sve_data_mode_p (machine_mode mode)
{
  return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
}

/* Return the number of defined bytes in one constituent vector of
   SVE mode MODE, which has vector flags VEC_FLAGS.  */
static poly_uint64
aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
{
  if (vec_flags & VEC_PARTIAL)
    /* A single partial vector.  */
    return GET_MODE_SIZE (mode);

  if (vec_flags & VEC_SVE_DATA)
    /* A single vector or a tuple.  */
    return BYTES_PER_SVE_VECTOR;

  /* A single predicate.  */
  gcc_assert (vec_flags & VEC_SVE_PRED);
  return BYTES_PER_SVE_PRED;
}

/* Implement target hook TARGET_ARRAY_MODE.  */
static opt_machine_mode
aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
{
  if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
      && IN_RANGE (nelems, 2, 4))
    return mode_for_vector (GET_MODE_INNER (mode),
			    GET_MODE_NUNITS (mode) * nelems);

  return opt_machine_mode ();
}
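/* For example, with SVE enabled, aarch64_array_mode (VNx4SImode, 3) should
   return VNx12SImode, the mode used for an array of three svint32_t
   vectors.  */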
/* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
static bool
aarch64_array_mode_supported_p (machine_mode mode,
				unsigned HOST_WIDE_INT nelems)
{
  if (TARGET_SIMD
      && (AARCH64_VALID_SIMD_QREG_MODE (mode)
	  || AARCH64_VALID_SIMD_DREG_MODE (mode))
      && (nelems >= 2 && nelems <= 4))
    return true;

  return false;
}

/* MODE is some form of SVE vector mode.  For data modes, return the number
   of vector register bits that each element of MODE occupies, such as 64
   for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
   in a 64-bit container).  For predicate modes, return the number of
   data bits controlled by each significant predicate bit.  */

static unsigned int
aarch64_sve_container_bits (machine_mode mode)
{
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
			     ? BITS_PER_SVE_VECTOR
			     : GET_MODE_BITSIZE (mode));
  return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
}
/* Return the SVE predicate mode to use for elements that have
   ELEM_NBYTES bytes, if such a mode exists.  */

opt_machine_mode
aarch64_sve_pred_mode (unsigned int elem_nbytes)
{
  if (TARGET_SVE)
    {
      if (elem_nbytes == 1)
	return VNx16BImode;
      if (elem_nbytes == 2)
	return VNx8BImode;
      if (elem_nbytes == 4)
	return VNx4BImode;
      if (elem_nbytes == 8)
	return VNx2BImode;
    }
  return opt_machine_mode ();
}

/* Return the SVE predicate mode that should be used to control
   MODE.  */

machine_mode
aarch64_sve_pred_mode (machine_mode mode)
{
  unsigned int bits = aarch64_sve_container_bits (mode);
  return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
}

/* Implement TARGET_VECTORIZE_GET_MASK_MODE.  */

static opt_machine_mode
aarch64_get_mask_mode (machine_mode mode)
{
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if (vec_flags & VEC_SVE_DATA)
    return aarch64_sve_pred_mode (mode);

  return default_get_mask_mode (mode);
}
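/* For example, the mask mode for VNx4SImode is VNx4BImode; Advanced SIMD
   modes simply fall through to default_get_mask_mode.  */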
/* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE.  */

opt_machine_mode
aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
{
  enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
			    ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
  machine_mode mode;
  FOR_EACH_MODE_IN_CLASS (mode, mclass)
    if (inner_mode == GET_MODE_INNER (mode)
	&& known_eq (nunits, GET_MODE_NUNITS (mode))
	&& aarch64_sve_data_mode_p (mode))
      return mode;
  return opt_machine_mode ();
}

/* Return the integer element mode associated with SVE mode MODE.  */

static scalar_int_mode
aarch64_sve_element_int_mode (machine_mode mode)
{
  poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
			     ? BITS_PER_SVE_VECTOR
			     : GET_MODE_BITSIZE (mode));
  unsigned int elt_bits = vector_element_size (vector_bits,
					       GET_MODE_NUNITS (mode));
  return int_mode_for_size (elt_bits, 0).require ();
}

/* Return an integer element mode that contains exactly
   aarch64_sve_container_bits (MODE) bits.  This is wider than
   aarch64_sve_element_int_mode if MODE is a partial vector,
   otherwise it's the same.  */

static scalar_int_mode
aarch64_sve_container_int_mode (machine_mode mode)
{
  return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
}

/* Return the integer vector mode associated with SVE mode MODE.
   Unlike related_int_vector_mode, this can handle the case in which
   MODE is a predicate (and thus has a different total size).  */

machine_mode
aarch64_sve_int_mode (machine_mode mode)
{
  scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
  return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
}
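/* For example, aarch64_sve_int_mode (VNx4SFmode) is VNx4SImode, and for
   the predicate mode VNx16BImode the result is VNx16QImode, since each
   predicate bit controls one byte of data.  */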
/* Implement TARGET_VECTORIZE_RELATED_MODE.  */

static opt_machine_mode
aarch64_vectorize_related_mode (machine_mode vector_mode,
				scalar_mode element_mode,
				poly_uint64 nunits)
{
  unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);

  /* If we're operating on SVE vectors, try to return an SVE mode.  */
  poly_uint64 sve_nunits;
  if ((vec_flags & VEC_SVE_DATA)
      && multiple_p (BYTES_PER_SVE_VECTOR,
		     GET_MODE_SIZE (element_mode), &sve_nunits))
    {
      machine_mode sve_mode;
      if (maybe_ne (nunits, 0U))
	{
	  /* Try to find a full or partial SVE mode with exactly
	     NUNITS units.  */
	  if (multiple_p (sve_nunits, nunits)
	      && aarch64_sve_data_mode (element_mode,
					nunits).exists (&sve_mode))
	    return sve_mode;
	}
      else
	{
	  /* Take the preferred number of units from the number of bytes
	     that fit in VECTOR_MODE.  We always start by "autodetecting"
	     a full vector mode with preferred_simd_mode, so vectors
	     chosen here will also be full vector modes.  Then
	     autovectorize_vector_modes tries smaller starting modes
	     and thus smaller preferred numbers of units.  */
	  sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
	  if (aarch64_sve_data_mode (element_mode,
				     sve_nunits).exists (&sve_mode))
	    return sve_mode;
	}
    }

  /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors.  */
  if ((vec_flags & VEC_ADVSIMD)
      && known_eq (nunits, 0U)
      && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
      && maybe_ge (GET_MODE_BITSIZE (element_mode)
		   * GET_MODE_NUNITS (vector_mode), 128U))
    {
      machine_mode res = aarch64_simd_container_mode (element_mode, 128);
      if (VECTOR_MODE_P (res))
	return res;
    }

  return default_vectorize_related_mode (vector_mode, element_mode, nunits);
}

/* Implement TARGET_PREFERRED_ELSE_VALUE.  For binary operations,
   prefer to use the first arithmetic operand as the else value if
   the else value doesn't matter, since that exactly matches the SVE
   destructive merging form.  For ternary operations we could either
   pick the first operand and use FMAD-like instructions or the last
   operand and use FMLA-like instructions; the latter seems more
   natural.  */

static tree
aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
{
  return nops == 3 ? ops[2] : ops[0];
}
/* Implement TARGET_HARD_REGNO_NREGS.  */

static unsigned int
aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
{
  /* ??? Logically we should only need to provide a value when
     HARD_REGNO_MODE_OK says that the combination is valid,
     but at the moment we need to handle all modes.  Just ignore
     any runtime parts for registers that can't store them.  */
  HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
  switch (aarch64_regno_regclass (regno))
    {
    case FP_REGS:
    case FP_LO_REGS:
    case FP_LO8_REGS:
      {
	unsigned int vec_flags = aarch64_classify_vector_mode (mode);
	if (vec_flags & VEC_SVE_DATA)
	  return exact_div (GET_MODE_SIZE (mode),
			    aarch64_vl_bytes (mode, vec_flags)).to_constant ();
	return CEIL (lowest_size, UNITS_PER_VREG);
      }
    case PR_REGS:
    case PR_LO_REGS:
    case PR_HI_REGS:
    case FFR_REGS:
    case PR_AND_FFR_REGS:
      return 1;
    default:
      return CEIL (lowest_size, UNITS_PER_WORD);
    }
  gcc_unreachable ();
}
/* Implement TARGET_HARD_REGNO_MODE_OK.  */

static bool
aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
{
  if (GET_MODE_CLASS (mode) == MODE_CC)
    return regno == CC_REGNUM;

  if (regno == VG_REGNUM)
    /* This must have the same size as _Unwind_Word.  */
    return mode == DImode;

  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if (vec_flags & VEC_SVE_PRED)
    return pr_or_ffr_regnum_p (regno);

  if (pr_or_ffr_regnum_p (regno))
    return false;

  if (regno == SP_REGNUM)
    /* The purpose of comparing with ptr_mode is to support the
       global register variable associated with the stack pointer
       register via the syntax of asm ("wsp") in ILP32.  */
    return mode == Pmode || mode == ptr_mode;

  if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
    return mode == Pmode;

  if (GP_REGNUM_P (regno))
    {
      if (vec_flags & VEC_ANY_SVE)
	return false;
      if (known_le (GET_MODE_SIZE (mode), 8))
	return true;
      if (known_le (GET_MODE_SIZE (mode), 16))
	return (regno & 1) == 0;
    }
  else if (FP_REGNUM_P (regno))
    {
      if (vec_flags & VEC_STRUCT)
	return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
      else
	return !VECTOR_MODE_P (mode) || vec_flags != 0;
    }

  return false;
}
/* Return true if a function with type FNTYPE returns its value in
   SVE vector or predicate registers.  */

static bool
aarch64_returns_value_in_sve_regs_p (const_tree fntype)
{
  tree return_type = TREE_TYPE (fntype);

  pure_scalable_type_info pst_info;
  switch (pst_info.analyze (return_type))
    {
    case pure_scalable_type_info::IS_PST:
      return (pst_info.num_zr () <= NUM_FP_ARG_REGS
	      && pst_info.num_pr () <= NUM_PR_ARG_REGS);

    case pure_scalable_type_info::DOESNT_MATTER:
      gcc_assert (aarch64_return_in_memory_1 (return_type));
      return false;

    case pure_scalable_type_info::NO_ABI_IDENTITY:
    case pure_scalable_type_info::ISNT_PST:
      return false;
    }
  gcc_unreachable ();
}

/* Return true if a function with type FNTYPE takes arguments in
   SVE vector or predicate registers.  */

static bool
aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
{
  CUMULATIVE_ARGS args_so_far_v;
  aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
				NULL_TREE, 0, true);
  cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);

  for (tree chain = TYPE_ARG_TYPES (fntype);
       chain && chain != void_list_node;
       chain = TREE_CHAIN (chain))
    {
      tree arg_type = TREE_VALUE (chain);
      if (arg_type == error_mark_node)
	return false;

      function_arg_info arg (arg_type, /*named=*/true);
      apply_pass_by_reference_rules (&args_so_far_v, arg);
      pure_scalable_type_info pst_info;
      if (pst_info.analyze_registers (arg.type))
	{
	  unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
	  unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
	  gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
	  return true;
	}

      targetm.calls.function_arg_advance (args_so_far, arg);
    }
  return false;
}
/* Implement TARGET_FNTYPE_ABI.  */

static const predefined_function_abi &
aarch64_fntype_abi (const_tree fntype)
{
  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
    return aarch64_simd_abi ();

  if (aarch64_returns_value_in_sve_regs_p (fntype)
      || aarch64_takes_arguments_in_sve_regs_p (fntype))
    return aarch64_sve_abi ();

  return default_function_abi;
}
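/* For example, a function declared as "svint32_t f (svint32_t)" both
   returns a value in and takes an argument in SVE registers, so the
   checks above should select aarch64_sve_abi for it; a function with
   only scalar arguments and results falls back to
   default_function_abi.  */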
/* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P.  */

static bool
aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
{
  return (aarch64_sve::builtin_type_p (type1)
	  == aarch64_sve::builtin_type_p (type2));
}

/* Return true if we should emit CFI for register REGNO.  */

static bool
aarch64_emit_cfi_for_reg_p (unsigned int regno)
{
  return (GP_REGNUM_P (regno)
	  || !default_function_abi.clobbers_full_reg_p (regno));
}
/* Return the mode we should use to save and restore register REGNO.  */

static machine_mode
aarch64_reg_save_mode (unsigned int regno)
{
  if (GP_REGNUM_P (regno))
    return DImode;

  if (FP_REGNUM_P (regno))
    switch (crtl->abi->id ())
      {
      case ARM_PCS_AAPCS64:
	/* Only the low 64 bits are saved by the base PCS.  */
	return DFmode;

      case ARM_PCS_SIMD:
	/* The vector PCS saves the low 128 bits (which is the full
	   register on non-SVE targets).  */
	return TFmode;

      case ARM_PCS_SVE:
	/* Use vectors of DImode for registers that need frame
	   information, so that the first 64 bits of the save slot
	   are always the equivalent of what storing D<n> would give.  */
	if (aarch64_emit_cfi_for_reg_p (regno))
	  return VNx2DImode;

	/* Use vectors of bytes otherwise, so that the layout is
	   endian-agnostic, and so that we can use LDR and STR for
	   big-endian targets.  */
	return VNx16QImode;

      case ARM_PCS_TLSDESC:
      case ARM_PCS_UNKNOWN:
	break;
      }

  if (PR_REGNUM_P (regno))
    /* Save the full predicate register.  */
    return VNx16BImode;

  gcc_unreachable ();
}
/* Implement TARGET_INSN_CALLEE_ABI.  */

const predefined_function_abi &
aarch64_insn_callee_abi (const rtx_insn *insn)
{
  rtx pat = PATTERN (insn);
  gcc_assert (GET_CODE (pat) == PARALLEL);
  rtx unspec = XVECEXP (pat, 0, 1);
  gcc_assert (GET_CODE (unspec) == UNSPEC
	      && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
  return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
}

/* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
   the lower 64 bits of a 128-bit register.  Tell the compiler the callee
   clobbers the top 64 bits when restoring the bottom 64 bits.  */

static bool
aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
					unsigned int regno,
					machine_mode mode)
{
  if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
    {
      poly_int64 per_register_size = GET_MODE_SIZE (mode);
      unsigned int nregs = hard_regno_nregs (regno, mode);
      if (nregs > 1)
	per_register_size = exact_div (per_register_size, nregs);
      if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
	return maybe_gt (per_register_size, 16);
      return maybe_gt (per_register_size, 8);
    }
  return false;
}
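/* For example, a V2DImode value (16 bytes) held in V8 is partially
   clobbered across a base-PCS (AAPCS64) call, since only the low 64 bits
   of the register are preserved, but it is not partially clobbered
   across an aarch64_vector_pcs call, which preserves the low 128 bits.  */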
/* Implement REGMODE_NATURAL_SIZE.  */
poly_uint64
aarch64_regmode_natural_size (machine_mode mode)
{
  /* The natural size for SVE data modes is one SVE data vector,
     and similarly for predicates.  We can't independently modify
     anything smaller than that.  */
  /* ??? For now, only do this for variable-width SVE registers.
     Doing it for constant-sized registers breaks lower-subreg.c.  */
  /* ??? And once that's fixed, we should probably have similar
     code for Advanced SIMD.  */
  if (!aarch64_sve_vg.is_constant ())
    {
      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
      if (vec_flags & VEC_SVE_PRED)
	return BYTES_PER_SVE_PRED;
      if (vec_flags & VEC_SVE_DATA)
	return BYTES_PER_SVE_VECTOR;
    }
  return UNITS_PER_WORD;
}

/* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
machine_mode
aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
				     machine_mode mode)
{
  /* The predicate mode determines which bits are significant and
     which are "don't care".  Decreasing the number of lanes would
     lose data while increasing the number of lanes would make bits
     unnecessarily significant.  */
  if (PR_REGNUM_P (regno))
    return mode;
  if (known_ge (GET_MODE_SIZE (mode), 4))
    return mode;
  else
    return SImode;
}

/* Return true if I's bits are consecutive ones from the MSB.  */
static bool
aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
{
  return exact_log2 (-i) != HOST_WIDE_INT_M1;
}
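/* For example, 0xffffffffffff0000 (48 leading ones) satisfies the check,
   since -i == 0x10000 is a power of two, whereas 0xff00ffffffffffff does
   not.  */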
/* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
   that strcpy from constants will be faster.  */

static HOST_WIDE_INT
aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
{
  if (TREE_CODE (exp) == STRING_CST && !optimize_size)
    return MAX (align, BITS_PER_WORD);
  return align;
}

/* Return true if calls to DECL should be treated as
   long-calls (i.e. called via a register).  */
static bool
aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
{
  return false;
}

/* Return true if calls to symbol-ref SYM should be treated as
   long-calls (i.e. called via a register).  */
bool
aarch64_is_long_call_p (rtx sym)
{
  return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
}

/* Return true if calls to symbol-ref SYM should not go through
   plt stubs.  */

bool
aarch64_is_noplt_call_p (rtx sym)
{
  const_tree decl = SYMBOL_REF_DECL (sym);

  if (flag_pic
      && decl
      && (!flag_plt
	  || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
      && !targetm.binds_local_p (decl))
    return true;

  return false;
}
/* Return true if the offsets to a zero/sign-extract operation
   represent an expression that matches an extend operation.  The
   operands represent the parameters from

   (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
bool
aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
				rtx extract_imm)
{
  HOST_WIDE_INT mult_val, extract_val;

  if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
    return false;

  mult_val = INTVAL (mult_imm);
  extract_val = INTVAL (extract_imm);

  if (extract_val > 8
      && extract_val < GET_MODE_BITSIZE (mode)
      && exact_log2 (extract_val & ~7) > 0
      && (extract_val & 7) <= 4
      && mult_val == (1 << (extract_val & 7)))
    return true;

  return false;
}

/* Emit an insn that's a simple single-set.  Both the operands must be
   known to be valid.  */
inline static rtx_insn *
emit_set_insn (rtx x, rtx y)
{
  return emit_insn (gen_rtx_SET (x, y));
}
/* X and Y are two things to compare using CODE.  Emit the compare insn and
   return the rtx for register 0 in the proper mode.  */
rtx
aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
{
  machine_mode cmp_mode = GET_MODE (x);
  machine_mode cc_mode;
  rtx cc_reg;

  if (cmp_mode == TImode)
    {
      gcc_assert (code == NE);

      cc_mode = CCmode;
      cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);

      rtx x_lo = operand_subword (x, 0, 0, TImode);
      rtx y_lo = operand_subword (y, 0, 0, TImode);
      emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));

      rtx x_hi = operand_subword (x, 1, 0, TImode);
      rtx y_hi = operand_subword (y, 1, 0, TImode);
      emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
			       gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
			       GEN_INT (AARCH64_EQ)));
    }
  else
    {
      cc_mode = SELECT_CC_MODE (code, x, y);
      cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
      emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
    }
  return cc_reg;
}

/* Similarly, but maybe zero-extend Y if Y_MODE < SImode.  */

static rtx
aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
				  machine_mode y_mode)
{
  if (y_mode == E_QImode || y_mode == E_HImode)
    {
      if (CONST_INT_P (y))
	{
	  y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
	  y_mode = SImode;
	}
      else
	{
	  rtx t, cc_reg;
	  machine_mode cc_mode;

	  t = gen_rtx_ZERO_EXTEND (SImode, y);
	  t = gen_rtx_COMPARE (CC_SWPmode, t, x);
	  cc_mode = CC_SWPmode;
	  cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
	  emit_set_insn (cc_reg, t);
	  return cc_reg;
	}
    }

  if (!aarch64_plus_operand (y, y_mode))
    y = force_reg (y_mode, y);

  return aarch64_gen_compare_reg (code, x, y);
}
/* Build the SYMBOL_REF for __tls_get_addr.  */

static GTY(()) rtx tls_get_addr_libfunc;

rtx
aarch64_tls_get_addr (void)
{
  if (!tls_get_addr_libfunc)
    tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
  return tls_get_addr_libfunc;
}

/* Return the TLS model to use for ADDR.  */

static enum tls_model
tls_symbolic_operand_type (rtx addr)
{
  enum tls_model tls_kind = TLS_MODEL_NONE;
  if (GET_CODE (addr) == CONST)
    {
      poly_int64 addend;
      rtx sym = strip_offset (addr, &addend);
      if (GET_CODE (sym) == SYMBOL_REF)
	tls_kind = SYMBOL_REF_TLS_MODEL (sym);
    }
  else if (GET_CODE (addr) == SYMBOL_REF)
    tls_kind = SYMBOL_REF_TLS_MODEL (addr);

  return tls_kind;
}
/* We'll allow lo_sum's in addresses in our legitimate addresses
   so that combine would take care of combining addresses where
   necessary, but for generation purposes, we'll generate the address
   as:

   RTL                               Absolute
   tmp = hi (symbol_ref);            adrp  x1, foo
   dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo

   PIC                               TLS
   adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
   ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
                                     bl   __tls_get_addr
                                     nop

   Load TLS symbol, depending on TLS mechanism and TLS access model.

   Global Dynamic - Traditional TLS:
   adrp tmp, :tlsgd:imm
   add  dest, tmp, #:tlsgd_lo12:imm
   bl   __tls_get_addr

   Global Dynamic - TLS Descriptors:
   adrp dest, :tlsdesc:imm
   ldr  tmp, [dest, #:tlsdesc_lo12:imm]
   add  dest, dest, #:tlsdesc_lo12:imm
   blr  tmp
   mrs  tp, tpidr_el0
   add  dest, dest, tp

   Initial Exec:
   mrs  tp, tpidr_el0
   adrp tmp, :gottprel:imm
   ldr  dest, [tmp, #:gottprel_lo12:imm]
   add  dest, dest, tp

   Local Exec:
   mrs  tp, tpidr_el0
   add  t0, tp, #:tprel_hi12:imm, lsl #12
   add  t0, t0, #:tprel_lo12_nc:imm
*/
static void
aarch64_load_symref_appropriately (rtx dest, rtx imm,
				   enum aarch64_symbol_type type)
{
  switch (type)
    {
    case SYMBOL_SMALL_ABSOLUTE:
      {
	/* In ILP32, the mode of dest can be either SImode or DImode.  */
	rtx tmp_reg = dest;
	machine_mode mode = GET_MODE (dest);

	gcc_assert (mode == Pmode || mode == ptr_mode);

	if (can_create_pseudo_p ())
	  tmp_reg = gen_reg_rtx (mode);

	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
	emit_insn (gen_add_losym (dest, tmp_reg, imm));
	return;
      }

    case SYMBOL_TINY_ABSOLUTE:
      emit_insn (gen_rtx_SET (dest, imm));
      return;

    case SYMBOL_SMALL_GOT_28K:
      {
	machine_mode mode = GET_MODE (dest);
	rtx gp_rtx = pic_offset_table_rtx;
	rtx insn;
	rtx mem;

	/* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
	   here before rtl expand.  Tree IVOPT will generate rtl pattern to
	   decide rtx costs, in which case pic_offset_table_rtx is not
	   initialized.  For that case no need to generate the first adrp
	   instruction as the final cost for global variable access is
	   one instruction.  */
	if (gp_rtx != NULL)
	  {
	    /* -fpic for -mcmodel=small allows a 32K GOT table size (but we are
	       using the page base as GOT base, the first page may be wasted,
	       in the worst scenario, there is only 28K space for GOT).

	       The generated instruction sequence for accessing a global
	       variable is:

		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]

	       Only one instruction is needed.  But we must initialize
	       pic_offset_table_rtx properly.  We generate an initialize insn
	       for every global access, and allow CSE to remove all redundant
	       copies.

	       The final instruction sequences will look like the following
	       for multiple global variable accesses:

		 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_

		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
		 ...  */

	    rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
	    crtl->uses_pic_offset_table = 1;
	    emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));

	    if (mode != GET_MODE (gp_rtx))
	      gp_rtx = gen_lowpart (mode, gp_rtx);
	  }

	if (mode == ptr_mode)
	  {
	    if (mode == DImode)
	      insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
	    else
	      insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);

	    mem = XVECEXP (SET_SRC (insn), 0, 0);
	  }
	else
	  {
	    gcc_assert (mode == Pmode);

	    insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
	  }

	/* The operand is expected to be a MEM.  Whenever the related insn
	   pattern changes, the code above that calculates mem should be
	   updated.  */
	gcc_assert (GET_CODE (mem) == MEM);
	MEM_READONLY_P (mem) = 1;
	MEM_NOTRAP_P (mem) = 1;
	emit_insn (insn);
	return;
      }
:
2971 /* In ILP32, the mode of dest can be either SImode or DImode,
2972 while the got entry is always of SImode size. The mode of
2973 dest depends on how dest is used: if dest is assigned to a
2974 pointer (e.g. in the memory), it has SImode; it may have
2975 DImode if dest is dereferenced to access the memeory.
2976 This is why we have to handle three different ldr_got_small
2977 patterns here (two patterns for ILP32). */
2982 machine_mode mode
= GET_MODE (dest
);
2984 if (can_create_pseudo_p ())
2985 tmp_reg
= gen_reg_rtx (mode
);
2987 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
2988 if (mode
== ptr_mode
)
2991 insn
= gen_ldr_got_small_di (dest
, tmp_reg
, imm
);
2993 insn
= gen_ldr_got_small_si (dest
, tmp_reg
, imm
);
2995 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
2999 gcc_assert (mode
== Pmode
);
3001 insn
= gen_ldr_got_small_sidi (dest
, tmp_reg
, imm
);
3002 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
3005 gcc_assert (GET_CODE (mem
) == MEM
);
3006 MEM_READONLY_P (mem
) = 1;
3007 MEM_NOTRAP_P (mem
) = 1;
3012 case SYMBOL_SMALL_TLSGD
:
3015 /* The return type of __tls_get_addr is the C pointer type
3017 rtx result
= gen_rtx_REG (ptr_mode
, R0_REGNUM
);
3020 if (GET_MODE (dest
) != ptr_mode
)
3021 tmp_reg
= can_create_pseudo_p () ? gen_reg_rtx (ptr_mode
) : result
;
3024 if (ptr_mode
== SImode
)
3025 aarch64_emit_call_insn (gen_tlsgd_small_si (result
, imm
));
3027 aarch64_emit_call_insn (gen_tlsgd_small_di (result
, imm
));
3028 insns
= get_insns ();
3031 RTL_CONST_CALL_P (insns
) = 1;
3032 emit_libcall_block (insns
, tmp_reg
, result
, imm
);
3033 /* Convert back to the mode of the dest adding a zero_extend
3034 from SImode (ptr_mode) to DImode (Pmode). */
3035 if (dest
!= tmp_reg
)
3036 convert_move (dest
, tmp_reg
, true);
    case SYMBOL_SMALL_TLSDESC:
      {
	machine_mode mode = GET_MODE (dest);
	rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
	rtx tp;

	gcc_assert (mode == Pmode || mode == ptr_mode);

	/* In ILP32, the got entry is always of SImode size.  Unlike
	   small GOT, the dest is fixed at reg 0.  */
	if (TARGET_ILP32)
	  emit_insn (gen_tlsdesc_small_si (imm));
	else
	  emit_insn (gen_tlsdesc_small_di (imm));
	tp = aarch64_load_tp (NULL);

	if (mode != Pmode)
	  tp = gen_lowpart (mode, tp);

	emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
	if (REG_P (dest))
	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
	return;
      }

    case SYMBOL_SMALL_TLSIE:
      {
	/* In ILP32, the mode of dest can be either SImode or DImode,
	   while the got entry is always of SImode size.  The mode of
	   dest depends on how dest is used: if dest is assigned to a
	   pointer (e.g. in the memory), it has SImode; it may have
	   DImode if dest is dereferenced to access the memory.
	   This is why we have to handle three different tlsie_small
	   patterns here (two patterns for ILP32).  */
	machine_mode mode = GET_MODE (dest);
	rtx tmp_reg = gen_reg_rtx (mode);
	rtx tp = aarch64_load_tp (NULL);

	if (mode == ptr_mode)
	  {
	    if (mode == DImode)
	      emit_insn (gen_tlsie_small_di (tmp_reg, imm));
	    else
	      {
		emit_insn (gen_tlsie_small_si (tmp_reg, imm));
		tp = gen_lowpart (mode, tp);
	      }
	  }
	else
	  {
	    gcc_assert (mode == Pmode);
	    emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
	  }

	emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
	if (REG_P (dest))
	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
	return;
      }
    case SYMBOL_TLSLE12:
    case SYMBOL_TLSLE24:
    case SYMBOL_TLSLE32:
    case SYMBOL_TLSLE48:
      {
	machine_mode mode = GET_MODE (dest);
	rtx tp = aarch64_load_tp (NULL);

	if (mode != Pmode)
	  tp = gen_lowpart (mode, tp);

	switch (type)
	  {
	  case SYMBOL_TLSLE12:
	    emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
		       (dest, tp, imm));
	    break;
	  case SYMBOL_TLSLE24:
	    emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
		       (dest, tp, imm));
	    break;
	  case SYMBOL_TLSLE32:
	    emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
		       (dest, imm));
	    emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
		       (dest, dest, tp));
	    break;
	  case SYMBOL_TLSLE48:
	    emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
		       (dest, imm));
	    emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
		       (dest, dest, tp));
	    break;
	  default:
	    gcc_unreachable ();
	  }

	if (REG_P (dest))
	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
	return;
      }

    case SYMBOL_TINY_GOT:
      {
	rtx insn;
	machine_mode mode = GET_MODE (dest);

	if (mode == ptr_mode)
	  insn = gen_ldr_got_tiny (mode, dest, imm);
	else
	  {
	    gcc_assert (mode == Pmode);
	    insn = gen_ldr_got_tiny_sidi (dest, imm);
	  }

	emit_insn (insn);
	return;
      }

    case SYMBOL_TINY_TLSIE:
      {
	machine_mode mode = GET_MODE (dest);
	rtx tp = aarch64_load_tp (NULL);

	if (mode == ptr_mode)
	  {
	    if (mode == DImode)
	      emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
	    else
	      {
		tp = gen_lowpart (mode, tp);
		emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
	      }
	  }
	else
	  {
	    gcc_assert (mode == Pmode);
	    emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
	  }

	if (REG_P (dest))
	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
	return;
      }

    default:
      gcc_unreachable ();
    }
}
/* Emit a move from SRC to DEST.  Assume that the move expanders can
   handle all moves if !can_create_pseudo_p ().  The distinction is
   important because, unlike emit_move_insn, the move expanders know
   how to force Pmode objects into the constant pool even when the
   constant pool address is not itself legitimate.  */
static rtx_insn *
aarch64_emit_move (rtx dest, rtx src)
{
  return (can_create_pseudo_p ()
	  ? emit_move_insn (dest, src)
	  : emit_move_insn_1 (dest, src));
}

/* Apply UNOPTAB to OP and store the result in DEST.  */

static void
aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
{
  rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
  if (dest != tmp)
    emit_move_insn (dest, tmp);
}

/* Apply BINOPTAB to OP0 and OP1 and store the result in DEST.  */

static void
aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
{
  rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
			  OPTAB_DIRECT);
  if (dest != tmp)
    emit_move_insn (dest, tmp);
}
/* Split a 128-bit move operation into two 64-bit move operations,
   taking care to handle partial overlap of register to register
   copies.  Special cases are needed when moving between GP regs and
   FP regs.  SRC can be a register, constant or memory; DST a register
   or memory.  If either operand is memory it must not have any side
   effects.  */
void
aarch64_split_128bit_move (rtx dst, rtx src)
{
  rtx dst_lo, dst_hi;
  rtx src_lo, src_hi;

  machine_mode mode = GET_MODE (dst);

  gcc_assert (mode == TImode || mode == TFmode);
  gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
  gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);

  if (REG_P (dst) && REG_P (src))
    {
      int src_regno = REGNO (src);
      int dst_regno = REGNO (dst);

      /* Handle FP <-> GP regs.  */
      if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
	{
	  src_lo = gen_lowpart (word_mode, src);
	  src_hi = gen_highpart (word_mode, src);

	  emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
	  emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
	  return;
	}
      else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
	{
	  dst_lo = gen_lowpart (word_mode, dst);
	  dst_hi = gen_highpart (word_mode, dst);

	  emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
	  emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
	  return;
	}
    }

  dst_lo = gen_lowpart (word_mode, dst);
  dst_hi = gen_highpart (word_mode, dst);
  src_lo = gen_lowpart (word_mode, src);
  src_hi = gen_highpart_mode (word_mode, mode, src);

  /* At most one pairing may overlap.  */
  if (reg_overlap_mentioned_p (dst_lo, src_hi))
    {
      aarch64_emit_move (dst_hi, src_hi);
      aarch64_emit_move (dst_lo, src_lo);
    }
  else
    {
      aarch64_emit_move (dst_lo, src_lo);
      aarch64_emit_move (dst_hi, src_hi);
    }
}

bool
aarch64_split_128bit_move_p (rtx dst, rtx src)
{
  return (! REG_P (src)
	  || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
}
/* Split a complex SIMD combine.  */

void
aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
{
  machine_mode src_mode = GET_MODE (src1);
  machine_mode dst_mode = GET_MODE (dst);

  gcc_assert (VECTOR_MODE_P (dst_mode));
  gcc_assert (register_operand (dst, dst_mode)
	      && register_operand (src1, src_mode)
	      && register_operand (src2, src_mode));

  emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
}

/* Split a complex SIMD move.  */

void
aarch64_split_simd_move (rtx dst, rtx src)
{
  machine_mode src_mode = GET_MODE (src);
  machine_mode dst_mode = GET_MODE (dst);

  gcc_assert (VECTOR_MODE_P (dst_mode));

  if (REG_P (dst) && REG_P (src))
    {
      gcc_assert (VECTOR_MODE_P (src_mode));
      emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
    }
}

bool
aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
			      machine_mode ymode, rtx y)
{
  rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
  gcc_assert (r != NULL);
  return rtx_equal_p (x, r);
}
/* Return TARGET if it is nonnull and a register of mode MODE.
   Otherwise, return a fresh register of mode MODE if we can,
   or TARGET reinterpreted as MODE if we can't.  */

static rtx
aarch64_target_reg (rtx target, machine_mode mode)
{
  if (target && REG_P (target) && GET_MODE (target) == mode)
    return target;
  if (!can_create_pseudo_p ())
    {
      gcc_assert (target);
      return gen_lowpart (mode, target);
    }
  return gen_reg_rtx (mode);
}

/* Return a register that contains the constant in BUILDER, given that
   the constant is a legitimate move operand.  Use TARGET as the register
   if it is nonnull and convenient.  */

static rtx
aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
{
  rtx src = builder.build ();
  target = aarch64_target_reg (target, GET_MODE (src));
  emit_insn (gen_rtx_SET (target, src));
  return target;
}

static rtx
aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
{
  if (can_create_pseudo_p ())
    return force_reg (mode, value);
  else
    {
      gcc_assert (x);
      aarch64_emit_move (x, value);
      return x;
    }
}
/* Return true if predicate value X is a constant in which every element
   is a CONST_INT.  When returning true, describe X in BUILDER as a VNx16BI
   value, i.e. as a predicate in which all bits are significant.  */

static bool
aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
{
  if (GET_CODE (x) != CONST_VECTOR)
    return false;

  unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
					      GET_MODE_NUNITS (GET_MODE (x)));
  unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
  unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
  builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);

  unsigned int nelts = const_vector_encoded_nelts (x);
  for (unsigned int i = 0; i < nelts; ++i)
    {
      rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
      if (!CONST_INT_P (elt))
	return false;

      builder.quick_push (elt);
      for (unsigned int j = 1; j < factor; ++j)
	builder.quick_push (const0_rtx);
    }
  builder.finalize ();
  return true;
}
/* BUILDER contains a predicate constant of mode VNx16BI.  Return the
   widest predicate element size it can have (that is, the largest size
   for which each element would still be 0 or 1).  */

unsigned int
aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
{
  /* Start with the most optimistic assumption: that we only need
     one bit per pattern.  This is what we will use if only the first
     bit in each pattern is ever set.  */
  unsigned int mask = GET_MODE_SIZE (DImode);
  mask |= builder.npatterns ();

  /* Look for set bits.  */
  unsigned int nelts = builder.encoded_nelts ();
  for (unsigned int i = 1; i < nelts; ++i)
    if (INTVAL (builder.elt (i)) != 0)
      {
	if (i & 1)
	  return 1;
	mask |= i;
      }
  return mask & -mask;
}

/* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
   return that predicate mode, otherwise return opt_machine_mode ().  */

opt_machine_mode
aarch64_ptrue_all_mode (rtx x)
{
  gcc_assert (GET_MODE (x) == VNx16BImode);
  if (GET_CODE (x) != CONST_VECTOR
      || !CONST_VECTOR_DUPLICATE_P (x)
      || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
      || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
    return opt_machine_mode ();

  unsigned int nelts = const_vector_encoded_nelts (x);
  for (unsigned int i = 1; i < nelts; ++i)
    if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
      return opt_machine_mode ();

  return aarch64_sve_pred_mode (nelts);
}
/* BUILDER is a predicate constant of mode VNx16BI.  Consider the value
   that the constant would have with predicate element size ELT_SIZE
   (ignoring the upper bits in each element) and return:

   * -1 if all bits are set
   * N if the predicate has N leading set bits followed by all clear bits
   * 0 if the predicate does not have any of these forms.  */

int
aarch64_partial_ptrue_length (rtx_vector_builder &builder,
			      unsigned int elt_size)
{
  /* If nelts_per_pattern is 3, we have set bits followed by clear bits
     followed by set bits.  */
  if (builder.nelts_per_pattern () == 3)
    return 0;

  /* Skip over leading set bits.  */
  unsigned int nelts = builder.encoded_nelts ();
  unsigned int i = 0;
  for (; i < nelts; i += elt_size)
    if (INTVAL (builder.elt (i)) == 0)
      break;
  unsigned int vl = i / elt_size;

  /* Check for the all-true case.  */
  if (i == nelts)
    return -1;

  /* If nelts_per_pattern is 1, then either VL is zero, or we have a
     repeating pattern of set bits followed by clear bits.  */
  if (builder.nelts_per_pattern () != 2)
    return vl;

  /* We have a "foreground" value and a duplicated "background" value.
     If the background might repeat and the last set bit belongs to it,
     we might have set bits followed by clear bits followed by set bits.  */
  if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
    return 0;

  /* Make sure that the rest are all clear.  */
  for (; i < nelts; i += elt_size)
    if (INTVAL (builder.elt (i)) != 0)
      return 0;

  return vl;
}
/* See if there is an svpattern that encodes an SVE predicate of mode
   PRED_MODE in which the first VL bits are set and the rest are clear.
   Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
   A VL of -1 indicates an all-true vector.  */

static aarch64_svpattern
aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
{
  if (vl < 0)
    return AARCH64_SV_ALL;

  if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
    return AARCH64_NUM_SVPATTERNS;

  if (vl >= 1 && vl <= 8)
    return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));

  if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
    return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));

  int max_vl;
  if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
    {
      if (vl == (max_vl / 3) * 3)
	return AARCH64_SV_MUL3;
      /* These would only trigger for non-power-of-2 lengths.  */
      if (vl == (max_vl & -4))
	return AARCH64_SV_MUL4;
      if (vl == (1 << floor_log2 (max_vl)))
	return AARCH64_SV_POW2;
      if (vl == max_vl)
	return AARCH64_SV_ALL;
    }
  return AARCH64_NUM_SVPATTERNS;
}
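/* For example, aarch64_svpattern_for_vl (VNx16BImode, -1) is
   AARCH64_SV_ALL, a VL of 6 maps to AARCH64_SV_VL6 and a VL of 16 maps
   to AARCH64_SV_VL16; lengths that no pattern can describe yield
   AARCH64_NUM_SVPATTERNS.  */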
/* Return a VNx16BImode constant in which every sequence of ELT_SIZE
   bits has the lowest bit set and the upper bits clear.  This is the
   VNx16BImode equivalent of a PTRUE for controlling elements of
   ELT_SIZE bytes.  However, because the constant is VNx16BImode,
   all bits are significant, even the upper zeros.  */

rtx
aarch64_ptrue_all (unsigned int elt_size)
{
  rtx_vector_builder builder (VNx16BImode, elt_size, 1);
  builder.quick_push (const1_rtx);
  for (unsigned int i = 1; i < elt_size; ++i)
    builder.quick_push (const0_rtx);
  return builder.build ();
}

/* Return an all-true predicate register of mode MODE.  */

rtx
aarch64_ptrue_reg (machine_mode mode)
{
  gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
  rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
  return gen_lowpart (mode, reg);
}

/* Return an all-false predicate register of mode MODE.  */

rtx
aarch64_pfalse_reg (machine_mode mode)
{
  gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
  rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
  return gen_lowpart (mode, reg);
}
/* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
   true, or alternatively if we know that the operation predicated by
   PRED1[0] is safe to perform whenever PRED2 is true.  PRED1[1] is an
   aarch64_sve_gp_strictness operand that describes the operation
   predicated by PRED1[0].  */

bool
aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
{
  machine_mode mode = GET_MODE (pred2);
  gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
	      && mode == GET_MODE (pred1[0])
	      && aarch64_sve_gp_strictness (pred1[1], SImode));
  return (pred1[0] == CONSTM1_RTX (mode)
	  || INTVAL (pred1[1]) == SVE_RELAXED_GP
	  || rtx_equal_p (pred1[0], pred2));
}

/* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
   for it.  PRED2[0] is the predicate for the instruction whose result
   is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
   for it.  Return true if we can prove that the two predicates are
   equivalent for PTEST purposes; that is, if we can replace PRED2[0]
   with PRED1[0] without changing behavior.  */

bool
aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
{
  machine_mode mode = GET_MODE (pred1[0]);
  gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
	      && mode == GET_MODE (pred2[0])
	      && aarch64_sve_ptrue_flag (pred1[1], SImode)
	      && aarch64_sve_ptrue_flag (pred2[1], SImode));

  bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
		   || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
  bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
		   || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
  return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
}
/* Emit a comparison CMP between OP1 and OP2, both of which have mode
   DATA_MODE, and return the result in a predicate of mode PRED_MODE.
   Use TARGET as the target register if nonnull and convenient.  */

static rtx
aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
			  machine_mode data_mode, rtx op1, rtx op2)
{
  insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
  expand_operand ops[5];
  create_output_operand (&ops[0], target, pred_mode);
  create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
  create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
  create_input_operand (&ops[3], op1, data_mode);
  create_input_operand (&ops[4], op2, data_mode);
  expand_insn (icode, 5, ops);
  return ops[0].value;
}

/* Use a comparison to convert integer vector SRC into MODE, which is
   the corresponding SVE predicate mode.  Use TARGET for the result
   if it's nonnull and convenient.  */

static rtx
aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
{
  machine_mode src_mode = GET_MODE (src);
  return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
				   src, CONST0_RTX (src_mode));
}
/* Return the assembly token for svprfop value PRFOP.  */

static const char *
svprfop_token (enum aarch64_svprfop prfop)
{
  switch (prfop)
    {
#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
    AARCH64_FOR_SVPRFOP (CASE)
#undef CASE
    case AARCH64_NUM_SVPRFOPS:
      break;
    }
  gcc_unreachable ();
}

/* Return the assembly string for an SVE prefetch operation with
   mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
   and that SUFFIX is the format for the remaining operands.  */

char *
aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
			     const char *suffix)
{
  static char buffer[128];
  aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
  unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
				   mnemonic, svprfop_token (prfop), suffix);
  gcc_assert (written < sizeof (buffer));
  return buffer;
}
/* Check whether we can calculate the number of elements in PATTERN
   at compile time, given that there are NELTS_PER_VQ elements per
   128-bit block.  Return the value if so, otherwise return -1.  */

HOST_WIDE_INT
aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
{
  unsigned int vl, const_vg;
  if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
    vl = 1 + (pattern - AARCH64_SV_VL1);
  else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
    vl = 16 << (pattern - AARCH64_SV_VL16);
  else if (aarch64_sve_vg.is_constant (&const_vg))
    {
      /* There are two vector granules per quadword.  */
      unsigned int nelts = (const_vg / 2) * nelts_per_vq;
      switch (pattern)
	{
	case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
	case AARCH64_SV_MUL4: return nelts & -4;
	case AARCH64_SV_MUL3: return (nelts / 3) * 3;
	case AARCH64_SV_ALL: return nelts;
	default: gcc_unreachable ();
	}
    }
  else
    return -1;

  /* There are two vector granules per quadword.  */
  poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
  if (known_le (vl, nelts_all))
    return vl;

  /* Requesting more elements than are available results in a PFALSE.  */
  if (known_gt (vl, nelts_all))
    return 0;

  return -1;
}
/* Return true if we can move VALUE into a register using a single
   CNT[BHWD] instruction.  */

static bool
aarch64_sve_cnt_immediate_p (poly_int64 value)
{
  HOST_WIDE_INT factor = value.coeffs[0];
  /* The coefficient must be [1, 16] * {2, 4, 8, 16}.  */
  return (value.coeffs[1] == factor
	  && IN_RANGE (factor, 2, 16 * 16)
	  && (factor & 1) == 0
	  && factor <= 16 * (factor & -factor));
}

/* Likewise for rtx X.  */

bool
aarch64_sve_cnt_immediate_p (rtx x)
{
  poly_int64 value;
  return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
}
/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  PATTERN is the pattern to use.  FACTOR is the
   number of quadwords.  NELTS_PER_VQ, if nonzero, is the number of elements
   in each quadword.  If it is zero, we can use any element size.  */

static char *
aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
				  aarch64_svpattern pattern,
				  unsigned int factor,
				  unsigned int nelts_per_vq)
{
  static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];

  if (nelts_per_vq == 0)
    /* There is some overlap in the ranges of the four CNT instructions.
       Here we always use the smallest possible element size, so that the
       multiplier is 1 wherever possible.  */
    nelts_per_vq = factor & -factor;
  int shift = std::min (exact_log2 (nelts_per_vq), 4);
  gcc_assert (IN_RANGE (shift, 1, 4));
  char suffix = "dwhb"[shift - 1];

  factor >>= shift;
  unsigned int written;
  if (pattern == AARCH64_SV_ALL && factor == 1)
    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
			prefix, suffix, operands);
  else if (factor == 1)
    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
			prefix, suffix, operands, svpattern_token (pattern));
  else
    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
			prefix, suffix, operands, svpattern_token (pattern),
			factor);
  gcc_assert (written < sizeof (buffer));
  return buffer;
}

/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  X is the value of the vector size operand,
   as a polynomial integer rtx; we need to convert this into an "all"
   pattern with a multiplier.  */

char *
aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
				  rtx x)
{
  poly_int64 value = rtx_to_poly_int64 (x);
  gcc_assert (aarch64_sve_cnt_immediate_p (value));
  return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
					   value.coeffs[1], 0);
}

/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  CNT_PAT[0..2] are the operands of the
   UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details.  */

char *
aarch64_output_sve_cnt_pat_immediate (const char *prefix,
				      const char *operands, rtx *cnt_pat)
{
  aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
  unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
  unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
  return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
					   factor, nelts_per_vq);
}
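/* As a worked example of the first routine above: with PREFIX "cnt",
   OPERANDS "%x0", the ALL pattern, a FACTOR of 16 and NELTS_PER_VQ of 0,
   the smallest element size is bytes, the multiplier folds to 1 and the
   result is "cntb\t%x0"; a FACTOR of 32 instead gives
   "cntb\t%x0, all, mul #2".  */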
/* Return true if we can add X using a single SVE INC or DEC instruction.  */

bool
aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
{
  poly_int64 value;
  return (poly_int_rtx_p (x, &value)
	  && (aarch64_sve_cnt_immediate_p (value)
	      || aarch64_sve_cnt_immediate_p (-value)));
}

/* Return the asm string for adding SVE INC/DEC immediate OFFSET to
   operand 0.  */

char *
aarch64_output_sve_scalar_inc_dec (rtx offset)
{
  poly_int64 offset_value = rtx_to_poly_int64 (offset);
  gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
  if (offset_value.coeffs[1] > 0)
    return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
					     offset_value.coeffs[1], 0);
  else
    return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
					     -offset_value.coeffs[1], 0);
}
/* Return true if we can add VALUE to a register using a single ADDVL
   or ADDPL instruction.  */

static bool
aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
{
  HOST_WIDE_INT factor = value.coeffs[0];
  if (factor == 0 || value.coeffs[1] != factor)
    return false;
  /* FACTOR counts VG / 2, so a value of 2 is one predicate width
     and a value of 16 is one vector width.  */
  return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
	  || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
}

/* Likewise for rtx X.  */

bool
aarch64_sve_addvl_addpl_immediate_p (rtx x)
{
  poly_int64 value;
  return (poly_int_rtx_p (x, &value)
	  && aarch64_sve_addvl_addpl_immediate_p (value));
}

/* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
   to operand 1 and storing the result in operand 0.  */

char *
aarch64_output_sve_addvl_addpl (rtx offset)
{
  static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
  poly_int64 offset_value = rtx_to_poly_int64 (offset);
  gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));

  int factor = offset_value.coeffs[1];
  if ((factor & 15) == 0)
    snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
  else
    snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
  return buffer;
}
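/* For example, a FACTOR of 16 (one vector width) prints as
   "addvl\t%x0, %x1, #1", a factor of 2 (one predicate width) as
   "addpl\t%x0, %x1, #1", and a factor of -32 as "addvl\t%x0, %x1, #-2".  */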
/* Return true if X is a valid immediate for an SVE vector INC or DEC
   instruction.  If it is, store the number of elements in each vector
   quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
   factor in *FACTOR_OUT (if nonnull).  */

static bool
aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
					unsigned int *nelts_per_vq_out)
{
  rtx elt;
  poly_int64 value;

  if (!const_vec_duplicate_p (x, &elt)
      || !poly_int_rtx_p (elt, &value))
    return false;

  unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
  if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
    /* There's no vector INCB.  */
    return false;

  HOST_WIDE_INT factor = value.coeffs[0];
  if (value.coeffs[1] != factor)
    return false;

  /* The coefficient must be [1, 16] * NELTS_PER_VQ.  */
  if ((factor % nelts_per_vq) != 0
      || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
    return false;

  if (factor_out)
    *factor_out = factor;
  if (nelts_per_vq_out)
    *nelts_per_vq_out = nelts_per_vq;
  return true;
}

/* Return true if X is a valid immediate for an SVE vector INC or DEC
   instruction.  */

bool
aarch64_sve_vector_inc_dec_immediate_p (rtx x)
{
  return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
}

/* Return the asm template for an SVE vector INC or DEC instruction.
   OPERANDS gives the operands before the vector count and X is the
   value of the vector count operand itself.  */

char *
aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
{
  int factor;
  unsigned int nelts_per_vq;
  if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
    gcc_unreachable ();
  if (factor < 0)
    return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
					     -factor, nelts_per_vq);
  else
    return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
					     factor, nelts_per_vq);
}
static int
aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
				scalar_int_mode mode)
{
  int i;
  unsigned HOST_WIDE_INT val, val2, mask;
  int one_match, zero_match;
  int num_insns;

  val = INTVAL (imm);

  if (aarch64_move_imm (val, mode))
    {
      if (generate)
	emit_insn (gen_rtx_SET (dest, imm));
      return 1;
    }

  /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
     (with XXXX non-zero).  In that case check to see if the move can be done
     in a smaller mode.  */
  val2 = val & 0xffffffff;
  if (mode == DImode
      && aarch64_move_imm (val2, SImode)
      && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
    {
      if (generate)
	emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));

      /* Check if we have to emit a second instruction by checking to see
	 if any of the upper 32 bits of the original DI mode value is set.  */
      if (val == val2)
	return 1;

      i = (val >> 48) ? 48 : 32;

      if (generate)
	emit_insn (gen_insv_immdi (dest, GEN_INT (i),
				   GEN_INT ((val >> i) & 0xffff)));

      return 2;
    }

  if ((val >> 32) == 0 || mode == SImode)
    {
      if (generate)
	{
	  emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
	  if (mode == SImode)
	    emit_insn (gen_insv_immsi (dest, GEN_INT (16),
				       GEN_INT ((val >> 16) & 0xffff)));
	  else
	    emit_insn (gen_insv_immdi (dest, GEN_INT (16),
				       GEN_INT ((val >> 16) & 0xffff)));
	}
      return 2;
    }

  /* Remaining cases are all for DImode.  */

  mask = 0xffff;
  zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
    ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
  one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
    ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);

  if (zero_match != 2 && one_match != 2)
    {
      /* Try emitting a bitmask immediate with a movk replacing 16 bits.
	 For a 64-bit bitmask try whether changing 16 bits to all ones or
	 zeroes creates a valid bitmask.  To check any repeated bitmask,
	 try using 16 bits from the other 32-bit half of val.  */

      for (i = 0; i < 64; i += 16, mask <<= 16)
	{
	  val2 = val & ~mask;
	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
	    break;
	  val2 = val | mask;
	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
	    break;
	  val2 = val2 & ~mask;
	  val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
	    break;
	}
      if (i != 64)
	{
	  if (generate)
	    {
	      emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
	      emit_insn (gen_insv_immdi (dest, GEN_INT (i),
					 GEN_INT ((val >> i) & 0xffff)));
	    }
	  return 2;
	}
    }

  /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
     are emitted by the initial mov.  If one_match > zero_match, skip set bits,
     otherwise skip zero bits.  */

  num_insns = 1;
  mask = 0xffff;
  val2 = one_match > zero_match ? ~val : val;
  i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;

  if (generate)
    emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
					   ? (val | ~(mask << i))
					   : (val & (mask << i)))));
  for (i += 16; i < 64; i += 16)
    {
      if ((val2 & (mask << i)) == 0)
	continue;
      if (generate)
	emit_insn (gen_insv_immdi (dest, GEN_INT (i),
				   GEN_INT ((val >> i) & 0xffff)));
      num_insns ++;
    }

  return num_insns;
}
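/* As a worked example of the routine above: for the DImode constant
   0x12340000abcd0000, the low 32 bits 0xabcd0000 are themselves a valid
   32-bit move immediate and bits [32, 47] are zero, so the early path
   emits "mov dest, #0xabcd0000" followed by "movk dest, #0x1234, lsl 48"
   and returns 2.  */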
/* Return whether imm is a 128-bit immediate which is simple enough to
   expand inline.  */

bool
aarch64_mov128_immediate (rtx imm)
{
  if (GET_CODE (imm) == CONST_INT)
    return true;

  gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);

  rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
  rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));

  return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
         + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
}
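
/* In other words, a 128-bit (TImode) constant is treated as simple enough
   when its two 64-bit halves can together be built with at most four
   MOV/MOVK-style instructions; anything more expensive is better loaded
   from the constant pool.  */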
/* Return the number of temporary registers that aarch64_add_offset_1
   would need to add OFFSET to a register.  */

static unsigned int
aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
{
  return abs_hwi (offset) < 0x1000000 ? 0 : 1;
}
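
/* Offsets below 2^24 need no temporary because they can always be added
   with at most two immediate ADD/SUBs, each taking a 12-bit immediate that
   is optionally shifted left by 12.  For example (illustrative), an
   adjustment of 0x12345 can be done as "add ..., #0x345" followed by
   "add ..., #0x12, lsl #12".  */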
/* A subroutine of aarch64_add_offset.  Set DEST to SRC + OFFSET for
   a non-polynomial OFFSET.  MODE is the mode of the addition.
   FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
   be set and CFA adjustments added to the generated instructions.

   TEMP1, if nonnull, is a register of mode MODE that can be used as a
   temporary if register allocation is already complete.  This temporary
   register may overlap DEST but must not overlap SRC.  If TEMP1 is known
   to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
   the immediate again.

   Since this function may be used to adjust the stack pointer, we must
   ensure that it cannot cause transient stack deallocation (for example
   by first incrementing SP and then decrementing when adjusting by a
   large immediate).  */

static void
aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
                      rtx src, HOST_WIDE_INT offset, rtx temp1,
                      bool frame_related_p, bool emit_move_imm)
{
  gcc_assert (emit_move_imm || temp1 != NULL_RTX);
  gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));

  unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
  rtx_insn *insn;

  if (!moffset)
    {
      if (!rtx_equal_p (dest, src))
        {
          insn = emit_insn (gen_rtx_SET (dest, src));
          RTX_FRAME_RELATED_P (insn) = frame_related_p;
        }
      return;
    }

  /* Single instruction adjustment.  */
  if (aarch64_uimm12_shift (moffset))
    {
      insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
      RTX_FRAME_RELATED_P (insn) = frame_related_p;
      return;
    }

  /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
     and either:

     a) the offset cannot be loaded by a 16-bit move or
     b) there is no spare register into which we can move it.  */
  if (moffset < 0x1000000
      && ((!temp1 && !can_create_pseudo_p ())
          || !aarch64_move_imm (moffset, mode)))
    {
      HOST_WIDE_INT low_off = moffset & 0xfff;

      low_off = offset < 0 ? -low_off : low_off;
      insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
      RTX_FRAME_RELATED_P (insn) = frame_related_p;
      insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
      RTX_FRAME_RELATED_P (insn) = frame_related_p;
      return;
    }

  /* Emit a move immediate if required and an addition/subtraction.  */
  if (emit_move_imm)
    {
      gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
      temp1 = aarch64_force_temporary (mode, temp1,
                                       gen_int_mode (moffset, mode));
    }
  insn = emit_insn (offset < 0
                    ? gen_sub3_insn (dest, src, temp1)
                    : gen_add3_insn (dest, src, temp1));
  if (frame_related_p)
    {
      RTX_FRAME_RELATED_P (insn) = frame_related_p;
      rtx adj = plus_constant (mode, src, offset);
      add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
    }
}
/* Return the number of temporary registers that aarch64_add_offset
   would need to move OFFSET into a register or add OFFSET to a register;
   ADD_P is true if we want the latter rather than the former.  */

static unsigned int
aarch64_offset_temporaries (bool add_p, poly_int64 offset)
{
  /* This follows the same structure as aarch64_add_offset.  */
  if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
    return 0;

  unsigned int count = 0;
  HOST_WIDE_INT factor = offset.coeffs[1];
  HOST_WIDE_INT constant = offset.coeffs[0] - factor;
  poly_int64 poly_offset (factor, factor);
  if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
    /* Need one register for the ADDVL/ADDPL result.  */
    count += 1;
  else if (factor != 0)
    {
      factor = abs (factor);
      if (factor > 16 * (factor & -factor))
        /* Need one register for the CNT result and one for the multiplication
           factor.  If necessary, the second temporary can be reused for the
           constant part of the offset.  */
        return 2;
      /* Need one register for the CNT result (which might then
         be shifted).  */
      count += 1;
    }
  return count + aarch64_add_offset_1_temporaries (constant);
}
/* If X can be represented as a poly_int64, return the number
   of temporaries that are required to add it to a register.
   Return -1 otherwise.  */

int
aarch64_add_offset_temporaries (rtx x)
{
  poly_int64 offset;
  if (!poly_int_rtx_p (x, &offset))
    return -1;
  return aarch64_offset_temporaries (true, offset);
}
/* Set DEST to SRC + OFFSET.  MODE is the mode of the addition.
   FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
   be set and CFA adjustments added to the generated instructions.

   TEMP1, if nonnull, is a register of mode MODE that can be used as a
   temporary if register allocation is already complete.  This temporary
   register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
   If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
   false to avoid emitting the immediate again.

   TEMP2, if nonnull, is a second temporary register that doesn't
   overlap either DEST or REG.

   Since this function may be used to adjust the stack pointer, we must
   ensure that it cannot cause transient stack deallocation (for example
   by first incrementing SP and then decrementing when adjusting by a
   large immediate).  */

static void
aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
                    poly_int64 offset, rtx temp1, rtx temp2,
                    bool frame_related_p, bool emit_move_imm = true)
{
  gcc_assert (emit_move_imm || temp1 != NULL_RTX);
  gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
  gcc_assert (temp1 == NULL_RTX
              || !frame_related_p
              || !reg_overlap_mentioned_p (temp1, dest));
  gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));

  /* Try using ADDVL or ADDPL to add the whole value.  */
  if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
    {
      rtx offset_rtx = gen_int_mode (offset, mode);
      rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
      RTX_FRAME_RELATED_P (insn) = frame_related_p;
      return;
    }

  /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
     SVE vector register, over and above the minimum size of 128 bits.
     This is equivalent to half the value returned by CNTD with a
     vector shape of ALL.  */
  HOST_WIDE_INT factor = offset.coeffs[1];
  HOST_WIDE_INT constant = offset.coeffs[0] - factor;

  /* Try using ADDVL or ADDPL to add the VG-based part.  */
  poly_int64 poly_offset (factor, factor);
  if (src != const0_rtx
      && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
    {
      rtx offset_rtx = gen_int_mode (poly_offset, mode);
      if (frame_related_p)
        {
          rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
          RTX_FRAME_RELATED_P (insn) = true;
          src = dest;
        }
      else
        {
          rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
          src = aarch64_force_temporary (mode, temp1, addr);
          temp1 = temp2;
          temp2 = NULL_RTX;
        }
    }
  /* Otherwise use a CNT-based sequence.  */
  else if (factor != 0)
    {
      /* Use a subtraction if we have a negative factor.  */
      rtx_code code = PLUS;
      if (factor < 0)
        {
          factor = -factor;
          code = MINUS;
        }

      /* Calculate CNTD * FACTOR / 2.  First try to fold the division
         into the multiplication.  */
      rtx val;
      int shift = 0;
      if (factor & 1)
        /* Use a right shift by 1.  */
        shift = -1;
      else
        factor /= 2;
      HOST_WIDE_INT low_bit = factor & -factor;
      if (factor <= 16 * low_bit)
        {
          if (factor > 16 * 8)
            {
              /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
                 the value with the minimum multiplier and shift it into
                 position.  */
              int extra_shift = exact_log2 (low_bit);
              shift += extra_shift;
              factor >>= extra_shift;
            }
          val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
        }
      else
        {
          /* Base the factor on LOW_BIT if we can calculate LOW_BIT
             directly, since that should increase the chances of being
             able to use a shift and add sequence.  If LOW_BIT itself
             is out of range, just use CNTD.  */
          if (low_bit <= 16 * 8)
            factor /= low_bit;
          else
            low_bit = 1;

          val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
          val = aarch64_force_temporary (mode, temp1, val);

          if (can_create_pseudo_p ())
            {
              rtx coeff1 = gen_int_mode (factor, mode);
              val = expand_mult (mode, val, coeff1, NULL_RTX, false, true);
            }
          else
            {
              /* Go back to using a negative multiplication factor if we have
                 no register from which to subtract.  */
              if (code == MINUS && src == const0_rtx)
                {
                  factor = -factor;
                  code = PLUS;
                }
              rtx coeff1 = gen_int_mode (factor, mode);
              coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
              val = gen_rtx_MULT (mode, val, coeff1);
            }
        }

      if (shift > 0)
        {
          /* Multiply by 1 << SHIFT.  */
          val = aarch64_force_temporary (mode, temp1, val);
          val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
        }
      else if (shift == -1)
        {
          /* Divide by 2.  */
          val = aarch64_force_temporary (mode, temp1, val);
          val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
        }

      /* Calculate SRC +/- CNTD * FACTOR / 2.  */
      if (src != const0_rtx)
        {
          val = aarch64_force_temporary (mode, temp1, val);
          val = gen_rtx_fmt_ee (code, mode, src, val);
        }
      else if (code == MINUS)
        {
          val = aarch64_force_temporary (mode, temp1, val);
          val = gen_rtx_NEG (mode, val);
        }

      if (constant == 0 || frame_related_p)
        {
          rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
          if (frame_related_p)
            {
              RTX_FRAME_RELATED_P (insn) = true;
              add_reg_note (insn, REG_CFA_ADJUST_CFA,
                            gen_rtx_SET (dest, plus_constant (Pmode, src,
                                                              poly_offset)));
            }
          src = dest;
          if (constant == 0)
            return;
        }
      else
        {
          src = aarch64_force_temporary (mode, temp1, val);
          temp1 = temp2;
          temp2 = NULL_RTX;
        }

      emit_move_imm = true;
    }

  aarch64_add_offset_1 (mode, dest, src, constant, temp1,
                        frame_related_p, emit_move_imm);
}
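
/* Illustrative example (not from the original source): an offset of exactly
   one SVE vector, i.e. poly_int64 (16, 16) bytes, satisfies the ADDVL test
   above and is added with a single "addvl"; an offset such as five vectors
   plus 12 bytes instead uses "addvl ..., #5" for the VG-based part followed
   by an ordinary immediate add for the remaining constant 12.  */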
/* Like aarch64_add_offset, but the offset is given as an rtx rather
   than a poly_int64.  */

void
aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
                          rtx offset_rtx, rtx temp1, rtx temp2)
{
  aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
                      temp1, temp2, false);
}
/* Add DELTA to the stack pointer, marking the instructions frame-related.
   TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
   if TEMP1 already contains abs (DELTA).  */

static inline void
aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
{
  aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
                      temp1, temp2, true, emit_move_imm);
}
/* Subtract DELTA from the stack pointer, marking the instructions
   frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
   if nonnull.  */

static inline void
aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
                bool emit_move_imm = true)
{
  aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
                      temp1, temp2, frame_related_p, emit_move_imm);
}
/* Set DEST to (vec_series BASE STEP).  */

static void
aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
{
  machine_mode mode = GET_MODE (dest);
  scalar_mode inner = GET_MODE_INNER (mode);

  /* Each operand can be a register or an immediate in the range [-16, 15].  */
  if (!aarch64_sve_index_immediate_p (base))
    base = force_reg (inner, base);
  if (!aarch64_sve_index_immediate_p (step))
    step = force_reg (inner, step);

  emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
}
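
/* For example (illustrative): with an immediate base and step this becomes a
   single INDEX instruction such as "index z0.s, #0, #1", which materializes
   { 0, 1, 2, ... }; bases or steps outside [-16, 15] are first moved into
   scalar registers, e.g. "index z0.s, w1, w2".  */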
/* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
   register of mode MODE.  Use TARGET for the result if it's nonnull
   and convenient.

   The two vector modes must have the same element mode.  The behavior
   is to duplicate architectural lane N of SRC into architectural lanes
   N + I * STEP of the result.  On big-endian targets, architectural
   lane 0 of an Advanced SIMD vector is the last element of the vector
   in memory layout, so for big-endian targets this operation has the
   effect of reversing SRC before duplicating it.  Callers need to
   account for this.  */

rtx
aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
{
  machine_mode src_mode = GET_MODE (src);
  gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
  insn_code icode = (BYTES_BIG_ENDIAN
                     ? code_for_aarch64_vec_duplicate_vq_be (mode)
                     : code_for_aarch64_vec_duplicate_vq_le (mode));

  unsigned int i = 0;
  expand_operand ops[3];
  create_output_operand (&ops[i++], target, mode);
  create_output_operand (&ops[i++], src, src_mode);
  if (BYTES_BIG_ENDIAN)
    {
      /* Create a PARALLEL describing the reversal of SRC.  */
      unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
      rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
                                                  nelts_per_vq - 1, -1);
      create_fixed_operand (&ops[i++], sel);
    }
  expand_insn (icode, i, ops);
  return ops[0].value;
}
/* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
   the memory image into DEST.  Return true on success.  */

static bool
aarch64_expand_sve_ld1rq (rtx dest, rtx src)
{
  src = force_const_mem (GET_MODE (src), src);
  if (!src)
    return false;

  /* Make sure that the address is legitimate.  */
  if (!aarch64_sve_ld1rq_operand_p (src))
    {
      rtx addr = force_reg (Pmode, XEXP (src, 0));
      src = replace_equiv_address (src, addr);
    }

  machine_mode mode = GET_MODE (dest);
  machine_mode pred_mode = aarch64_sve_pred_mode (mode);
  rtx ptrue = aarch64_ptrue_reg (pred_mode);
  emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
  return true;
}
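
/* LD1RQ loads one 128-bit quadword from memory and replicates it across the
   whole vector, e.g. (illustrative) "ld1rqw { z0.s }, p0/z, [x0]", which is
   why a PTRUE governing predicate is created above.  */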
4533 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
4534 SVE data mode and isn't a legitimate constant. Use TARGET for the
4535 result if convenient.
4537 The returned register can have whatever mode seems most natural
4538 given the contents of SRC. */
4541 aarch64_expand_sve_const_vector (rtx target
, rtx src
)
4543 machine_mode mode
= GET_MODE (src
);
4544 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (src
);
4545 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (src
);
4546 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
4547 unsigned int elt_bits
= GET_MODE_BITSIZE (elt_mode
);
4548 unsigned int container_bits
= aarch64_sve_container_bits (mode
);
4549 unsigned int encoded_bits
= npatterns
* nelts_per_pattern
* container_bits
;
4551 if (nelts_per_pattern
== 1
4552 && encoded_bits
<= 128
4553 && container_bits
!= elt_bits
)
4555 /* We have a partial vector mode and a constant whose full-vector
4556 equivalent would occupy a repeating 128-bit sequence. Build that
4557 full-vector equivalent instead, so that we have the option of
4558 using LD1RQ and Advanced SIMD operations. */
4559 unsigned int repeat
= container_bits
/ elt_bits
;
4560 machine_mode full_mode
= aarch64_full_sve_mode (elt_mode
).require ();
4561 rtx_vector_builder
builder (full_mode
, npatterns
* repeat
, 1);
4562 for (unsigned int i
= 0; i
< npatterns
; ++i
)
4563 for (unsigned int j
= 0; j
< repeat
; ++j
)
4564 builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, i
));
4565 target
= aarch64_target_reg (target
, full_mode
);
4566 return aarch64_expand_sve_const_vector (target
, builder
.build ());
4569 if (nelts_per_pattern
== 1 && encoded_bits
== 128)
4571 /* The constant is a duplicated quadword but can't be narrowed
4572 beyond a quadword. Get the memory image of the first quadword
4573 as a 128-bit vector and try using LD1RQ to load it from memory.
4575 The effect for both endiannesses is to load memory lane N into
4576 architectural lanes N + I * STEP of the result. On big-endian
4577 targets, the layout of the 128-bit vector in an Advanced SIMD
4578 register would be different from its layout in an SVE register,
4579 but this 128-bit vector is a memory value only. */
4580 machine_mode vq_mode
= aarch64_vq_mode (elt_mode
).require ();
4581 rtx vq_value
= simplify_gen_subreg (vq_mode
, src
, mode
, 0);
4582 if (vq_value
&& aarch64_expand_sve_ld1rq (target
, vq_value
))
4586 if (nelts_per_pattern
== 1 && encoded_bits
< 128)
4588 /* The vector is a repeating sequence of 64 bits or fewer.
4589 See if we can load them using an Advanced SIMD move and then
4590 duplicate it to fill a vector. This is better than using a GPR
4591 move because it keeps everything in the same register file. */
4592 machine_mode vq_mode
= aarch64_vq_mode (elt_mode
).require ();
4593 rtx_vector_builder
builder (vq_mode
, npatterns
, 1);
4594 for (unsigned int i
= 0; i
< npatterns
; ++i
)
4596 /* We want memory lane N to go into architectural lane N,
4597 so reverse for big-endian targets. The DUP .Q pattern
4598 has a compensating reverse built-in. */
4599 unsigned int srci
= BYTES_BIG_ENDIAN
? npatterns
- i
- 1 : i
;
4600 builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, srci
));
4602 rtx vq_src
= builder
.build ();
4603 if (aarch64_simd_valid_immediate (vq_src
, NULL
))
4605 vq_src
= force_reg (vq_mode
, vq_src
);
4606 return aarch64_expand_sve_dupq (target
, mode
, vq_src
);
4609 /* Get an integer representation of the repeating part of Advanced
4610 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
4611 which for big-endian targets is lane-swapped wrt a normal
4612 Advanced SIMD vector. This means that for both endiannesses,
4613 memory lane N of SVE vector SRC corresponds to architectural
4614 lane N of a register holding VQ_SRC. This in turn means that
4615 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
4616 as a single 128-bit value) and thus that memory lane 0 of SRC is
4617 in the lsb of the integer. Duplicating the integer therefore
4618 ensures that memory lane N of SRC goes into architectural lane
4619 N + I * INDEX of the SVE register. */
4620 scalar_mode int_mode
= int_mode_for_size (encoded_bits
, 0).require ();
4621 rtx elt_value
= simplify_gen_subreg (int_mode
, vq_src
, vq_mode
, 0);
4624 /* Pretend that we had a vector of INT_MODE to start with. */
4625 elt_mode
= int_mode
;
4626 mode
= aarch64_full_sve_mode (int_mode
).require ();
4628 /* If the integer can be moved into a general register by a
4629 single instruction, do that and duplicate the result. */
4630 if (CONST_INT_P (elt_value
)
4631 && aarch64_move_imm (INTVAL (elt_value
), elt_mode
))
4633 elt_value
= force_reg (elt_mode
, elt_value
);
4634 return expand_vector_broadcast (mode
, elt_value
);
4637 else if (npatterns
== 1)
4638 /* We're duplicating a single value, but can't do better than
4639 force it to memory and load from there. This handles things
4640 like symbolic constants. */
4641 elt_value
= CONST_VECTOR_ENCODED_ELT (src
, 0);
4645 /* Load the element from memory if we can, otherwise move it into
4646 a register and use a DUP. */
4647 rtx op
= force_const_mem (elt_mode
, elt_value
);
4649 op
= force_reg (elt_mode
, elt_value
);
4650 return expand_vector_broadcast (mode
, op
);
4654 /* Try using INDEX. */
4656 if (const_vec_series_p (src
, &base
, &step
))
4658 aarch64_expand_vec_series (target
, base
, step
);
4662 /* From here on, it's better to force the whole constant to memory
4664 if (GET_MODE_NUNITS (mode
).is_constant ())
4667 /* Expand each pattern individually. */
4668 gcc_assert (npatterns
> 1);
4669 rtx_vector_builder builder
;
4670 auto_vec
<rtx
, 16> vectors (npatterns
);
4671 for (unsigned int i
= 0; i
< npatterns
; ++i
)
4673 builder
.new_vector (mode
, 1, nelts_per_pattern
);
4674 for (unsigned int j
= 0; j
< nelts_per_pattern
; ++j
)
4675 builder
.quick_push (CONST_VECTOR_ELT (src
, i
+ j
* npatterns
));
4676 vectors
.quick_push (force_reg (mode
, builder
.build ()));
4679 /* Use permutes to interleave the separate vectors. */
4680 while (npatterns
> 1)
4683 for (unsigned int i
= 0; i
< npatterns
; ++i
)
4685 rtx tmp
= (npatterns
== 1 ? target
: gen_reg_rtx (mode
));
4686 rtvec v
= gen_rtvec (2, vectors
[i
], vectors
[i
+ npatterns
]);
4687 emit_set_insn (tmp
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
4691 gcc_assert (vectors
[0] == target
);
/* Use WHILE to set a predicate register of mode MODE in which the first
   VL bits are set and the rest are clear.  Use TARGET for the register
   if it's nonnull and convenient.  */

static rtx
aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
                                 unsigned int vl)
{
  rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
  target = aarch64_target_reg (target, mode);
  emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
                        target, const0_rtx, limit));
  return target;
}
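
/* For example (illustrative): for VL == 3 this emits the equivalent of
   "whilelo p0.<T>, xzr, x1" with x1 holding 3, which sets the first three
   elements of the predicate and clears the rest.  */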
static rtx
aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
/* BUILDER is a constant predicate in which the index of every set bit
   is a multiple of ELT_SIZE (which is <= 8).  Try to load the constant
   by inverting every element at a multiple of ELT_SIZE and EORing the
   result with an ELT_SIZE PTRUE.

   Return a register that contains the constant on success, otherwise
   return null.  Use TARGET as the register if it is nonnull and
   convenient.  */

static rtx
aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
                                   unsigned int elt_size)
{
  /* Invert every element at a multiple of ELT_SIZE, keeping the
     other bits zero.  */
  rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
                                  builder.nelts_per_pattern ());
  for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
    if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
      inv_builder.quick_push (const1_rtx);
    else
      inv_builder.quick_push (const0_rtx);
  inv_builder.finalize ();

  /* See if we can load the constant cheaply.  */
  rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
  if (!inv)
    return NULL_RTX;

  /* EOR the result with an ELT_SIZE PTRUE.  */
  rtx mask = aarch64_ptrue_all (elt_size);
  mask = force_reg (VNx16BImode, mask);
  target = aarch64_target_reg (target, VNx16BImode);
  emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
  return target;
}
4750 /* BUILDER is a constant predicate in which the index of every set bit
4751 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4752 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
4753 register on success, otherwise return null. Use TARGET as the register
4754 if nonnull and convenient. */
4757 aarch64_expand_sve_const_pred_trn (rtx target
, rtx_vector_builder
&builder
,
4758 unsigned int elt_size
,
4759 unsigned int permute_size
)
4761 /* We're going to split the constant into two new constants A and B,
4762 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
4763 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
4765 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
4766 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
4768 where _ indicates elements that will be discarded by the permute.
4770 First calculate the ELT_SIZEs for A and B. */
4771 unsigned int a_elt_size
= GET_MODE_SIZE (DImode
);
4772 unsigned int b_elt_size
= GET_MODE_SIZE (DImode
);
4773 for (unsigned int i
= 0; i
< builder
.encoded_nelts (); i
+= elt_size
)
4774 if (INTVAL (builder
.elt (i
)) != 0)
4776 if (i
& permute_size
)
4777 b_elt_size
|= i
- permute_size
;
4781 a_elt_size
&= -a_elt_size
;
4782 b_elt_size
&= -b_elt_size
;
4784 /* Now construct the vectors themselves. */
4785 rtx_vector_builder
a_builder (VNx16BImode
, builder
.npatterns (),
4786 builder
.nelts_per_pattern ());
4787 rtx_vector_builder
b_builder (VNx16BImode
, builder
.npatterns (),
4788 builder
.nelts_per_pattern ());
4789 unsigned int nelts
= builder
.encoded_nelts ();
4790 for (unsigned int i
= 0; i
< nelts
; ++i
)
4791 if (i
& (elt_size
- 1))
4793 a_builder
.quick_push (const0_rtx
);
4794 b_builder
.quick_push (const0_rtx
);
4796 else if ((i
& permute_size
) == 0)
4798 /* The A and B elements are significant. */
4799 a_builder
.quick_push (builder
.elt (i
));
4800 b_builder
.quick_push (builder
.elt (i
+ permute_size
));
4804 /* The A and B elements are going to be discarded, so pick whatever
4805 is likely to give a nice constant. We are targeting element
4806 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
4807 with the aim of each being a sequence of ones followed by
4808 a sequence of zeros. So:
4810 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
4811 duplicate the last X_ELT_SIZE element, to extend the
4812 current sequence of ones or zeros.
4814 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
4815 zero, so that the constant really does have X_ELT_SIZE and
4816 not a smaller size. */
4817 if (a_elt_size
> permute_size
)
4818 a_builder
.quick_push (const0_rtx
);
4820 a_builder
.quick_push (a_builder
.elt (i
- a_elt_size
));
4821 if (b_elt_size
> permute_size
)
4822 b_builder
.quick_push (const0_rtx
);
4824 b_builder
.quick_push (b_builder
.elt (i
- b_elt_size
));
4826 a_builder
.finalize ();
4827 b_builder
.finalize ();
4829 /* Try loading A into a register. */
4830 rtx_insn
*last
= get_last_insn ();
4831 rtx a
= aarch64_expand_sve_const_pred_1 (NULL_RTX
, a_builder
, false);
4835 /* Try loading B into a register. */
4837 if (a_builder
!= b_builder
)
4839 b
= aarch64_expand_sve_const_pred_1 (NULL_RTX
, b_builder
, false);
4842 delete_insns_since (last
);
4847 /* Emit the TRN1 itself. */
4848 machine_mode mode
= aarch64_sve_pred_mode (permute_size
).require ();
4849 target
= aarch64_target_reg (target
, mode
);
4850 emit_insn (gen_aarch64_sve (UNSPEC_TRN1
, mode
, target
,
4851 gen_lowpart (mode
, a
),
4852 gen_lowpart (mode
, b
)));
/* Subroutine of aarch64_expand_sve_const_pred.  Try to load the VNx16BI
   constant in BUILDER into an SVE predicate register.  Return the register
   on success, otherwise return null.  Use TARGET for the register if
   nonnull and convenient.

   ALLOW_RECURSE_P is true if we can use methods that would call this
   function recursively.  */

static rtx
aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
                                 bool allow_recurse_p)
{
  if (builder.encoded_nelts () == 1)
    /* A PFALSE or a PTRUE .B ALL.  */
    return aarch64_emit_set_immediate (target, builder);

  unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
  if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
    {
      /* If we can load the constant using PTRUE, use it as-is.  */
      machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
      if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
        return aarch64_emit_set_immediate (target, builder);

      /* Otherwise use WHILE to set the first VL bits.  */
      return aarch64_sve_move_pred_via_while (target, mode, vl);
    }

  if (!allow_recurse_p)
    return NULL_RTX;

  /* Try inverting the vector in element size ELT_SIZE and then EORing
     the result with an ELT_SIZE PTRUE.  */
  if (INTVAL (builder.elt (0)) == 0)
    if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
                                                     elt_size))
      return res;

  /* Try using TRN1 to permute two simpler constants.  */
  for (unsigned int i = elt_size; i <= 8; i *= 2)
    if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
                                                     elt_size, i))
      return res;

  return NULL_RTX;
}
/* Return an SVE predicate register that contains the VNx16BImode
   constant in BUILDER, without going through the move expanders.

   The returned register can have whatever mode seems most natural
   given the contents of BUILDER.  Use TARGET for the result if
   convenient.  */

static rtx
aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
{
  /* Try loading the constant using pure predicate operations.  */
  if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
    return res;

  /* Try forcing the constant to memory.  */
  if (builder.full_nelts ().is_constant ())
    if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
      {
        target = aarch64_target_reg (target, VNx16BImode);
        emit_move_insn (target, mem);
        return target;
      }

  /* The last resort is to load the constant as an integer and then
     compare it against zero.  Use -1 for set bits in order to increase
     the chances of using SVE DUPM or an Advanced SIMD byte mask.  */
  rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
                                  builder.nelts_per_pattern ());
  for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
    int_builder.quick_push (INTVAL (builder.elt (i))
                            ? constm1_rtx : const0_rtx);
  return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
                                           int_builder.build ());
}
4938 /* Set DEST to immediate IMM. */
4941 aarch64_expand_mov_immediate (rtx dest
, rtx imm
)
4943 machine_mode mode
= GET_MODE (dest
);
4945 /* Check on what type of symbol it is. */
4946 scalar_int_mode int_mode
;
4947 if ((GET_CODE (imm
) == SYMBOL_REF
4948 || GET_CODE (imm
) == LABEL_REF
4949 || GET_CODE (imm
) == CONST
4950 || GET_CODE (imm
) == CONST_POLY_INT
)
4951 && is_a
<scalar_int_mode
> (mode
, &int_mode
))
4955 HOST_WIDE_INT const_offset
;
4956 enum aarch64_symbol_type sty
;
4958 /* If we have (const (plus symbol offset)), separate out the offset
4959 before we start classifying the symbol. */
4960 rtx base
= strip_offset (imm
, &offset
);
4962 /* We must always add an offset involving VL separately, rather than
4963 folding it into the relocation. */
4964 if (!offset
.is_constant (&const_offset
))
4968 aarch64_report_sve_required ();
4971 if (base
== const0_rtx
&& aarch64_sve_cnt_immediate_p (offset
))
4972 emit_insn (gen_rtx_SET (dest
, imm
));
4975 /* Do arithmetic on 32-bit values if the result is smaller
4977 if (partial_subreg_p (int_mode
, SImode
))
4979 /* It is invalid to do symbol calculations in modes
4980 narrower than SImode. */
4981 gcc_assert (base
== const0_rtx
);
4982 dest
= gen_lowpart (SImode
, dest
);
4985 if (base
!= const0_rtx
)
4987 base
= aarch64_force_temporary (int_mode
, dest
, base
);
4988 aarch64_add_offset (int_mode
, dest
, base
, offset
,
4989 NULL_RTX
, NULL_RTX
, false);
4992 aarch64_add_offset (int_mode
, dest
, base
, offset
,
4993 dest
, NULL_RTX
, false);
4998 sty
= aarch64_classify_symbol (base
, const_offset
);
5001 case SYMBOL_FORCE_TO_MEM
:
5002 if (const_offset
!= 0
5003 && targetm
.cannot_force_const_mem (int_mode
, imm
))
5005 gcc_assert (can_create_pseudo_p ());
5006 base
= aarch64_force_temporary (int_mode
, dest
, base
);
5007 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
5008 NULL_RTX
, NULL_RTX
, false);
5012 mem
= force_const_mem (ptr_mode
, imm
);
5015 /* If we aren't generating PC relative literals, then
5016 we need to expand the literal pool access carefully.
5017 This is something that needs to be done in a number
5018 of places, so could well live as a separate function. */
5019 if (!aarch64_pcrelative_literal_loads
)
5021 gcc_assert (can_create_pseudo_p ());
5022 base
= gen_reg_rtx (ptr_mode
);
5023 aarch64_expand_mov_immediate (base
, XEXP (mem
, 0));
5024 if (ptr_mode
!= Pmode
)
5025 base
= convert_memory_address (Pmode
, base
);
5026 mem
= gen_rtx_MEM (ptr_mode
, base
);
5029 if (int_mode
!= ptr_mode
)
5030 mem
= gen_rtx_ZERO_EXTEND (int_mode
, mem
);
5032 emit_insn (gen_rtx_SET (dest
, mem
));
5036 case SYMBOL_SMALL_TLSGD
:
5037 case SYMBOL_SMALL_TLSDESC
:
5038 case SYMBOL_SMALL_TLSIE
:
5039 case SYMBOL_SMALL_GOT_28K
:
5040 case SYMBOL_SMALL_GOT_4G
:
5041 case SYMBOL_TINY_GOT
:
5042 case SYMBOL_TINY_TLSIE
:
5043 if (const_offset
!= 0)
5045 gcc_assert(can_create_pseudo_p ());
5046 base
= aarch64_force_temporary (int_mode
, dest
, base
);
5047 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
5048 NULL_RTX
, NULL_RTX
, false);
5053 case SYMBOL_SMALL_ABSOLUTE
:
5054 case SYMBOL_TINY_ABSOLUTE
:
5055 case SYMBOL_TLSLE12
:
5056 case SYMBOL_TLSLE24
:
5057 case SYMBOL_TLSLE32
:
5058 case SYMBOL_TLSLE48
:
5059 aarch64_load_symref_appropriately (dest
, imm
, sty
);
5067 if (!CONST_INT_P (imm
))
5069 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
)
5071 /* Only the low bit of each .H, .S and .D element is defined,
5072 so we can set the upper bits to whatever we like. If the
5073 predicate is all-true in MODE, prefer to set all the undefined
5074 bits as well, so that we can share a single .B predicate for
5076 if (imm
== CONSTM1_RTX (mode
))
5077 imm
= CONSTM1_RTX (VNx16BImode
);
5079 /* All methods for constructing predicate modes wider than VNx16BI
5080 will set the upper bits of each element to zero. Expose this
5081 by moving such constants as a VNx16BI, so that all bits are
5082 significant and so that constants for different modes can be
5083 shared. The wider constant will still be available as a
5085 rtx_vector_builder builder
;
5086 if (aarch64_get_sve_pred_bits (builder
, imm
))
5088 rtx res
= aarch64_expand_sve_const_pred (dest
, builder
);
5090 emit_move_insn (dest
, gen_lowpart (mode
, res
));
5095 if (GET_CODE (imm
) == HIGH
5096 || aarch64_simd_valid_immediate (imm
, NULL
))
5098 emit_insn (gen_rtx_SET (dest
, imm
));
5102 if (GET_CODE (imm
) == CONST_VECTOR
&& aarch64_sve_data_mode_p (mode
))
5103 if (rtx res
= aarch64_expand_sve_const_vector (dest
, imm
))
5106 emit_insn (gen_aarch64_sve_reinterpret (mode
, dest
, res
));
5110 rtx mem
= force_const_mem (mode
, imm
);
5112 emit_move_insn (dest
, mem
);
5116 aarch64_internal_mov_immediate (dest
, imm
, true,
5117 as_a
<scalar_int_mode
> (mode
));
/* Emit an SVE predicated move from SRC to DEST.  PRED is a predicate
   that is known to contain PTRUE.  */

void
aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
{
  expand_operand ops[3];
  machine_mode mode = GET_MODE (dest);
  create_output_operand (&ops[0], dest, mode);
  create_input_operand (&ops[1], pred, GET_MODE (pred));
  create_input_operand (&ops[2], src, mode);
  temporary_volatile_ok v (true);
  expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
}
/* Expand a pre-RA SVE data move from SRC to DEST in which at least one
   operand is in memory.  In this case we need to use the predicated LD1
   and ST1 instead of LDR and STR, both for correctness on big-endian
   targets and because LD1 and ST1 support a wider range of addressing modes.
   PRED_MODE is the mode of the predicate.

   See the comment at the head of aarch64-sve.md for details about the
   big-endian handling.  */

void
aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
{
  machine_mode mode = GET_MODE (dest);
  rtx ptrue = aarch64_ptrue_reg (pred_mode);
  if (!register_operand (src, mode)
      && !register_operand (dest, mode))
    {
      rtx tmp = gen_reg_rtx (mode);
      if (MEM_P (src))
        aarch64_emit_sve_pred_move (tmp, ptrue, src);
      else
        emit_move_insn (tmp, src);
      src = tmp;
    }
  aarch64_emit_sve_pred_move (dest, ptrue, src);
}
/* Called only on big-endian targets.  See whether an SVE vector move
   from SRC to DEST is effectively a REV[BHW] instruction, because at
   least one operand is a subreg of an SVE vector that has wider or
   narrower elements.  Return true and emit the instruction if so.

   For example:

     (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))

   represents a VIEW_CONVERT between the following vectors, viewed
   in memory order:

     R2: { [0].high, [0].low,  [1].high, [1].low, ... }
     R1: { [0],      [1],      [2],      [3],     ... }

   The high part of lane X in R2 should therefore correspond to lane X*2
   of R1, but the register representations are:

         msb                                      lsb
     R2: ...... [1].high  [1].low   [0].high  [0].low
     R1: ...... [3]       [2]       [1]       [0]

   where the low part of lane X in R2 corresponds to lane X*2 in R1.
   We therefore need a reverse operation to swap the high and low values
   around.

   This is purely an optimization.  Without it we would spill the
   subreg operand to the stack in one mode and reload it in the
   other mode, which has the same effect as the REV.  */

bool
aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
{
  gcc_assert (BYTES_BIG_ENDIAN);
  if (GET_CODE (dest) == SUBREG)
    dest = SUBREG_REG (dest);
  if (GET_CODE (src) == SUBREG)
    src = SUBREG_REG (src);

  /* The optimization handles two single SVE REGs with different element
     sizes.  */
  if (!REG_P (dest)
      || !REG_P (src)
      || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
      || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
      || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
          == GET_MODE_UNIT_SIZE (GET_MODE (src))))
    return false;

  /* Generate *aarch64_sve_mov<mode>_subreg_be.  */
  rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
  rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
                               UNSPEC_REV_SUBREG);
  emit_insn (gen_rtx_SET (dest, unspec));
  return true;
}
/* Return a copy of X with mode MODE, without changing its other
   attributes.  Unlike gen_lowpart, this doesn't care whether the
   mode change is valid.  */

rtx
aarch64_replace_reg_mode (rtx x, machine_mode mode)
{
  if (GET_MODE (x) == mode)
    return x;

  x = shallow_copy_rtx (x);
  set_mode_and_regno (x, mode, REGNO (x));
  return x;
}
/* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
   stored in wider integer containers.  */

static unsigned int
aarch64_sve_rev_unspec (machine_mode mode)
{
  switch (GET_MODE_UNIT_SIZE (mode))
    {
    case 1: return UNSPEC_REVB;
    case 2: return UNSPEC_REVH;
    case 4: return UNSPEC_REVW;
    default: gcc_unreachable ();
    }
}
/* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
   operands.  */

void
aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
{
  /* Decide which REV operation we need.  The mode with wider elements
     determines the mode of the operands and the mode with the narrower
     elements determines the reverse width.  */
  machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
  machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
  if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
      < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
    std::swap (mode_with_wider_elts, mode_with_narrower_elts);

  unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
  machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);

  /* Get the operands in the appropriate modes and emit the instruction.  */
  ptrue = gen_lowpart (pred_mode, ptrue);
  dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
  src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
  emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
                               dest, ptrue, src));
}
/* Implement TARGET_FUNCTION_OK_FOR_SIBCALL.  */

static bool
aarch64_function_ok_for_sibcall (tree, tree exp)
{
  if (crtl->abi->id () != expr_callee_abi (exp).id ())
    return false;

  return true;
}
/* Subroutine of aarch64_pass_by_reference for arguments that are not
   passed in SVE registers.  */

static bool
aarch64_pass_by_reference_1 (const function_arg_info &arg)
{
  HOST_WIDE_INT size;
  machine_mode dummymode;
  int nregs;

  /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
  if (arg.mode == BLKmode && arg.type)
    size = int_size_in_bytes (arg.type);
  else
    /* No frontends can create types with variable-sized modes, so we
       shouldn't be asked to pass or return them.  */
    size = GET_MODE_SIZE (arg.mode).to_constant ();

  /* Aggregates are passed by reference based on their size.  */
  if (arg.aggregate_type_p ())
    size = int_size_in_bytes (arg.type);

  /* Variable sized arguments are always returned by reference.  */
  if (size < 0)
    return true;

  /* Can this be a candidate to be passed in fp/simd register(s)?  */
  if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
                                               &dummymode, &nregs, NULL))
    return false;

  /* Arguments which are variable sized or larger than 2 registers are
     passed by reference unless they are a homogenous floating point
     aggregate.  */
  return size > 2 * UNITS_PER_WORD;
}
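
/* Illustrative examples of the rule above: a plain struct of four int64_t
   members (32 bytes) is passed by reference, whereas a homogeneous
   floating-point aggregate of four doubles, although also 32 bytes, is a
   fp/simd candidate and is therefore passed by value in vector registers.  */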
5322 /* Implement TARGET_PASS_BY_REFERENCE. */
5325 aarch64_pass_by_reference (cumulative_args_t pcum_v
,
5326 const function_arg_info
&arg
)
5328 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
5331 return aarch64_pass_by_reference_1 (arg
);
5333 pure_scalable_type_info pst_info
;
5334 switch (pst_info
.analyze (arg
.type
))
5336 case pure_scalable_type_info::IS_PST
:
5337 if (pcum
&& !pcum
->silent_p
&& !TARGET_SVE
)
5338 /* We can't gracefully recover at this point, so make this a
5340 fatal_error (input_location
, "arguments of type %qT require"
5341 " the SVE ISA extension", arg
.type
);
5343 /* Variadic SVE types are passed by reference. Normal non-variadic
5344 arguments are too if we've run out of registers. */
5346 || pcum
->aapcs_nvrn
+ pst_info
.num_zr () > NUM_FP_ARG_REGS
5347 || pcum
->aapcs_nprn
+ pst_info
.num_pr () > NUM_PR_ARG_REGS
);
5349 case pure_scalable_type_info::DOESNT_MATTER
:
5350 gcc_assert (aarch64_pass_by_reference_1 (arg
));
5353 case pure_scalable_type_info::NO_ABI_IDENTITY
:
5354 case pure_scalable_type_info::ISNT_PST
:
5355 return aarch64_pass_by_reference_1 (arg
);
/* Return TRUE if VALTYPE is padded to its least significant bits.  */

static bool
aarch64_return_in_msb (const_tree valtype)
{
  machine_mode dummy_mode;
  int dummy_int;

  /* Never happens in little-endian mode.  */
  if (!BYTES_BIG_ENDIAN)
    return false;

  /* Only composite types smaller than or equal to 16 bytes can
     be potentially returned in registers.  */
  if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
      || int_size_in_bytes (valtype) <= 0
      || int_size_in_bytes (valtype) > 16)
    return false;

  /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
     or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
     is always passed/returned in the least significant bits of fp/simd
     register(s).  */
  if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
                                               &dummy_mode, &dummy_int, NULL))
    return false;

  /* Likewise pure scalable types for SVE vector and predicate registers.  */
  pure_scalable_type_info pst_info;
  if (pst_info.analyze_registers (valtype))
    return false;

  return true;
}
5394 /* Implement TARGET_FUNCTION_VALUE.
5395 Define how to find the value returned by a function. */
5398 aarch64_function_value (const_tree type
, const_tree func
,
5399 bool outgoing ATTRIBUTE_UNUSED
)
5404 mode
= TYPE_MODE (type
);
5405 if (INTEGRAL_TYPE_P (type
))
5406 mode
= promote_function_mode (type
, mode
, &unsignedp
, func
, 1);
5408 pure_scalable_type_info pst_info
;
5409 if (type
&& pst_info
.analyze_registers (type
))
5410 return pst_info
.get_rtx (mode
, V0_REGNUM
, P0_REGNUM
);
5412 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
5413 are returned in memory, not by value. */
5414 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
5415 bool sve_p
= (vec_flags
& VEC_ANY_SVE
);
5417 if (aarch64_return_in_msb (type
))
5419 HOST_WIDE_INT size
= int_size_in_bytes (type
);
5421 if (size
% UNITS_PER_WORD
!= 0)
5423 size
+= UNITS_PER_WORD
- size
% UNITS_PER_WORD
;
5424 mode
= int_mode_for_size (size
* BITS_PER_UNIT
, 0).require ();
5429 machine_mode ag_mode
;
5430 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
5431 &ag_mode
, &count
, NULL
))
5433 gcc_assert (!sve_p
);
5434 if (!aarch64_composite_type_p (type
, mode
))
5436 gcc_assert (count
== 1 && mode
== ag_mode
);
5437 return gen_rtx_REG (mode
, V0_REGNUM
);
5444 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (count
));
5445 for (i
= 0; i
< count
; i
++)
5447 rtx tmp
= gen_rtx_REG (ag_mode
, V0_REGNUM
+ i
);
5448 rtx offset
= gen_int_mode (i
* GET_MODE_SIZE (ag_mode
), Pmode
);
5449 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
5450 XVECEXP (par
, 0, i
) = tmp
;
5459 /* Vector types can acquire a partial SVE mode using things like
5460 __attribute__((vector_size(N))), and this is potentially useful.
5461 However, the choice of mode doesn't affect the type's ABI
5462 identity, so we should treat the types as though they had
5463 the associated integer mode, just like they did before SVE
5466 We know that the vector must be 128 bits or smaller,
5467 otherwise we'd have returned it in memory instead. */
5469 && (aarch64_some_values_include_pst_objects_p (type
)
5470 || (vec_flags
& VEC_PARTIAL
)));
5472 scalar_int_mode int_mode
= int_mode_for_mode (mode
).require ();
5473 rtx reg
= gen_rtx_REG (int_mode
, R0_REGNUM
);
5474 rtx pair
= gen_rtx_EXPR_LIST (VOIDmode
, reg
, const0_rtx
);
5475 return gen_rtx_PARALLEL (mode
, gen_rtvec (1, pair
));
5477 return gen_rtx_REG (mode
, R0_REGNUM
);
/* Implements TARGET_FUNCTION_VALUE_REGNO_P.
   Return true if REGNO is the number of a hard register in which the values
   of called function may come back.  */

static bool
aarch64_function_value_regno_p (const unsigned int regno)
{
  /* Maximum of 16 bytes can be returned in the general registers.  Examples
     of 16-byte return values are: 128-bit integers and 16-byte small
     structures (excluding homogeneous floating-point aggregates).  */
  if (regno == R0_REGNUM || regno == R1_REGNUM)
    return true;

  /* Up to four fp/simd registers can return a function value, e.g. a
     homogeneous floating-point aggregate having four members.  */
  if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
    return TARGET_FLOAT;

  return false;
}
/* Subroutine for aarch64_return_in_memory for types that are not returned
   in SVE registers.  */

static bool
aarch64_return_in_memory_1 (const_tree type)
{
  HOST_WIDE_INT size;
  machine_mode ag_mode;
  int count;

  if (!AGGREGATE_TYPE_P (type)
      && TREE_CODE (type) != COMPLEX_TYPE
      && TREE_CODE (type) != VECTOR_TYPE)
    /* Simple scalar types always returned in registers.  */
    return false;

  if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
                                               &ag_mode, &count, NULL))
    return false;

  /* Types larger than 2 registers returned in memory.  */
  size = int_size_in_bytes (type);
  return (size < 0 || size > 2 * UNITS_PER_WORD);
}
/* Implement TARGET_RETURN_IN_MEMORY.

   If the type T of the result of a function is such that

     void func (T arg)

   would require that arg be passed as a value in a register (or set of
   registers) according to the parameter passing rules, then the result
   is returned in the same registers as would be used for such an
   argument.  */

static bool
aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
{
  pure_scalable_type_info pst_info;
  switch (pst_info.analyze (type))
    {
    case pure_scalable_type_info::IS_PST:
      return (pst_info.num_zr () > NUM_FP_ARG_REGS
              || pst_info.num_pr () > NUM_PR_ARG_REGS);

    case pure_scalable_type_info::DOESNT_MATTER:
      gcc_assert (aarch64_return_in_memory_1 (type));
      return true;

    case pure_scalable_type_info::NO_ABI_IDENTITY:
    case pure_scalable_type_info::ISNT_PST:
      return aarch64_return_in_memory_1 (type);
    }
  gcc_unreachable ();
}
static bool
aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
                               const_tree type, int *nregs)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  return aarch64_vfp_is_call_or_return_candidate (mode, type,
                                                   &pcum->aapcs_vfp_rmode,
                                                   nregs, NULL);
}
5572 /* Given MODE and TYPE of a function argument, return the alignment in
5573 bits. The idea is to suppress any stronger alignment requested by
5574 the user and opt for the natural alignment (specified in AAPCS64 \S
5575 4.1). ABI_BREAK is set to true if the alignment was incorrectly
5576 calculated in versions of GCC prior to GCC-9. This is a helper
5577 function for local use only. */
5580 aarch64_function_arg_alignment (machine_mode mode
, const_tree type
,
5585 return GET_MODE_ALIGNMENT (mode
);
5587 if (integer_zerop (TYPE_SIZE (type
)))
5590 gcc_assert (TYPE_MODE (type
) == mode
);
5592 if (!AGGREGATE_TYPE_P (type
))
5593 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type
));
5595 if (TREE_CODE (type
) == ARRAY_TYPE
)
5596 return TYPE_ALIGN (TREE_TYPE (type
));
5598 unsigned int alignment
= 0;
5599 unsigned int bitfield_alignment
= 0;
5600 for (tree field
= TYPE_FIELDS (type
); field
; field
= DECL_CHAIN (field
))
5601 if (TREE_CODE (field
) == FIELD_DECL
)
5603 alignment
= std::max (alignment
, DECL_ALIGN (field
));
5604 if (DECL_BIT_FIELD_TYPE (field
))
5606 = std::max (bitfield_alignment
,
5607 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field
)));
5610 if (bitfield_alignment
> alignment
)
5613 return bitfield_alignment
;
5619 /* Layout a function argument according to the AAPCS64 rules. The rule
5620 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
5621 mode that was originally given to us by the target hook, whereas the
5622 mode in ARG might be the result of replacing partial SVE modes with
5623 the equivalent integer mode. */
5626 aarch64_layout_arg (cumulative_args_t pcum_v
, const function_arg_info
&arg
)
5628 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
5629 tree type
= arg
.type
;
5630 machine_mode mode
= arg
.mode
;
5631 int ncrn
, nvrn
, nregs
;
5632 bool allocate_ncrn
, allocate_nvrn
;
5636 /* We need to do this once per argument. */
5637 if (pcum
->aapcs_arg_processed
)
5640 pcum
->aapcs_arg_processed
= true;
5642 pure_scalable_type_info pst_info
;
5643 if (type
&& pst_info
.analyze_registers (type
))
5645 /* The PCS says that it is invalid to pass an SVE value to an
5646 unprototyped function. There is no ABI-defined location we
5647 can return in this case, so we have no real choice but to raise
5648 an error immediately, even though this is only a query function. */
5649 if (arg
.named
&& pcum
->pcs_variant
!= ARM_PCS_SVE
)
5651 gcc_assert (!pcum
->silent_p
);
5652 error ("SVE type %qT cannot be passed to an unprototyped function",
5654 /* Avoid repeating the message, and avoid tripping the assert
5656 pcum
->pcs_variant
= ARM_PCS_SVE
;
5659 /* We would have converted the argument into pass-by-reference
5660 form if it didn't fit in registers. */
5661 pcum
->aapcs_nextnvrn
= pcum
->aapcs_nvrn
+ pst_info
.num_zr ();
5662 pcum
->aapcs_nextnprn
= pcum
->aapcs_nprn
+ pst_info
.num_pr ();
5663 gcc_assert (arg
.named
5664 && pcum
->pcs_variant
== ARM_PCS_SVE
5665 && pcum
->aapcs_nextnvrn
<= NUM_FP_ARG_REGS
5666 && pcum
->aapcs_nextnprn
<= NUM_PR_ARG_REGS
);
5667 pcum
->aapcs_reg
= pst_info
.get_rtx (mode
, V0_REGNUM
+ pcum
->aapcs_nvrn
,
5668 P0_REGNUM
+ pcum
->aapcs_nprn
);
5672 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
5673 are passed by reference, not by value. */
5674 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
5675 bool sve_p
= (vec_flags
& VEC_ANY_SVE
);
5677 /* Vector types can acquire a partial SVE mode using things like
5678 __attribute__((vector_size(N))), and this is potentially useful.
5679 However, the choice of mode doesn't affect the type's ABI
5680 identity, so we should treat the types as though they had
5681 the associated integer mode, just like they did before SVE
5684 We know that the vector must be 128 bits or smaller,
5685 otherwise we'd have passed it in memory instead. */
5687 && (aarch64_some_values_include_pst_objects_p (type
)
5688 || (vec_flags
& VEC_PARTIAL
)));
5690 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
5692 size
= int_size_in_bytes (type
);
5694 /* No frontends can create types with variable-sized modes, so we
5695 shouldn't be asked to pass or return them. */
5696 size
= GET_MODE_SIZE (mode
).to_constant ();
5697 size
= ROUND_UP (size
, UNITS_PER_WORD
);
5699 allocate_ncrn
= (type
) ? !(FLOAT_TYPE_P (type
)) : !FLOAT_MODE_P (mode
);
5700 allocate_nvrn
= aarch64_vfp_is_call_candidate (pcum_v
,
5704 gcc_assert (!sve_p
|| !allocate_nvrn
);
5706 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
5707 The following code thus handles passing by SIMD/FP registers first. */
5709 nvrn
= pcum
->aapcs_nvrn
;
5711 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
5712 and homogenous short-vector aggregates (HVA). */
5715 if (!pcum
->silent_p
&& !TARGET_FLOAT
)
5716 aarch64_err_no_fpadvsimd (mode
);
5718 if (nvrn
+ nregs
<= NUM_FP_ARG_REGS
)
5720 pcum
->aapcs_nextnvrn
= nvrn
+ nregs
;
5721 if (!aarch64_composite_type_p (type
, mode
))
5723 gcc_assert (nregs
== 1);
5724 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
5730 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
5731 for (i
= 0; i
< nregs
; i
++)
5733 rtx tmp
= gen_rtx_REG (pcum
->aapcs_vfp_rmode
,
5734 V0_REGNUM
+ nvrn
+ i
);
5735 rtx offset
= gen_int_mode
5736 (i
* GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
), Pmode
);
5737 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
5738 XVECEXP (par
, 0, i
) = tmp
;
5740 pcum
->aapcs_reg
= par
;
5746 /* C.3 NSRN is set to 8. */
5747 pcum
->aapcs_nextnvrn
= NUM_FP_ARG_REGS
;
5752 ncrn
= pcum
->aapcs_ncrn
;
5753 nregs
= size
/ UNITS_PER_WORD
;
5755 /* C6 - C9. though the sign and zero extension semantics are
5756 handled elsewhere. This is the case where the argument fits
5757 entirely general registers. */
5758 if (allocate_ncrn
&& (ncrn
+ nregs
<= NUM_ARG_REGS
))
5760 gcc_assert (nregs
== 0 || nregs
== 1 || nregs
== 2);
5762 /* C.8 if the argument has an alignment of 16 then the NGRN is
5763 rounded up to the next even number. */
5766 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
5767 comparison is there because for > 16 * BITS_PER_UNIT
5768 alignment nregs should be > 2 and therefore it should be
5769 passed by reference rather than value. */
5770 && (aarch64_function_arg_alignment (mode
, type
, &abi_break
)
5771 == 16 * BITS_PER_UNIT
))
5773 if (abi_break
&& warn_psabi
&& currently_expanding_gimple_stmt
)
5774 inform (input_location
, "parameter passing for argument of type "
5775 "%qT changed in GCC 9.1", type
);
5777 gcc_assert (ncrn
+ nregs
<= NUM_ARG_REGS
);
5780 /* If an argument with an SVE mode needs to be shifted up to the
5781 high part of the register, treat it as though it had an integer mode.
5782 Using the normal (parallel [...]) would suppress the shifting. */
5785 && maybe_ne (GET_MODE_SIZE (mode
), nregs
* UNITS_PER_WORD
)
5786 && aarch64_pad_reg_upward (mode
, type
, false))
5788 mode
= int_mode_for_mode (mode
).require ();
5792 /* NREGS can be 0 when e.g. an empty structure is to be passed.
5793 A reg is still generated for it, but the caller should be smart
5794 enough not to use it. */
5796 || (nregs
== 1 && !sve_p
)
5797 || GET_MODE_CLASS (mode
) == MODE_INT
)
5798 pcum
->aapcs_reg
= gen_rtx_REG (mode
, R0_REGNUM
+ ncrn
);
5804 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
5805 for (i
= 0; i
< nregs
; i
++)
5807 scalar_int_mode reg_mode
= word_mode
;
5809 reg_mode
= int_mode_for_mode (mode
).require ();
5810 rtx tmp
= gen_rtx_REG (reg_mode
, R0_REGNUM
+ ncrn
+ i
);
5811 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
5812 GEN_INT (i
* UNITS_PER_WORD
));
5813 XVECEXP (par
, 0, i
) = tmp
;
5815 pcum
->aapcs_reg
= par
;
5818 pcum
->aapcs_nextncrn
= ncrn
+ nregs
;
5823 pcum
->aapcs_nextncrn
= NUM_ARG_REGS
;
5825 /* The argument is passed on stack; record the needed number of words for
5826 this argument and align the total size if necessary. */
5828 pcum
->aapcs_stack_words
= size
/ UNITS_PER_WORD
;
5830 if (aarch64_function_arg_alignment (mode
, type
, &abi_break
)
5831 == 16 * BITS_PER_UNIT
)
5833 int new_size
= ROUND_UP (pcum
->aapcs_stack_size
, 16 / UNITS_PER_WORD
);
5834 if (pcum
->aapcs_stack_size
!= new_size
)
5836 if (abi_break
&& warn_psabi
&& currently_expanding_gimple_stmt
)
5837 inform (input_location
, "parameter passing for argument of type "
5838 "%qT changed in GCC 9.1", type
);
5839 pcum
->aapcs_stack_size
= new_size
;
/* Implement TARGET_FUNCTION_ARG.  */

static rtx
aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
              || pcum->pcs_variant == ARM_PCS_SIMD
              || pcum->pcs_variant == ARM_PCS_SVE);

  if (arg.end_marker_p ())
    return gen_int_mode (pcum->pcs_variant, DImode);

  aarch64_layout_arg (pcum_v, arg);
  return pcum->aapcs_reg;
}
5863 aarch64_init_cumulative_args (CUMULATIVE_ARGS
*pcum
,
5865 rtx libname ATTRIBUTE_UNUSED
,
5866 const_tree fndecl ATTRIBUTE_UNUSED
,
5867 unsigned n_named ATTRIBUTE_UNUSED
,
5870 pcum
->aapcs_ncrn
= 0;
5871 pcum
->aapcs_nvrn
= 0;
5872 pcum
->aapcs_nprn
= 0;
5873 pcum
->aapcs_nextncrn
= 0;
5874 pcum
->aapcs_nextnvrn
= 0;
5875 pcum
->aapcs_nextnprn
= 0;
5877 pcum
->pcs_variant
= (arm_pcs
) fntype_abi (fntype
).id ();
5879 pcum
->pcs_variant
= ARM_PCS_AAPCS64
;
5880 pcum
->aapcs_reg
= NULL_RTX
;
5881 pcum
->aapcs_arg_processed
= false;
5882 pcum
->aapcs_stack_words
= 0;
5883 pcum
->aapcs_stack_size
= 0;
5884 pcum
->silent_p
= silent_p
;
5888 && fndecl
&& TREE_PUBLIC (fndecl
)
5889 && fntype
&& fntype
!= error_mark_node
)
5891 const_tree type
= TREE_TYPE (fntype
);
5892 machine_mode mode ATTRIBUTE_UNUSED
; /* To pass pointer as argument. */
5893 int nregs ATTRIBUTE_UNUSED
; /* Likewise. */
5894 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
), type
,
5895 &mode
, &nregs
, NULL
))
5896 aarch64_err_no_fpadvsimd (TYPE_MODE (type
));
5901 && pcum
->pcs_variant
== ARM_PCS_SVE
)
5903 /* We can't gracefully recover at this point, so make this a
5906 fatal_error (input_location
, "%qE requires the SVE ISA extension",
5909 fatal_error (input_location
, "calls to functions of type %qT require"
5910 " the SVE ISA extension", fntype
);
/* Implement TARGET_FUNCTION_ARG_ADVANCE.  */

static void
aarch64_function_arg_advance (cumulative_args_t pcum_v,
                              const function_arg_info &arg)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  if (pcum->pcs_variant == ARM_PCS_AAPCS64
      || pcum->pcs_variant == ARM_PCS_SIMD
      || pcum->pcs_variant == ARM_PCS_SVE)
    {
      aarch64_layout_arg (pcum_v, arg);
      gcc_assert ((pcum->aapcs_reg != NULL_RTX)
                  != (pcum->aapcs_stack_words != 0));
      pcum->aapcs_arg_processed = false;
      pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
      pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
      pcum->aapcs_nprn = pcum->aapcs_nextnprn;
      pcum->aapcs_stack_size += pcum->aapcs_stack_words;
      pcum->aapcs_stack_words = 0;
      pcum->aapcs_reg = NULL_RTX;
    }
}
5937 aarch64_function_arg_regno_p (unsigned regno
)
5939 return ((GP_REGNUM_P (regno
) && regno
< R0_REGNUM
+ NUM_ARG_REGS
)
5940 || (FP_REGNUM_P (regno
) && regno
< V0_REGNUM
+ NUM_FP_ARG_REGS
));
5943 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
5944 PARM_BOUNDARY bits of alignment, but will be given anything up
5945 to STACK_BOUNDARY bits if the type requires it. This makes sure
5946 that both before and after the layout of each argument, the Next
Stacked Argument Address (NSAA) will have a minimum alignment of 8 bytes.  */
5951 aarch64_function_arg_boundary (machine_mode mode
, const_tree type
)
5954 unsigned int alignment
= aarch64_function_arg_alignment (mode
, type
,
if (abi_break && warn_psabi)
5957 inform (input_location
, "parameter passing for argument of type "
5958 "%qT changed in GCC 9.1", type
);
5960 return MIN (MAX (alignment
, PARM_BOUNDARY
), STACK_BOUNDARY
);
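/* Illustrative sketch (hypothetical helper, not GCC code): the MIN/MAX clamp
   above gives every argument at least PARM_BOUNDARY (64 bits on AArch64) of
   alignment and caps over-aligned types at STACK_BOUNDARY (128 bits).  */
static inline unsigned int
example_clamp_arg_boundary (unsigned int alignment_bits)
{
  const unsigned int lo = 64;   /* PARM_BOUNDARY, in bits.  */
  const unsigned int hi = 128;  /* STACK_BOUNDARY, in bits.  */
  if (alignment_bits < lo)
    return lo;
  return alignment_bits > hi ? hi : alignment_bits;
}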
5963 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
5965 static fixed_size_mode
5966 aarch64_get_reg_raw_mode (int regno
)
5968 if (TARGET_SVE
&& FP_REGNUM_P (regno
))
5969 /* Don't use the SVE part of the register for __builtin_apply and
5970 __builtin_return. The SVE registers aren't used by the normal PCS,
5971 so using them there would be a waste of time. The PCS extensions
5972 for SVE types are fundamentally incompatible with the
5973 __builtin_return/__builtin_apply interface. */
5974 return as_a
<fixed_size_mode
> (V16QImode
);
5975 return default_get_reg_raw_mode (regno
);
5978 /* Implement TARGET_FUNCTION_ARG_PADDING.
5980 Small aggregate types are placed in the lowest memory address.
5982 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
5984 static pad_direction
5985 aarch64_function_arg_padding (machine_mode mode
, const_tree type
)
5987 /* On little-endian targets, the least significant byte of every stack
5988 argument is passed at the lowest byte address of the stack slot. */
5989 if (!BYTES_BIG_ENDIAN
)
5992 /* Otherwise, integral, floating-point and pointer types are padded downward:
5993 the least significant byte of a stack argument is passed at the highest
5994 byte address of the stack slot. */
5996 ? (INTEGRAL_TYPE_P (type
) || SCALAR_FLOAT_TYPE_P (type
)
5997 || POINTER_TYPE_P (type
))
5998 : (SCALAR_INT_MODE_P (mode
) || SCALAR_FLOAT_MODE_P (mode
)))
5999 return PAD_DOWNWARD
;
6001 /* Everything else padded upward, i.e. data in first byte of stack slot. */
6005 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
6007 It specifies padding for the last (may also be the only)
6008 element of a block move between registers and memory. If
6009 assuming the block is in the memory, padding upward means that
the last element is padded after its most significant byte, while in
downward padding, the last element is padded at its least significant
byte side.
6014 Small aggregates and small complex types are always padded
6017 We don't need to worry about homogeneous floating-point or
6018 short-vector aggregates; their move is not affected by the
6019 padding direction determined here. Regardless of endianness,
6020 each element of such an aggregate is put in the least
6021 significant bits of a fp/simd register.
6023 Return !BYTES_BIG_ENDIAN if the least significant byte of the
6024 register has useful data, and return the opposite if the most
6025 significant byte does. */
6028 aarch64_pad_reg_upward (machine_mode mode
, const_tree type
,
6029 bool first ATTRIBUTE_UNUSED
)
6032 /* Aside from pure scalable types, small composite types are always
6034 if (BYTES_BIG_ENDIAN
&& aarch64_composite_type_p (type
, mode
))
6038 size
= int_size_in_bytes (type
);
6040 /* No frontends can create types with variable-sized modes, so we
6041 shouldn't be asked to pass or return them. */
6042 size
= GET_MODE_SIZE (mode
).to_constant ();
6043 if (size
< 2 * UNITS_PER_WORD
)
6045 pure_scalable_type_info pst_info
;
6046 if (pst_info
.analyze_registers (type
))
6052 /* Otherwise, use the default padding. */
6053 return !BYTES_BIG_ENDIAN
;
6056 static scalar_int_mode
6057 aarch64_libgcc_cmp_return_mode (void)
6062 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
6064 /* We use the 12-bit shifted immediate arithmetic instructions so values
6065 must be multiple of (1 << 12), i.e. 4096. */
6066 #define ARITH_FACTOR 4096
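/* Illustrative sketch (hypothetical helper, not GCC code): a value is usable
   as a 12-bit shifted arithmetic immediate if it fits in bits [0,11], or in
   bits [12,23] with the low 12 bits clear.  This mirrors the form that
   aarch64_uimm12_shift is used to check for below.  */
static inline int
example_uimm12_shift_p (unsigned long long val)
{
  return (val & ~0xfffULL) == 0
	 || (val & ~(0xfffULL << 12)) == 0;
}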
6068 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
6069 #error Cannot use simple address calculation for stack probing
6072 /* The pair of scratch registers used for stack probing. */
6073 #define PROBE_STACK_FIRST_REG R9_REGNUM
6074 #define PROBE_STACK_SECOND_REG R10_REGNUM
6076 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
6077 inclusive. These are offsets from the current stack pointer. */
6080 aarch64_emit_probe_stack_range (HOST_WIDE_INT first
, poly_int64 poly_size
)
6083 if (!poly_size
.is_constant (&size
))
6085 sorry ("stack probes for SVE frames");
6089 rtx reg1
= gen_rtx_REG (Pmode
, PROBE_STACK_FIRST_REG
);
6091 /* See the same assertion on PROBE_INTERVAL above. */
6092 gcc_assert ((first
% ARITH_FACTOR
) == 0);
6094 /* See if we have a constant small number of probes to generate. If so,
6095 that's the easy case. */
6096 if (size
<= PROBE_INTERVAL
)
6098 const HOST_WIDE_INT base
= ROUND_UP (size
, ARITH_FACTOR
);
6100 emit_set_insn (reg1
,
6101 plus_constant (Pmode
,
6102 stack_pointer_rtx
, -(first
+ base
)));
6103 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- size
));
6106 /* The run-time loop is made up of 8 insns in the generic case while the
6107 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
6108 else if (size
<= 4 * PROBE_INTERVAL
)
6110 HOST_WIDE_INT i
, rem
;
6112 emit_set_insn (reg1
,
6113 plus_constant (Pmode
,
6115 -(first
+ PROBE_INTERVAL
)));
6116 emit_stack_probe (reg1
);
6118 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
6119 it exceeds SIZE. If only two probes are needed, this will not
6120 generate any code. Then probe at FIRST + SIZE. */
6121 for (i
= 2 * PROBE_INTERVAL
; i
< size
; i
+= PROBE_INTERVAL
)
6123 emit_set_insn (reg1
,
6124 plus_constant (Pmode
, reg1
, -PROBE_INTERVAL
));
6125 emit_stack_probe (reg1
);
6128 rem
= size
- (i
- PROBE_INTERVAL
);
6131 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
6133 emit_set_insn (reg1
, plus_constant (Pmode
, reg1
, -base
));
6134 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- rem
));
6137 emit_stack_probe (plus_constant (Pmode
, reg1
, -rem
));
6140 /* Otherwise, do the same as above, but in a loop. Note that we must be
6141 extra careful with variables wrapping around because we might be at
6142 the very top (or the very bottom) of the address space and we have
6143 to be able to handle this case properly; in particular, we use an
6144 equality test for the loop condition. */
6147 rtx reg2
= gen_rtx_REG (Pmode
, PROBE_STACK_SECOND_REG
);
6149 /* Step 1: round SIZE to the previous multiple of the interval. */
6151 HOST_WIDE_INT rounded_size
= size
& -PROBE_INTERVAL
;
6154 /* Step 2: compute initial and final value of the loop counter. */
6156 /* TEST_ADDR = SP + FIRST. */
6157 emit_set_insn (reg1
,
6158 plus_constant (Pmode
, stack_pointer_rtx
, -first
));
6160 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
6161 HOST_WIDE_INT adjustment
= - (first
+ rounded_size
);
6162 if (! aarch64_uimm12_shift (adjustment
))
6164 aarch64_internal_mov_immediate (reg2
, GEN_INT (adjustment
),
6166 emit_set_insn (reg2
, gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, reg2
));
6169 emit_set_insn (reg2
,
6170 plus_constant (Pmode
, stack_pointer_rtx
, adjustment
));
/* Step 3: the loop

     do
       {
	 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
	 probe at TEST_ADDR
       }
     while (TEST_ADDR != LAST_ADDR)

   probes at FIRST + N * PROBE_INTERVAL for values of N from 1
   until it is equal to ROUNDED_SIZE.  */
6184 emit_insn (gen_probe_stack_range (reg1
, reg1
, reg2
));
6187 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
6188 that SIZE is equal to ROUNDED_SIZE. */
6190 if (size
!= rounded_size
)
6192 HOST_WIDE_INT rem
= size
- rounded_size
;
6196 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
6198 emit_set_insn (reg2
, plus_constant (Pmode
, reg2
, -base
));
6199 emit_stack_probe (plus_constant (Pmode
, reg2
, base
- rem
));
6202 emit_stack_probe (plus_constant (Pmode
, reg2
, -rem
));
6206 /* Make sure nothing is scheduled before we are done. */
6207 emit_insn (gen_blockage ());
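/* Illustrative sketch (hypothetical, not GCC code): for a compile-time
   constant SIZE the code above probes at FIRST + N * PROBE_INTERVAL for
   N = 1, 2, ... and finishes with a probe at FIRST + SIZE, i.e. roughly
   ceil (SIZE / PROBE_INTERVAL) probes in total.  */
static inline long
example_probe_count (long size, long interval)
{
  return (size + interval - 1) / interval;
}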
6210 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
6211 absolute addresses. */
6214 aarch64_output_probe_stack_range (rtx reg1
, rtx reg2
)
6216 static int labelno
= 0;
6220 ASM_GENERATE_INTERNAL_LABEL (loop_lab
, "LPSRL", labelno
++);
6223 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_lab
);
6225 HOST_WIDE_INT stack_clash_probe_interval
6226 = 1 << param_stack_clash_protection_guard_size
;
6228 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
6230 HOST_WIDE_INT interval
;
6231 if (flag_stack_clash_protection
)
6232 interval
= stack_clash_probe_interval
;
6234 interval
= PROBE_INTERVAL
;
6236 gcc_assert (aarch64_uimm12_shift (interval
));
6237 xops
[1] = GEN_INT (interval
);
6239 output_asm_insn ("sub\t%0, %0, %1", xops
);
6241 /* If doing stack clash protection then we probe up by the ABI specified
6242 amount. We do this because we're dropping full pages at a time in the
6243 loop. But if we're doing non-stack clash probing, probe at SP 0. */
6244 if (flag_stack_clash_protection
)
6245 xops
[1] = GEN_INT (STACK_CLASH_CALLER_GUARD
);
6247 xops
[1] = CONST0_RTX (GET_MODE (xops
[1]));
6249 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
6250 by this amount for each iteration. */
6251 output_asm_insn ("str\txzr, [%0, %1]", xops
);
6253 /* Test if TEST_ADDR == LAST_ADDR. */
6255 output_asm_insn ("cmp\t%0, %1", xops
);
6258 fputs ("\tb.ne\t", asm_out_file
);
6259 assemble_name_raw (asm_out_file
, loop_lab
);
6260 fputc ('\n', asm_out_file
);
6265 /* Emit the probe loop for doing stack clash probes and stack adjustments for
6266 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
6267 of GUARD_SIZE. When a probe is emitted it is done at most
6268 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
6269 at most MIN_PROBE_THRESHOLD. By the end of this function
6270 BASE = BASE - ADJUSTMENT. */
6273 aarch64_output_probe_sve_stack_clash (rtx base
, rtx adjustment
,
6274 rtx min_probe_threshold
, rtx guard_size
)
6276 /* This function is not allowed to use any instruction generation function
6277 like gen_ and friends. If you do you'll likely ICE during CFG validation,
6278 so instead emit the code you want using output_asm_insn. */
6279 gcc_assert (flag_stack_clash_protection
);
6280 gcc_assert (CONST_INT_P (min_probe_threshold
) && CONST_INT_P (guard_size
));
6281 gcc_assert (INTVAL (guard_size
) > INTVAL (min_probe_threshold
));
6283 /* The minimum required allocation before the residual requires probing. */
6284 HOST_WIDE_INT residual_probe_guard
= INTVAL (min_probe_threshold
);
6286 /* Clamp the value down to the nearest value that can be used with a cmp. */
6287 residual_probe_guard
= aarch64_clamp_to_uimm12_shift (residual_probe_guard
);
6288 rtx probe_offset_value_rtx
= gen_int_mode (residual_probe_guard
, Pmode
);
6290 gcc_assert (INTVAL (min_probe_threshold
) >= residual_probe_guard
);
6291 gcc_assert (aarch64_uimm12_shift (residual_probe_guard
));
6293 static int labelno
= 0;
6294 char loop_start_lab
[32];
6295 char loop_end_lab
[32];
6298 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab
, "SVLPSPL", labelno
);
6299 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab
, "SVLPEND", labelno
++);
6301 /* Emit loop start label. */
6302 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_start_lab
);
6304 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
6305 xops
[0] = adjustment
;
6306 xops
[1] = probe_offset_value_rtx
;
6307 output_asm_insn ("cmp\t%0, %1", xops
);
6309 /* Branch to end if not enough adjustment to probe. */
6310 fputs ("\tb.lt\t", asm_out_file
);
6311 assemble_name_raw (asm_out_file
, loop_end_lab
);
6312 fputc ('\n', asm_out_file
);
6314 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
6316 xops
[1] = probe_offset_value_rtx
;
6317 output_asm_insn ("sub\t%0, %0, %1", xops
);
6319 /* Probe at BASE. */
6320 xops
[1] = const0_rtx
;
6321 output_asm_insn ("str\txzr, [%0, %1]", xops
);
6323 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
6324 xops
[0] = adjustment
;
6325 xops
[1] = probe_offset_value_rtx
;
6326 output_asm_insn ("sub\t%0, %0, %1", xops
);
6328 /* Branch to start if still more bytes to allocate. */
6329 fputs ("\tb\t", asm_out_file
);
6330 assemble_name_raw (asm_out_file
, loop_start_lab
);
6331 fputc ('\n', asm_out_file
);
6333 /* No probe leave. */
6334 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_end_lab
);
6336 /* BASE = BASE - ADJUSTMENT. */
6338 xops
[1] = adjustment
;
6339 output_asm_insn ("sub\t%0, %0, %1", xops
);
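/* For reference, the sequence emitted above has this overall shape
   (illustrative only; register and label names are placeholders):

	.SVLPSPL0:
	  cmp	adjustment, residual_probe_guard
	  b.lt	.SVLPEND0
	  sub	base, base, residual_probe_guard
	  str	xzr, [base, 0]
	  sub	adjustment, adjustment, residual_probe_guard
	  b	.SVLPSPL0
	.SVLPEND0:
	  sub	base, base, adjustment  */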
6343 /* Determine whether a frame chain needs to be generated. */
6345 aarch64_needs_frame_chain (void)
6347 /* Force a frame chain for EH returns so the return address is at FP+8. */
6348 if (frame_pointer_needed
|| crtl
->calls_eh_return
)
6351 /* A leaf function cannot have calls or write LR. */
6352 bool is_leaf
= crtl
->is_leaf
&& !df_regs_ever_live_p (LR_REGNUM
);
6354 /* Don't use a frame chain in leaf functions if leaf frame pointers
6356 if (flag_omit_leaf_frame_pointer
&& is_leaf
)
6359 return aarch64_use_frame_pointer
;
6362 /* Mark the registers that need to be saved by the callee and calculate
6363 the size of the callee-saved registers area and frame record (both FP
6364 and LR may be omitted). */
6366 aarch64_layout_frame (void)
6368 poly_int64 offset
= 0;
6369 int regno
, last_fp_reg
= INVALID_REGNUM
;
6370 machine_mode vector_save_mode
= aarch64_reg_save_mode (V8_REGNUM
);
6371 poly_int64 vector_save_size
= GET_MODE_SIZE (vector_save_mode
);
6372 bool frame_related_fp_reg_p
= false;
6373 aarch64_frame
&frame
= cfun
->machine
->frame
;
6375 frame
.emit_frame_chain
= aarch64_needs_frame_chain ();
6377 /* Adjust the outgoing arguments size if required. Keep it in sync with what
6378 the mid-end is doing. */
6379 crtl
->outgoing_args_size
= STACK_DYNAMIC_OFFSET (cfun
);
6381 #define SLOT_NOT_REQUIRED (-2)
6382 #define SLOT_REQUIRED (-1)
6384 frame
.wb_candidate1
= INVALID_REGNUM
;
6385 frame
.wb_candidate2
= INVALID_REGNUM
;
6386 frame
.spare_pred_reg
= INVALID_REGNUM
;
6388 /* First mark all the registers that really need to be saved... */
6389 for (regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
6390 frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
6392 /* ... that includes the eh data registers (if needed)... */
6393 if (crtl
->calls_eh_return
)
6394 for (regno
= 0; EH_RETURN_DATA_REGNO (regno
) != INVALID_REGNUM
; regno
++)
6395 frame
.reg_offset
[EH_RETURN_DATA_REGNO (regno
)] = SLOT_REQUIRED
;
6397 /* ... and any callee saved register that dataflow says is live. */
6398 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
6399 if (df_regs_ever_live_p (regno
)
6400 && !fixed_regs
[regno
]
6401 && (regno
== R30_REGNUM
6402 || !crtl
->abi
->clobbers_full_reg_p (regno
)))
6403 frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
6405 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
6406 if (df_regs_ever_live_p (regno
)
6407 && !fixed_regs
[regno
]
6408 && !crtl
->abi
->clobbers_full_reg_p (regno
))
6410 frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
6411 last_fp_reg
= regno
;
6412 if (aarch64_emit_cfi_for_reg_p (regno
))
6413 frame_related_fp_reg_p
= true;
6416 /* Big-endian SVE frames need a spare predicate register in order
6417 to save Z8-Z15. Decide which register they should use. Prefer
6418 an unused argument register if possible, so that we don't force P4
6419 to be saved unnecessarily. */
6420 if (frame_related_fp_reg_p
6421 && crtl
->abi
->id () == ARM_PCS_SVE
6422 && BYTES_BIG_ENDIAN
)
6424 bitmap live1
= df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun
));
6425 bitmap live2
= df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun
));
6426 for (regno
= P0_REGNUM
; regno
<= P7_REGNUM
; regno
++)
6427 if (!bitmap_bit_p (live1
, regno
) && !bitmap_bit_p (live2
, regno
))
6429 gcc_assert (regno
<= P7_REGNUM
);
6430 frame
.spare_pred_reg
= regno
;
6431 df_set_regs_ever_live (regno
, true);
6434 for (regno
= P0_REGNUM
; regno
<= P15_REGNUM
; regno
++)
6435 if (df_regs_ever_live_p (regno
)
6436 && !fixed_regs
[regno
]
6437 && !crtl
->abi
->clobbers_full_reg_p (regno
))
6438 frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
6440 /* With stack-clash, LR must be saved in non-leaf functions. */
6441 gcc_assert (crtl
->is_leaf
6442 || maybe_ne (frame
.reg_offset
[R30_REGNUM
], SLOT_NOT_REQUIRED
));
6444 /* Now assign stack slots for the registers. Start with the predicate
6445 registers, since predicate LDR and STR have a relatively small
6446 offset range. These saves happen below the hard frame pointer. */
6447 for (regno
= P0_REGNUM
; regno
<= P15_REGNUM
; regno
++)
6448 if (known_eq (frame
.reg_offset
[regno
], SLOT_REQUIRED
))
6450 frame
.reg_offset
[regno
] = offset
;
6451 offset
+= BYTES_PER_SVE_PRED
;
6454 if (maybe_ne (offset
, 0))
6456 /* If we have any vector registers to save above the predicate registers,
6457 the offset of the vector register save slots need to be a multiple
6458 of the vector size. This lets us use the immediate forms of LDR/STR
6459 (or LD1/ST1 for big-endian).
6461 A vector register is 8 times the size of a predicate register,
6462 and we need to save a maximum of 12 predicate registers, so the
6463 first vector register will be at either #1, MUL VL or #2, MUL VL.
6465 If we don't have any vector registers to save, and we know how
6466 big the predicate save area is, we can just round it up to the
6467 next 16-byte boundary. */
6468 if (last_fp_reg
== (int) INVALID_REGNUM
&& offset
.is_constant ())
6469 offset
= aligned_upper_bound (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
6472 if (known_le (offset
, vector_save_size
))
6473 offset
= vector_save_size
;
6474 else if (known_le (offset
, vector_save_size
* 2))
6475 offset
= vector_save_size
* 2;
6481 /* If we need to save any SVE vector registers, add them next. */
6482 if (last_fp_reg
!= (int) INVALID_REGNUM
&& crtl
->abi
->id () == ARM_PCS_SVE
)
6483 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
6484 if (known_eq (frame
.reg_offset
[regno
], SLOT_REQUIRED
))
6486 frame
.reg_offset
[regno
] = offset
;
6487 offset
+= vector_save_size
;
6490 /* OFFSET is now the offset of the hard frame pointer from the bottom
6491 of the callee save area. */
6492 bool saves_below_hard_fp_p
= maybe_ne (offset
, 0);
6493 frame
.below_hard_fp_saved_regs_size
= offset
;
6494 if (frame
.emit_frame_chain
)
6496 /* FP and LR are placed in the linkage record. */
6497 frame
.reg_offset
[R29_REGNUM
] = offset
;
6498 frame
.wb_candidate1
= R29_REGNUM
;
6499 frame
.reg_offset
[R30_REGNUM
] = offset
+ UNITS_PER_WORD
;
6500 frame
.wb_candidate2
= R30_REGNUM
;
6501 offset
+= 2 * UNITS_PER_WORD
;
6504 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
6505 if (known_eq (frame
.reg_offset
[regno
], SLOT_REQUIRED
))
6507 frame
.reg_offset
[regno
] = offset
;
6508 if (frame
.wb_candidate1
== INVALID_REGNUM
)
6509 frame
.wb_candidate1
= regno
;
6510 else if (frame
.wb_candidate2
== INVALID_REGNUM
)
6511 frame
.wb_candidate2
= regno
;
6512 offset
+= UNITS_PER_WORD
;
6515 poly_int64 max_int_offset
= offset
;
6516 offset
= aligned_upper_bound (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
6517 bool has_align_gap
= maybe_ne (offset
, max_int_offset
);
6519 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
6520 if (known_eq (frame
.reg_offset
[regno
], SLOT_REQUIRED
))
6522 /* If there is an alignment gap between integer and fp callee-saves,
6523 allocate the last fp register to it if possible. */
6524 if (regno
== last_fp_reg
6526 && known_eq (vector_save_size
, 8)
6527 && multiple_p (offset
, 16))
6529 frame
.reg_offset
[regno
] = max_int_offset
;
6533 frame
.reg_offset
[regno
] = offset
;
6534 if (frame
.wb_candidate1
== INVALID_REGNUM
)
6535 frame
.wb_candidate1
= regno
;
6536 else if (frame
.wb_candidate2
== INVALID_REGNUM
6537 && frame
.wb_candidate1
>= V0_REGNUM
)
6538 frame
.wb_candidate2
= regno
;
6539 offset
+= vector_save_size
;
6542 offset
= aligned_upper_bound (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
6544 frame
.saved_regs_size
= offset
;
6546 poly_int64 varargs_and_saved_regs_size
= offset
+ frame
.saved_varargs_size
;
6548 poly_int64 above_outgoing_args
6549 = aligned_upper_bound (varargs_and_saved_regs_size
6550 + get_frame_size (),
6551 STACK_BOUNDARY
/ BITS_PER_UNIT
);
6553 frame
.hard_fp_offset
6554 = above_outgoing_args
- frame
.below_hard_fp_saved_regs_size
;
6556 /* Both these values are already aligned. */
6557 gcc_assert (multiple_p (crtl
->outgoing_args_size
,
6558 STACK_BOUNDARY
/ BITS_PER_UNIT
));
6559 frame
.frame_size
= above_outgoing_args
+ crtl
->outgoing_args_size
;
6561 frame
.locals_offset
= frame
.saved_varargs_size
;
6563 frame
.initial_adjust
= 0;
6564 frame
.final_adjust
= 0;
6565 frame
.callee_adjust
= 0;
6566 frame
.sve_callee_adjust
= 0;
6567 frame
.callee_offset
= 0;
6569 HOST_WIDE_INT max_push_offset
= 0;
6570 if (frame
.wb_candidate2
!= INVALID_REGNUM
)
6571 max_push_offset
= 512;
6572 else if (frame
.wb_candidate1
!= INVALID_REGNUM
)
6573 max_push_offset
= 256;
6575 HOST_WIDE_INT const_size
, const_outgoing_args_size
, const_fp_offset
;
6576 HOST_WIDE_INT const_saved_regs_size
;
6577 if (frame
.frame_size
.is_constant (&const_size
)
6578 && const_size
< max_push_offset
6579 && known_eq (frame
.hard_fp_offset
, const_size
))
6581 /* Simple, small frame with no outgoing arguments:
6583 stp reg1, reg2, [sp, -frame_size]!
6584 stp reg3, reg4, [sp, 16] */
6585 frame
.callee_adjust
= const_size
;
6587 else if (crtl
->outgoing_args_size
.is_constant (&const_outgoing_args_size
)
6588 && frame
.saved_regs_size
.is_constant (&const_saved_regs_size
)
6589 && const_outgoing_args_size
+ const_saved_regs_size
< 512
6590 /* We could handle this case even with outgoing args, provided
6591 that the number of args left us with valid offsets for all
6592 predicate and vector save slots. It's such a rare case that
6593 it hardly seems worth the effort though. */
6594 && (!saves_below_hard_fp_p
|| const_outgoing_args_size
== 0)
6595 && !(cfun
->calls_alloca
6596 && frame
.hard_fp_offset
.is_constant (&const_fp_offset
)
6597 && const_fp_offset
< max_push_offset
))
6599 /* Frame with small outgoing arguments:
6601 sub sp, sp, frame_size
6602 stp reg1, reg2, [sp, outgoing_args_size]
6603 stp reg3, reg4, [sp, outgoing_args_size + 16] */
6604 frame
.initial_adjust
= frame
.frame_size
;
6605 frame
.callee_offset
= const_outgoing_args_size
;
6607 else if (saves_below_hard_fp_p
6608 && known_eq (frame
.saved_regs_size
,
6609 frame
.below_hard_fp_saved_regs_size
))
6611 /* Frame in which all saves are SVE saves:
6613 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
6614 save SVE registers relative to SP
6615 sub sp, sp, outgoing_args_size */
6616 frame
.initial_adjust
= (frame
.hard_fp_offset
6617 + frame
.below_hard_fp_saved_regs_size
);
6618 frame
.final_adjust
= crtl
->outgoing_args_size
;
6620 else if (frame
.hard_fp_offset
.is_constant (&const_fp_offset
)
6621 && const_fp_offset
< max_push_offset
)
6623 /* Frame with large outgoing arguments or SVE saves, but with
6626 stp reg1, reg2, [sp, -hard_fp_offset]!
6627 stp reg3, reg4, [sp, 16]
6628 [sub sp, sp, below_hard_fp_saved_regs_size]
6629 [save SVE registers relative to SP]
6630 sub sp, sp, outgoing_args_size */
6631 frame
.callee_adjust
= const_fp_offset
;
6632 frame
.sve_callee_adjust
= frame
.below_hard_fp_saved_regs_size
;
6633 frame
.final_adjust
= crtl
->outgoing_args_size
;
6637 /* Frame with large local area and outgoing arguments or SVE saves,
6638 using frame pointer:
6640 sub sp, sp, hard_fp_offset
6641 stp x29, x30, [sp, 0]
6643 stp reg3, reg4, [sp, 16]
6644 [sub sp, sp, below_hard_fp_saved_regs_size]
6645 [save SVE registers relative to SP]
6646 sub sp, sp, outgoing_args_size */
6647 frame
.initial_adjust
= frame
.hard_fp_offset
;
6648 frame
.sve_callee_adjust
= frame
.below_hard_fp_saved_regs_size
;
6649 frame
.final_adjust
= crtl
->outgoing_args_size
;
6652 /* Make sure the individual adjustments add up to the full frame size. */
6653 gcc_assert (known_eq (frame
.initial_adjust
6654 + frame
.callee_adjust
6655 + frame
.sve_callee_adjust
6656 + frame
.final_adjust
, frame
.frame_size
));
6658 frame
.laid_out
= true;
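/* Illustrative sketch (hypothetical helper, not GCC code): the invariant
   asserted above is simply that the four adjustments chosen by the layout
   code add up to the full frame size.  */
static inline int
example_frame_split_ok (long initial_adjust, long callee_adjust,
			long sve_callee_adjust, long final_adjust,
			long frame_size)
{
  return initial_adjust + callee_adjust + sve_callee_adjust + final_adjust
	 == frame_size;
}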
6661 /* Return true if the register REGNO is saved on entry to
6662 the current function. */
6665 aarch64_register_saved_on_entry (int regno
)
6667 return known_ge (cfun
->machine
->frame
.reg_offset
[regno
], 0);
6670 /* Return the next register up from REGNO up to LIMIT for the callee
6674 aarch64_next_callee_save (unsigned regno
, unsigned limit
)
6676 while (regno
<= limit
&& !aarch64_register_saved_on_entry (regno
))
6681 /* Push the register number REGNO of mode MODE to the stack with write-back
6682 adjusting the stack by ADJUSTMENT. */
6685 aarch64_pushwb_single_reg (machine_mode mode
, unsigned regno
,
6686 HOST_WIDE_INT adjustment
)
6688 rtx base_rtx
= stack_pointer_rtx
;
6691 reg
= gen_rtx_REG (mode
, regno
);
6692 mem
= gen_rtx_PRE_MODIFY (Pmode
, base_rtx
,
6693 plus_constant (Pmode
, base_rtx
, -adjustment
));
6694 mem
= gen_frame_mem (mode
, mem
);
6696 insn
= emit_move_insn (mem
, reg
);
6697 RTX_FRAME_RELATED_P (insn
) = 1;
6700 /* Generate and return an instruction to store the pair of registers
6701 REG and REG2 of mode MODE to location BASE with write-back adjusting
6702 the stack location BASE by ADJUSTMENT. */
6705 aarch64_gen_storewb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
6706 HOST_WIDE_INT adjustment
)
6711 return gen_storewb_pairdi_di (base
, base
, reg
, reg2
,
6712 GEN_INT (-adjustment
),
6713 GEN_INT (UNITS_PER_WORD
- adjustment
));
6715 return gen_storewb_pairdf_di (base
, base
, reg
, reg2
,
6716 GEN_INT (-adjustment
),
6717 GEN_INT (UNITS_PER_WORD
- adjustment
));
6719 return gen_storewb_pairtf_di (base
, base
, reg
, reg2
,
6720 GEN_INT (-adjustment
),
6721 GEN_INT (UNITS_PER_VREG
- adjustment
));
6727 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
6728 stack pointer by ADJUSTMENT. */
6731 aarch64_push_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
)
6734 machine_mode mode
= aarch64_reg_save_mode (regno1
);
6736 if (regno2
== INVALID_REGNUM
)
6737 return aarch64_pushwb_single_reg (mode
, regno1
, adjustment
);
6739 rtx reg1
= gen_rtx_REG (mode
, regno1
);
6740 rtx reg2
= gen_rtx_REG (mode
, regno2
);
6742 insn
= emit_insn (aarch64_gen_storewb_pair (mode
, stack_pointer_rtx
, reg1
,
6744 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 2)) = 1;
6745 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
6746 RTX_FRAME_RELATED_P (insn
) = 1;
6749 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
6750 adjusting it by ADJUSTMENT afterwards. */
6753 aarch64_gen_loadwb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
6754 HOST_WIDE_INT adjustment
)
6759 return gen_loadwb_pairdi_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
6760 GEN_INT (UNITS_PER_WORD
));
6762 return gen_loadwb_pairdf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
6763 GEN_INT (UNITS_PER_WORD
));
6765 return gen_loadwb_pairtf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
6766 GEN_INT (UNITS_PER_VREG
));
6772 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
6773 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
6777 aarch64_pop_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
,
6780 machine_mode mode
= aarch64_reg_save_mode (regno1
);
6781 rtx reg1
= gen_rtx_REG (mode
, regno1
);
6783 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg1
, *cfi_ops
);
6785 if (regno2
== INVALID_REGNUM
)
6787 rtx mem
= plus_constant (Pmode
, stack_pointer_rtx
, adjustment
);
6788 mem
= gen_rtx_POST_MODIFY (Pmode
, stack_pointer_rtx
, mem
);
6789 emit_move_insn (reg1
, gen_frame_mem (mode
, mem
));
6793 rtx reg2
= gen_rtx_REG (mode
, regno2
);
6794 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
6795 emit_insn (aarch64_gen_loadwb_pair (mode
, stack_pointer_rtx
, reg1
,
6800 /* Generate and return a store pair instruction of mode MODE to store
6801 register REG1 to MEM1 and register REG2 to MEM2. */
6804 aarch64_gen_store_pair (machine_mode mode
, rtx mem1
, rtx reg1
, rtx mem2
,
6810 return gen_store_pair_dw_didi (mem1
, reg1
, mem2
, reg2
);
6813 return gen_store_pair_dw_dfdf (mem1
, reg1
, mem2
, reg2
);
6816 return gen_store_pair_dw_tftf (mem1
, reg1
, mem2
, reg2
);
/* Generate and return a load pair instruction of mode MODE to load register
   REG1 from MEM1 and register REG2 from MEM2.  */
6827 aarch64_gen_load_pair (machine_mode mode
, rtx reg1
, rtx mem1
, rtx reg2
,
6833 return gen_load_pair_dw_didi (reg1
, mem1
, reg2
, mem2
);
6836 return gen_load_pair_dw_dfdf (reg1
, mem1
, reg2
, mem2
);
6839 return gen_load_pair_dw_tftf (reg1
, mem1
, reg2
, mem2
);
6846 /* Return TRUE if return address signing should be enabled for the current
6847 function, otherwise return FALSE. */
6850 aarch64_return_address_signing_enabled (void)
/* This function should only be called after the frame has been laid out.  */
6853 gcc_assert (cfun
->machine
->frame
.laid_out
);
6855 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
6856 if its LR is pushed onto stack. */
6857 return (aarch64_ra_sign_scope
== AARCH64_FUNCTION_ALL
6858 || (aarch64_ra_sign_scope
== AARCH64_FUNCTION_NON_LEAF
6859 && known_ge (cfun
->machine
->frame
.reg_offset
[LR_REGNUM
], 0)));
6862 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
6864 aarch64_bti_enabled (void)
6866 return (aarch64_enable_bti
== 1);
6869 /* The caller is going to use ST1D or LD1D to save or restore an SVE
6870 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
6871 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
6873 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
6876 (2) setting PRED to a valid predicate register for the ST1D or LD1D,
6877 if the variable isn't already nonnull
6879 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
6880 Handle this case using a temporary base register that is suitable for
6881 all offsets in that range. Use ANCHOR_REG as this base register if it
6882 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
6885 aarch64_adjust_sve_callee_save_base (machine_mode mode
, rtx
&base_rtx
,
6886 rtx
&anchor_reg
, poly_int64
&offset
,
6889 if (maybe_ge (offset
, 8 * GET_MODE_SIZE (mode
)))
6891 /* This is the maximum valid offset of the anchor from the base.
6892 Lower values would be valid too. */
6893 poly_int64 anchor_offset
= 16 * GET_MODE_SIZE (mode
);
6896 anchor_reg
= gen_rtx_REG (Pmode
, STACK_CLASH_SVE_CFA_REGNUM
);
6897 emit_insn (gen_add3_insn (anchor_reg
, base_rtx
,
6898 gen_int_mode (anchor_offset
, Pmode
)));
6900 base_rtx
= anchor_reg
;
6901 offset
-= anchor_offset
;
6905 int pred_reg
= cfun
->machine
->frame
.spare_pred_reg
;
6906 emit_move_insn (gen_rtx_REG (VNx16BImode
, pred_reg
),
6907 CONSTM1_RTX (VNx16BImode
));
6908 ptrue
= gen_rtx_REG (VNx2BImode
, pred_reg
);
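/* Worked example (illustrative, not from the sources): with 256-bit SVE
   vectors (32-byte Z registers), an incoming offset of 10 vectors (320
   bytes) exceeds the 8-vector limit, so the anchor is placed at
   BASE + 16 * 32 bytes and the save then uses offset 320 - 512 = -192
   bytes, i.e. -6 vectors, which is within the [-8, 7] range that
   ST1D/LD1D can encode.  */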
6912 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6913 is saved at BASE + OFFSET. */
6916 aarch64_add_cfa_expression (rtx_insn
*insn
, rtx reg
,
6917 rtx base
, poly_int64 offset
)
6919 rtx mem
= gen_frame_mem (GET_MODE (reg
),
6920 plus_constant (Pmode
, base
, offset
));
6921 add_reg_note (insn
, REG_CFA_EXPRESSION
, gen_rtx_SET (mem
, reg
));
6924 /* Emit code to save the callee-saved registers from register number START
6925 to LIMIT to the stack at the location starting at offset START_OFFSET,
6926 skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P
6927 is true if the hard frame pointer has been set up. */
6930 aarch64_save_callee_saves (poly_int64 start_offset
,
6931 unsigned start
, unsigned limit
, bool skip_wb
,
6932 bool hard_fp_valid_p
)
6937 rtx anchor_reg
= NULL_RTX
, ptrue
= NULL_RTX
;
6939 for (regno
= aarch64_next_callee_save (start
, limit
);
6941 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
6945 bool frame_related_p
= aarch64_emit_cfi_for_reg_p (regno
);
6948 && (regno
== cfun
->machine
->frame
.wb_candidate1
6949 || regno
== cfun
->machine
->frame
.wb_candidate2
))
6952 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
6955 machine_mode mode
= aarch64_reg_save_mode (regno
);
6956 reg
= gen_rtx_REG (mode
, regno
);
6957 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
6958 rtx base_rtx
= stack_pointer_rtx
;
6959 poly_int64 sp_offset
= offset
;
6961 HOST_WIDE_INT const_offset
;
6962 if (mode
== VNx2DImode
&& BYTES_BIG_ENDIAN
)
6963 aarch64_adjust_sve_callee_save_base (mode
, base_rtx
, anchor_reg
,
6965 else if (GP_REGNUM_P (regno
)
6966 && (!offset
.is_constant (&const_offset
) || const_offset
>= 512))
6968 gcc_assert (known_eq (start_offset
, 0));
6969 poly_int64 fp_offset
6970 = cfun
->machine
->frame
.below_hard_fp_saved_regs_size
;
6971 if (hard_fp_valid_p
)
6972 base_rtx
= hard_frame_pointer_rtx
;
6977 anchor_reg
= gen_rtx_REG (Pmode
, STACK_CLASH_SVE_CFA_REGNUM
);
6978 emit_insn (gen_add3_insn (anchor_reg
, base_rtx
,
6979 gen_int_mode (fp_offset
, Pmode
)));
6981 base_rtx
= anchor_reg
;
6983 offset
-= fp_offset
;
6985 mem
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
6986 bool need_cfa_note_p
= (base_rtx
!= stack_pointer_rtx
);
6988 if (!aarch64_sve_mode_p (mode
)
6989 && (regno2
= aarch64_next_callee_save (regno
+ 1, limit
)) <= limit
6990 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
6991 && known_eq (GET_MODE_SIZE (mode
),
6992 cfun
->machine
->frame
.reg_offset
[regno2
]
6993 - cfun
->machine
->frame
.reg_offset
[regno
]))
6995 rtx reg2
= gen_rtx_REG (mode
, regno2
);
6998 offset
+= GET_MODE_SIZE (mode
);
6999 mem2
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
7000 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
,
7003 /* The first part of a frame-related parallel insn is
7004 always assumed to be relevant to the frame
7005 calculations; subsequent parts, are only
7006 frame-related if explicitly marked. */
7007 if (aarch64_emit_cfi_for_reg_p (regno2
))
7009 if (need_cfa_note_p
)
7010 aarch64_add_cfa_expression (insn
, reg2
, stack_pointer_rtx
,
7011 sp_offset
+ GET_MODE_SIZE (mode
));
7013 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
7018 else if (mode
== VNx2DImode
&& BYTES_BIG_ENDIAN
)
7020 insn
= emit_insn (gen_aarch64_pred_mov (mode
, mem
, ptrue
, reg
));
7021 need_cfa_note_p
= true;
7023 else if (aarch64_sve_mode_p (mode
))
7024 insn
= emit_insn (gen_rtx_SET (mem
, reg
));
7026 insn
= emit_move_insn (mem
, reg
);
7028 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
7029 if (frame_related_p
&& need_cfa_note_p
)
7030 aarch64_add_cfa_expression (insn
, reg
, stack_pointer_rtx
, sp_offset
);
7034 /* Emit code to restore the callee registers from register number START
7035 up to and including LIMIT. Restore from the stack offset START_OFFSET,
7036 skipping any write-back candidates if SKIP_WB is true. Write the
7037 appropriate REG_CFA_RESTORE notes into CFI_OPS. */
7040 aarch64_restore_callee_saves (poly_int64 start_offset
, unsigned start
,
7041 unsigned limit
, bool skip_wb
, rtx
*cfi_ops
)
7046 rtx anchor_reg
= NULL_RTX
, ptrue
= NULL_RTX
;
7048 for (regno
= aarch64_next_callee_save (start
, limit
);
7050 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
7052 bool frame_related_p
= aarch64_emit_cfi_for_reg_p (regno
);
7053 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
7059 && (regno
== cfun
->machine
->frame
.wb_candidate1
7060 || regno
== cfun
->machine
->frame
.wb_candidate2
))
7063 machine_mode mode
= aarch64_reg_save_mode (regno
);
7064 reg
= gen_rtx_REG (mode
, regno
);
7065 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
7066 rtx base_rtx
= stack_pointer_rtx
;
7067 if (mode
== VNx2DImode
&& BYTES_BIG_ENDIAN
)
7068 aarch64_adjust_sve_callee_save_base (mode
, base_rtx
, anchor_reg
,
7070 mem
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
7072 if (!aarch64_sve_mode_p (mode
)
7073 && (regno2
= aarch64_next_callee_save (regno
+ 1, limit
)) <= limit
7074 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
7075 && known_eq (GET_MODE_SIZE (mode
),
7076 cfun
->machine
->frame
.reg_offset
[regno2
]
7077 - cfun
->machine
->frame
.reg_offset
[regno
]))
7079 rtx reg2
= gen_rtx_REG (mode
, regno2
);
7082 offset
+= GET_MODE_SIZE (mode
);
7083 mem2
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
7084 emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
7086 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
7089 else if (mode
== VNx2DImode
&& BYTES_BIG_ENDIAN
)
7090 emit_insn (gen_aarch64_pred_mov (mode
, reg
, ptrue
, mem
));
7091 else if (aarch64_sve_mode_p (mode
))
7092 emit_insn (gen_rtx_SET (reg
, mem
));
7094 emit_move_insn (reg
, mem
);
7095 if (frame_related_p
)
7096 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg
, *cfi_ops
);
/* Return true if OFFSET is a signed 4-bit value multiplied by the size
   of MODE.  */
7104 offset_4bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
7106 HOST_WIDE_INT multiple
;
7107 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
7108 && IN_RANGE (multiple
, -8, 7));
/* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
   of MODE.  */
7115 offset_6bit_unsigned_scaled_p (machine_mode mode
, poly_int64 offset
)
7117 HOST_WIDE_INT multiple
;
7118 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
7119 && IN_RANGE (multiple
, 0, 63));
/* Return true if OFFSET is a signed 7-bit value multiplied by the size
   of MODE.  */
7126 aarch64_offset_7bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
7128 HOST_WIDE_INT multiple
;
7129 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
7130 && IN_RANGE (multiple
, -64, 63));
7133 /* Return true if OFFSET is a signed 9-bit value. */
7136 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED
,
7139 HOST_WIDE_INT const_offset
;
7140 return (offset
.is_constant (&const_offset
)
7141 && IN_RANGE (const_offset
, -256, 255));
/* Return true if OFFSET is a signed 9-bit value multiplied by the size
   of MODE.  */
7148 offset_9bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
7150 HOST_WIDE_INT multiple
;
7151 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
7152 && IN_RANGE (multiple
, -256, 255));
/* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
   of MODE.  */
7159 offset_12bit_unsigned_scaled_p (machine_mode mode
, poly_int64 offset
)
7161 HOST_WIDE_INT multiple
;
7162 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
7163 && IN_RANGE (multiple
, 0, 4095));
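/* Illustrative sketch (hypothetical helper, not GCC code): for plain integer
   offsets the checks above reduce to "the offset is a multiple of the mode
   size and the quotient fits the immediate field".  For example, an 8-byte
   mode with a signed 7-bit scaled field accepts offsets in [-512, 504] in
   steps of 8.  */
static inline int
example_offset_7bit_signed_scaled_p (long offset, long size)
{
  return offset % size == 0
	 && offset / size >= -64
	 && offset / size <= 63;
}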
7166 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
7169 aarch64_get_separate_components (void)
7171 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
7172 bitmap_clear (components
);
7174 /* The registers we need saved to the frame. */
7175 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
7176 if (aarch64_register_saved_on_entry (regno
))
7178 /* Punt on saves and restores that use ST1D and LD1D. We could
7179 try to be smarter, but it would involve making sure that the
7180 spare predicate register itself is safe to use at the save
7181 and restore points. Also, when a frame pointer is being used,
7182 the slots are often out of reach of ST1D and LD1D anyway. */
7183 machine_mode mode
= aarch64_reg_save_mode (regno
);
7184 if (mode
== VNx2DImode
&& BYTES_BIG_ENDIAN
)
7187 poly_int64 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
7189 /* If the register is saved in the first SVE save slot, we use
7190 it as a stack probe for -fstack-clash-protection. */
7191 if (flag_stack_clash_protection
7192 && maybe_ne (cfun
->machine
->frame
.below_hard_fp_saved_regs_size
, 0)
7193 && known_eq (offset
, 0))
7196 /* Get the offset relative to the register we'll use. */
7197 if (frame_pointer_needed
)
7198 offset
-= cfun
->machine
->frame
.below_hard_fp_saved_regs_size
;
7200 offset
+= crtl
->outgoing_args_size
;
7202 /* Check that we can access the stack slot of the register with one
7203 direct load with no adjustments needed. */
7204 if (aarch64_sve_mode_p (mode
)
7205 ? offset_9bit_signed_scaled_p (mode
, offset
)
7206 : offset_12bit_unsigned_scaled_p (mode
, offset
))
7207 bitmap_set_bit (components
, regno
);
7210 /* Don't mess with the hard frame pointer. */
7211 if (frame_pointer_needed
)
7212 bitmap_clear_bit (components
, HARD_FRAME_POINTER_REGNUM
);
7214 /* If the spare predicate register used by big-endian SVE code
7215 is call-preserved, it must be saved in the main prologue
7216 before any saves that use it. */
7217 if (cfun
->machine
->frame
.spare_pred_reg
!= INVALID_REGNUM
)
7218 bitmap_clear_bit (components
, cfun
->machine
->frame
.spare_pred_reg
);
7220 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
7221 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
7222 /* If registers have been chosen to be stored/restored with
7223 writeback don't interfere with them to avoid having to output explicit
7224 stack adjustment instructions. */
7225 if (reg2
!= INVALID_REGNUM
)
7226 bitmap_clear_bit (components
, reg2
);
7227 if (reg1
!= INVALID_REGNUM
)
7228 bitmap_clear_bit (components
, reg1
);
7230 bitmap_clear_bit (components
, LR_REGNUM
);
7231 bitmap_clear_bit (components
, SP_REGNUM
);
7236 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
7239 aarch64_components_for_bb (basic_block bb
)
7241 bitmap in
= DF_LIVE_IN (bb
);
7242 bitmap gen
= &DF_LIVE_BB_INFO (bb
)->gen
;
7243 bitmap kill
= &DF_LIVE_BB_INFO (bb
)->kill
;
7245 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
7246 bitmap_clear (components
);
7248 /* Clobbered registers don't generate values in any meaningful sense,
7249 since nothing after the clobber can rely on their value. And we can't
7250 say that partially-clobbered registers are unconditionally killed,
7251 because whether they're killed or not depends on the mode of the
7252 value they're holding. Thus partially call-clobbered registers
7253 appear in neither the kill set nor the gen set.
7255 Check manually for any calls that clobber more of a register than the
7256 current function can. */
7257 function_abi_aggregator callee_abis
;
7259 FOR_BB_INSNS (bb
, insn
)
7261 callee_abis
.note_callee_abi (insn_callee_abi (insn
));
7262 HARD_REG_SET extra_caller_saves
= callee_abis
.caller_save_regs (*crtl
->abi
);
7264 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
7265 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
7266 if (!fixed_regs
[regno
]
7267 && !crtl
->abi
->clobbers_full_reg_p (regno
)
7268 && (TEST_HARD_REG_BIT (extra_caller_saves
, regno
)
7269 || bitmap_bit_p (in
, regno
)
7270 || bitmap_bit_p (gen
, regno
)
7271 || bitmap_bit_p (kill
, regno
)))
7273 bitmap_set_bit (components
, regno
);
7275 /* If there is a callee-save at an adjacent offset, add it too
7276 to increase the use of LDP/STP. */
7277 poly_int64 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
7278 unsigned regno2
= multiple_p (offset
, 16) ? regno
+ 1 : regno
- 1;
7280 if (regno2
<= LAST_SAVED_REGNUM
)
7282 poly_int64 offset2
= cfun
->machine
->frame
.reg_offset
[regno2
];
7284 ? known_eq (offset
+ 8, offset2
)
7285 : multiple_p (offset2
, 16) && known_eq (offset2
+ 8, offset
))
7286 bitmap_set_bit (components
, regno2
);
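/* Illustrative sketch (hypothetical helper, not GCC code): the pairing rule
   above.  A register whose slot starts on a 16-byte boundary pairs with the
   next register if that one sits exactly 8 bytes higher; otherwise it pairs
   with the previous register if this slot sits 8 bytes above a
   16-byte-aligned one.  */
static inline int
example_ldp_pairable_p (long offset, long offset2)
{
  if (offset % 16 == 0)
    return offset2 == offset + 8;
  return offset2 % 16 == 0 && offset == offset2 + 8;
}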
7293 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
7294 Nothing to do for aarch64. */
7297 aarch64_disqualify_components (sbitmap
, edge
, sbitmap
, bool)
7301 /* Return the next set bit in BMP from START onwards. Return the total number
7302 of bits in BMP if no set bit is found at or after START. */
7305 aarch64_get_next_set_bit (sbitmap bmp
, unsigned int start
)
7307 unsigned int nbits
= SBITMAP_SIZE (bmp
);
7311 gcc_assert (start
< nbits
);
7312 for (unsigned int i
= start
; i
< nbits
; i
++)
7313 if (bitmap_bit_p (bmp
, i
))
7319 /* Do the work for aarch64_emit_prologue_components and
7320 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
7321 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
7322 for these components or the epilogue sequence. That is, it determines
7323 whether we should emit stores or loads and what kind of CFA notes to attach
to the insns.  Otherwise the logic for the two sequences is very
   similar.  */
7328 aarch64_process_components (sbitmap components
, bool prologue_p
)
7330 rtx ptr_reg
= gen_rtx_REG (Pmode
, frame_pointer_needed
7331 ? HARD_FRAME_POINTER_REGNUM
7332 : STACK_POINTER_REGNUM
);
7334 unsigned last_regno
= SBITMAP_SIZE (components
);
7335 unsigned regno
= aarch64_get_next_set_bit (components
, R0_REGNUM
);
7336 rtx_insn
*insn
= NULL
;
7338 while (regno
!= last_regno
)
7340 bool frame_related_p
= aarch64_emit_cfi_for_reg_p (regno
);
7341 machine_mode mode
= aarch64_reg_save_mode (regno
);
7343 rtx reg
= gen_rtx_REG (mode
, regno
);
7344 poly_int64 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
7345 if (frame_pointer_needed
)
7346 offset
-= cfun
->machine
->frame
.below_hard_fp_saved_regs_size
;
7348 offset
+= crtl
->outgoing_args_size
;
7350 rtx addr
= plus_constant (Pmode
, ptr_reg
, offset
);
7351 rtx mem
= gen_frame_mem (mode
, addr
);
7353 rtx set
= prologue_p
? gen_rtx_SET (mem
, reg
) : gen_rtx_SET (reg
, mem
);
7354 unsigned regno2
= aarch64_get_next_set_bit (components
, regno
+ 1);
7355 /* No more registers to handle after REGNO.
7356 Emit a single save/restore and exit. */
7357 if (regno2
== last_regno
)
7359 insn
= emit_insn (set
);
7360 if (frame_related_p
)
7362 RTX_FRAME_RELATED_P (insn
) = 1;
7364 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
7366 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
7371 poly_int64 offset2
= cfun
->machine
->frame
.reg_offset
[regno2
];
7372 /* The next register is not of the same class or its offset is not
7373 mergeable with the current one into a pair. */
7374 if (aarch64_sve_mode_p (mode
)
7375 || !satisfies_constraint_Ump (mem
)
7376 || GP_REGNUM_P (regno
) != GP_REGNUM_P (regno2
)
7377 || (crtl
->abi
->id () == ARM_PCS_SIMD
&& FP_REGNUM_P (regno
))
7378 || maybe_ne ((offset2
- cfun
->machine
->frame
.reg_offset
[regno
]),
7379 GET_MODE_SIZE (mode
)))
7381 insn
= emit_insn (set
);
7382 if (frame_related_p
)
7384 RTX_FRAME_RELATED_P (insn
) = 1;
7386 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
7388 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
7395 bool frame_related2_p
= aarch64_emit_cfi_for_reg_p (regno2
);
7397 /* REGNO2 can be saved/restored in a pair with REGNO. */
7398 rtx reg2
= gen_rtx_REG (mode
, regno2
);
7399 if (frame_pointer_needed
)
7400 offset2
-= cfun
->machine
->frame
.below_hard_fp_saved_regs_size
;
7402 offset2
+= crtl
->outgoing_args_size
;
7403 rtx addr2
= plus_constant (Pmode
, ptr_reg
, offset2
);
7404 rtx mem2
= gen_frame_mem (mode
, addr2
);
7405 rtx set2
= prologue_p
? gen_rtx_SET (mem2
, reg2
)
7406 : gen_rtx_SET (reg2
, mem2
);
7409 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
, reg2
));
7411 insn
= emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
7413 if (frame_related_p
|| frame_related2_p
)
7415 RTX_FRAME_RELATED_P (insn
) = 1;
7418 if (frame_related_p
)
7419 add_reg_note (insn
, REG_CFA_OFFSET
, set
);
7420 if (frame_related2_p
)
7421 add_reg_note (insn
, REG_CFA_OFFSET
, set2
);
7425 if (frame_related_p
)
7426 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
7427 if (frame_related2_p
)
7428 add_reg_note (insn
, REG_CFA_RESTORE
, reg2
);
7432 regno
= aarch64_get_next_set_bit (components
, regno2
+ 1);
7436 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
7439 aarch64_emit_prologue_components (sbitmap components
)
7441 aarch64_process_components (components
, true);
7444 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
7447 aarch64_emit_epilogue_components (sbitmap components
)
7449 aarch64_process_components (components
, false);
7452 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
7455 aarch64_set_handled_components (sbitmap components
)
7457 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
7458 if (bitmap_bit_p (components
, regno
))
7459 cfun
->machine
->reg_is_wrapped_separately
[regno
] = true;
7462 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
7463 determining the probe offset for alloca. */
7465 static HOST_WIDE_INT
7466 aarch64_stack_clash_protection_alloca_probe_range (void)
7468 return STACK_CLASH_CALLER_GUARD
;
7472 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
7473 registers. If POLY_SIZE is not large enough to require a probe this function
7474 will only adjust the stack. When allocating the stack space
7475 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
7476 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
7477 arguments. If we are then we ensure that any allocation larger than the ABI
7478 defined buffer needs a probe so that the invariant of having a 1KB buffer is
7481 We emit barriers after each stack adjustment to prevent optimizations from
7482 breaking the invariant that we never drop the stack more than a page. This
7483 invariant is needed to make it easier to correctly handle asynchronous
7484 events, e.g. if we were to allow the stack to be dropped by more than a page
7485 and then have multiple probes up and we take a signal somewhere in between
7486 then the signal handler doesn't know the state of the stack and can make no
7487 assumptions about which pages have been probed. */
7490 aarch64_allocate_and_probe_stack_space (rtx temp1
, rtx temp2
,
7491 poly_int64 poly_size
,
7492 bool frame_related_p
,
7493 bool final_adjustment_p
)
7495 HOST_WIDE_INT guard_size
7496 = 1 << param_stack_clash_protection_guard_size
;
7497 HOST_WIDE_INT guard_used_by_caller
= STACK_CLASH_CALLER_GUARD
;
7498 HOST_WIDE_INT min_probe_threshold
7499 = (final_adjustment_p
7500 ? guard_used_by_caller
7501 : guard_size
- guard_used_by_caller
);
7502 /* When doing the final adjustment for the outgoing arguments, take into
7503 account any unprobed space there is above the current SP. There are
7506 - When saving SVE registers below the hard frame pointer, we force
7507 the lowest save to take place in the prologue before doing the final
7508 adjustment (i.e. we don't allow the save to be shrink-wrapped).
7509 This acts as a probe at SP, so there is no unprobed space.
7511 - When there are no SVE register saves, we use the store of the link
7512 register as a probe. We can't assume that LR was saved at position 0
7513 though, so treat any space below it as unprobed. */
7514 if (final_adjustment_p
7515 && known_eq (cfun
->machine
->frame
.below_hard_fp_saved_regs_size
, 0))
7517 poly_int64 lr_offset
= cfun
->machine
->frame
.reg_offset
[LR_REGNUM
];
7518 if (known_ge (lr_offset
, 0))
7519 min_probe_threshold
-= lr_offset
.to_constant ();
7521 gcc_assert (!flag_stack_clash_protection
|| known_eq (poly_size
, 0));
7524 poly_int64 frame_size
= cfun
->machine
->frame
.frame_size
;
7526 /* We should always have a positive probe threshold. */
7527 gcc_assert (min_probe_threshold
> 0);
7529 if (flag_stack_clash_protection
&& !final_adjustment_p
)
7531 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
7532 poly_int64 sve_callee_adjust
= cfun
->machine
->frame
.sve_callee_adjust
;
7533 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
7535 if (known_eq (frame_size
, 0))
7537 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME
, false);
7539 else if (known_lt (initial_adjust
+ sve_callee_adjust
,
7540 guard_size
- guard_used_by_caller
)
7541 && known_lt (final_adjust
, guard_used_by_caller
))
7543 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME
, true);
7547 /* If SIZE is not large enough to require probing, just adjust the stack and
7549 if (known_lt (poly_size
, min_probe_threshold
)
7550 || !flag_stack_clash_protection
)
7552 aarch64_sub_sp (temp1
, temp2
, poly_size
, frame_related_p
);
7557 /* Handle the SVE non-constant case first. */
7558 if (!poly_size
.is_constant (&size
))
7562 fprintf (dump_file
, "Stack clash SVE prologue: ");
7563 print_dec (poly_size
, dump_file
);
7564 fprintf (dump_file
, " bytes, dynamic probing will be required.\n");
7567 /* First calculate the amount of bytes we're actually spilling. */
7568 aarch64_add_offset (Pmode
, temp1
, CONST0_RTX (Pmode
),
7569 poly_size
, temp1
, temp2
, false, true);
7571 rtx_insn
*insn
= get_last_insn ();
7573 if (frame_related_p
)
7575 /* This is done to provide unwinding information for the stack
7576 adjustments we're about to do, however to prevent the optimizers
7577 from removing the R11 move and leaving the CFA note (which would be
7578 very wrong) we tie the old and new stack pointer together.
7579 The tie will expand to nothing but the optimizers will not touch
7581 rtx stack_ptr_copy
= gen_rtx_REG (Pmode
, STACK_CLASH_SVE_CFA_REGNUM
);
7582 emit_move_insn (stack_ptr_copy
, stack_pointer_rtx
);
7583 emit_insn (gen_stack_tie (stack_ptr_copy
, stack_pointer_rtx
));
7585 /* We want the CFA independent of the stack pointer for the
7586 duration of the loop. */
7587 add_reg_note (insn
, REG_CFA_DEF_CFA
, stack_ptr_copy
);
7588 RTX_FRAME_RELATED_P (insn
) = 1;
7591 rtx probe_const
= gen_int_mode (min_probe_threshold
, Pmode
);
7592 rtx guard_const
= gen_int_mode (guard_size
, Pmode
);
7594 insn
= emit_insn (gen_probe_sve_stack_clash (Pmode
, stack_pointer_rtx
,
7595 stack_pointer_rtx
, temp1
,
7596 probe_const
, guard_const
));
7598 /* Now reset the CFA register if needed. */
7599 if (frame_related_p
)
7601 add_reg_note (insn
, REG_CFA_DEF_CFA
,
7602 gen_rtx_PLUS (Pmode
, stack_pointer_rtx
,
7603 gen_int_mode (poly_size
, Pmode
)));
7604 RTX_FRAME_RELATED_P (insn
) = 1;
7612 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
7613 " bytes, probing will be required.\n", size
);
7615 /* Round size to the nearest multiple of guard_size, and calculate the
7616 residual as the difference between the original size and the rounded
7618 HOST_WIDE_INT rounded_size
= ROUND_DOWN (size
, guard_size
);
7619 HOST_WIDE_INT residual
= size
- rounded_size
;
7621 /* We can handle a small number of allocations/probes inline. Otherwise
7623 if (rounded_size
<= STACK_CLASH_MAX_UNROLL_PAGES
* guard_size
)
7625 for (HOST_WIDE_INT i
= 0; i
< rounded_size
; i
+= guard_size
)
7627 aarch64_sub_sp (NULL
, temp2
, guard_size
, true);
7628 emit_stack_probe (plus_constant (Pmode
, stack_pointer_rtx
,
7629 guard_used_by_caller
));
7630 emit_insn (gen_blockage ());
7632 dump_stack_clash_frame_info (PROBE_INLINE
, size
!= rounded_size
);
7636 /* Compute the ending address. */
7637 aarch64_add_offset (Pmode
, temp1
, stack_pointer_rtx
, -rounded_size
,
7638 temp1
, NULL
, false, true);
7639 rtx_insn
*insn
= get_last_insn ();
7641 /* For the initial allocation, we don't have a frame pointer
7642 set up, so we always need CFI notes. If we're doing the
7643 final allocation, then we may have a frame pointer, in which
7644 case it is the CFA, otherwise we need CFI notes.
7646 We can determine which allocation we are doing by looking at
7647 the value of FRAME_RELATED_P since the final allocations are not
7649 if (frame_related_p
)
7651 /* We want the CFA independent of the stack pointer for the
7652 duration of the loop. */
7653 add_reg_note (insn
, REG_CFA_DEF_CFA
,
7654 plus_constant (Pmode
, temp1
, rounded_size
));
7655 RTX_FRAME_RELATED_P (insn
) = 1;
7658 /* This allocates and probes the stack. Note that this re-uses some of
7659 the existing Ada stack protection code. However we are guaranteed not
7660 to enter the non loop or residual branches of that code.
7662 The non-loop part won't be entered because if our allocation amount
7663 doesn't require a loop, the case above would handle it.
7665 The residual amount won't be entered because TEMP1 is a mutliple of
7666 the allocation size. The residual will always be 0. As such, the only
7667 part we are actually using from that code is the loop setup. The
7668 actual probing is done in aarch64_output_probe_stack_range. */
7669 insn
= emit_insn (gen_probe_stack_range (stack_pointer_rtx
,
7670 stack_pointer_rtx
, temp1
));
7672 /* Now reset the CFA register if needed. */
7673 if (frame_related_p
)
7675 add_reg_note (insn
, REG_CFA_DEF_CFA
,
7676 plus_constant (Pmode
, stack_pointer_rtx
, rounded_size
));
7677 RTX_FRAME_RELATED_P (insn
) = 1;
7680 emit_insn (gen_blockage ());
7681 dump_stack_clash_frame_info (PROBE_LOOP
, size
!= rounded_size
);
7684 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
7685 be probed. This maintains the requirement that each page is probed at
7686 least once. For initial probing we probe only if the allocation is
7687 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
7688 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
7689 GUARD_SIZE. This works that for any allocation that is large enough to
7690 trigger a probe here, we'll have at least one, and if they're not large
7691 enough for this code to emit anything for them, The page would have been
7692 probed by the saving of FP/LR either by this function or any callees. If
7693 we don't have any callees then we won't have more stack adjustments and so
7697 HOST_WIDE_INT residual_probe_offset
= guard_used_by_caller
;
7698 /* If we're doing final adjustments, and we've done any full page
7699 allocations then any residual needs to be probed. */
7700 if (final_adjustment_p
&& rounded_size
!= 0)
7701 min_probe_threshold
= 0;
7702 /* If doing a small final adjustment, we always probe at offset 0.
7703 This is done to avoid issues when LR is not at position 0 or when
7704 the final adjustment is smaller than the probing offset. */
7705 else if (final_adjustment_p
&& rounded_size
== 0)
7706 residual_probe_offset
= 0;
7708 aarch64_sub_sp (temp1
, temp2
, residual
, frame_related_p
);
7709 if (residual
>= min_probe_threshold
)
7713 "Stack clash AArch64 prologue residuals: "
7714 HOST_WIDE_INT_PRINT_DEC
" bytes, probing will be required."
7717 emit_stack_probe (plus_constant (Pmode
, stack_pointer_rtx
,
7718 residual_probe_offset
));
7719 emit_insn (gen_blockage ());
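/* As an illustration of the arithmetic above (an assumed example, using the
   default parameters): with a 64KB guard and a 1KB caller guard
   (STACK_CLASH_CALLER_GUARD), an initial allocation of 80KB gives
   ROUND_DOWN (81920, 65536) == 65536, so rounded_size == 65536 and
   residual == 16384.  The single 64KB page is allocated and probed inline
   at the 1KB caller-guard offset; the 16KB residual is then allocated but
   not probed here, because it is below the probe threshold for the initial
   allocation - the subsequent FP/LR saves act as the implicit probe.  */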
/* Return 1 if the register is used by the epilogue.  We need to say the
   return register is used, but only after epilogue generation is complete.
   Note that in the case of sibcalls, the values "used by the epilogue" are
   considered live at the start of the called function.

   For SIMD functions we need to return 1 for FP registers that are saved and
   restored by a function but are not zero in call_used_regs.  If we do not do
   this, optimizations may remove the restore of the register.  */

int
aarch64_epilogue_uses (int regno)
{
  if (epilogue_completed)
    {
      if (regno == LR_REGNUM)
	return 1;
    }
  return 0;
}
/* AArch64 stack frames generated by this compiler look like:

	+-------------------------------+
	|                               |
	|  incoming stack arguments     |
	|                               |
	+-------------------------------+
	|                               | <-- incoming stack pointer (aligned)
	|  callee-allocated save area   |
	|  for register varargs         |
	|                               |
	+-------------------------------+
	|  local variables              | <-- frame_pointer_rtx
	|                               |
	+-------------------------------+
	|  padding                      | \
	+-------------------------------+  |
	|  callee-saved registers       |  | frame.saved_regs_size
	+-------------------------------+  |
	|  LR'                          |  |
	+-------------------------------+  |
	|  FP'                          |  |
	+-------------------------------+  |<- hard_frame_pointer_rtx (aligned)
	|  SVE vector registers         |  | \
	+-------------------------------+  |  | below_hard_fp_saved_regs_size
	|  SVE predicate registers      | /  /
	+-------------------------------+
	|  dynamic allocation           |
	+-------------------------------+
	|  padding                      |
	+-------------------------------+
	|  outgoing stack arguments     | <-- arg_pointer
	|                               |
	+-------------------------------+
	|                               | <-- stack_pointer_rtx (aligned)

   Dynamic stack allocations via alloca() decrease stack_pointer_rtx
   but leave frame_pointer_rtx and hard_frame_pointer_rtx unchanged.

   By default for stack-clash we assume the guard is at least 64KB, but this
   value is configurable to either 4KB or 64KB.  We also force the guard size
   to be the same as the probing interval and both values are kept in sync.

   With those assumptions the callee can allocate up to 63KB (or 3KB depending
   on the guard size) of stack space without probing.

   When probing is needed, we emit a probe at the start of the prologue
   and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.

   We have to track how much space has been allocated and the only stores
   to the stack we track as implicit probes are the FP/LR stores.

   For outgoing arguments we probe if the size is larger than 1KB, such that
   the ABI specified buffer is maintained for the next callee.

   The following registers are reserved during frame layout and should not be
   used for any other purpose:

   - r11: Used by stack clash protection when SVE is enabled, and also
	  as an anchor register when saving and restoring registers
   - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
   - r14 and r15: Used for speculation tracking.
   - r16(IP0), r17(IP1): Used by indirect tailcalls.
   - r30(LR), r29(FP): Used by standard frame layout.

   These registers must be avoided in frame layout related code unless the
   explicit intention is to interact with one of the features listed above.  */
/* Generate the prologue instructions for entry into a function.
   Establish the stack frame by decreasing the stack pointer with a
   properly calculated size and, if necessary, create a frame record
   filled with the values of LR and previous frame pointer.  The
   current FP is also set up if it is in use.  */

void
aarch64_expand_prologue (void)
{
  poly_int64 frame_size = cfun->machine->frame.frame_size;
  poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
  poly_int64 final_adjust = cfun->machine->frame.final_adjust;
  poly_int64 callee_offset = cfun->machine->frame.callee_offset;
  poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
  poly_int64 below_hard_fp_saved_regs_size
    = cfun->machine->frame.below_hard_fp_saved_regs_size;
  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
  rtx_insn *insn;

  if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
    {
      /* Fold the SVE allocation into the initial allocation.
	 We don't do this in aarch64_layout_arg to avoid pessimizing
	 the epilogue code.  */
      initial_adjust += sve_callee_adjust;
      sve_callee_adjust = 0;
    }

  /* Sign return address for functions.  */
  if (aarch64_return_address_signing_enabled ())
    {
      switch (aarch64_ra_sign_key)
	{
	case AARCH64_KEY_A:
	  insn = emit_insn (gen_paciasp ());
	  break;
	case AARCH64_KEY_B:
	  insn = emit_insn (gen_pacibsp ());
	  break;
	default:
	  gcc_unreachable ();
	}
      add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  if (flag_stack_usage_info)
    current_function_static_stack_size = constant_lower_bound (frame_size);

  if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
    {
      if (crtl->is_leaf && !cfun->calls_alloca)
	{
	  if (maybe_gt (frame_size, PROBE_INTERVAL)
	      && maybe_gt (frame_size, get_stack_check_protect ()))
	    aarch64_emit_probe_stack_range (get_stack_check_protect (),
					    (frame_size
					     - get_stack_check_protect ()));
	}
      else if (maybe_gt (frame_size, 0))
	aarch64_emit_probe_stack_range (get_stack_check_protect (),
					frame_size);
    }

  rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
  rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);

  /* In theory we should never have both an initial adjustment
     and a callee save adjustment.  Verify that is the case since the
     code below does not handle it for -fstack-clash-protection.  */
  gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);

  /* Will only probe if the initial adjustment is larger than the guard
     less the amount of the guard reserved for use by the caller's
     outgoing args.  */
  aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
					  true, false);

  if (callee_adjust != 0)
    aarch64_push_regs (reg1, reg2, callee_adjust);

  /* The offset of the frame chain record (if any) from the current SP.  */
  poly_int64 chain_offset = (initial_adjust + callee_adjust
			     - cfun->machine->frame.hard_fp_offset);
  gcc_assert (known_ge (chain_offset, 0));

  /* The offset of the bottom of the save area from the current SP.  */
  poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;

  if (emit_frame_chain)
    {
      if (callee_adjust == 0)
	{
	  reg1 = R29_REGNUM;
	  reg2 = R30_REGNUM;
	  aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
				     false, false);
	}
      else
	gcc_assert (known_eq (chain_offset, 0));
      aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
			  stack_pointer_rtx, chain_offset,
			  tmp1_rtx, tmp0_rtx, frame_pointer_needed);
      if (frame_pointer_needed && !frame_size.is_constant ())
	{
	  /* Variable-sized frames need to describe the save slot
	     address using DW_CFA_expression rather than DW_CFA_offset.
	     This means that, without taking further action, the
	     locations of the registers that we've already saved would
	     remain based on the stack pointer even after we redefine
	     the CFA based on the frame pointer.  We therefore need new
	     DW_CFA_expressions to re-express the save slots with addresses
	     based on the frame pointer.  */
	  rtx_insn *insn = get_last_insn ();
	  gcc_assert (RTX_FRAME_RELATED_P (insn));

	  /* Add an explicit CFA definition if this was previously
	     implicit.  */
	  if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
	    {
	      rtx src = plus_constant (Pmode, stack_pointer_rtx,
				       chain_offset);
	      add_reg_note (insn, REG_CFA_ADJUST_CFA,
			    gen_rtx_SET (hard_frame_pointer_rtx, src));
	    }

	  /* Change the save slot expressions for the registers that
	     we've already saved.  */
	  aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
				      hard_frame_pointer_rtx,
				      UNITS_PER_WORD);
	  aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
				      hard_frame_pointer_rtx, 0);
	}
      emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
    }

  aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
			     callee_adjust != 0 || emit_frame_chain,
			     emit_frame_chain);
  if (maybe_ne (sve_callee_adjust, 0))
    {
      gcc_assert (!flag_stack_clash_protection
		  || known_eq (initial_adjust, 0));
      aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
					      sve_callee_adjust,
					      !frame_pointer_needed, false);
      saved_regs_offset += sve_callee_adjust;
    }
  aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
			     false, emit_frame_chain);
  aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
			     callee_adjust != 0 || emit_frame_chain,
			     emit_frame_chain);

  /* We may need to probe the final adjustment if it is larger than the guard
     that is assumed by the callee.  */
  aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
					  !frame_pointer_needed, true);
}
/* Return TRUE if we can use a simple_return insn.

   This function checks whether the callee-saved stack is empty, which
   means no restore actions are needed.  The pro_and_epilogue pass uses
   this to decide whether shrink-wrapping is feasible.  */

bool
aarch64_use_return_insn_p (void)
{
  if (!reload_completed)
    return false;

  if (crtl->profile)
    return false;

  return known_eq (cfun->machine->frame.frame_size, 0);
}
/* Generate the epilogue instructions for returning from a function.
   This is almost exactly the reverse of the prologue sequence, except
   that we need to insert barriers to avoid scheduling loads that read
   from a deallocated stack, and we optimize the unwind records by
   emitting them all together if possible.  */
void
aarch64_expand_epilogue (bool for_sibcall)
{
  poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
  poly_int64 final_adjust = cfun->machine->frame.final_adjust;
  poly_int64 callee_offset = cfun->machine->frame.callee_offset;
  poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
  poly_int64 below_hard_fp_saved_regs_size
    = cfun->machine->frame.below_hard_fp_saved_regs_size;
  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  rtx cfi_ops = NULL;
  rtx_insn *insn;

  /* A stack clash protection prologue may not have left EP0_REGNUM or
     EP1_REGNUM in a usable state.  The same is true for allocations
     with an SVE component, since we then need both temporary registers
     for each allocation.  For stack clash we are in a usable state if
     the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER.  */
  HOST_WIDE_INT guard_size
    = 1 << param_stack_clash_protection_guard_size;
  HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;

  /* We can re-use the registers when:

     (a) the deallocation amount is the same as the corresponding
	 allocation amount (which is false if we combine the initial
	 and SVE callee save allocations in the prologue); and

     (b) the allocation amount doesn't need a probe (which is false
	 if the amount is guard_size - guard_used_by_caller or greater).

     In such situations the register should remain live with the correct
     value.  */
  bool can_inherit_p = (initial_adjust.is_constant ()
			&& final_adjust.is_constant ()
			&& (!flag_stack_clash_protection
			    || (known_lt (initial_adjust,
					  guard_size - guard_used_by_caller)
				&& known_eq (sve_callee_adjust, 0))));

  /* We need a memory barrier to prevent reads from the deallocated stack.  */
  bool need_barrier_p
    = maybe_ne (get_frame_size ()
		+ cfun->machine->frame.saved_varargs_size, 0);

  /* Emit a barrier to prevent loads from a deallocated stack.  */
  if (maybe_gt (final_adjust, crtl->outgoing_args_size)
      || cfun->calls_alloca
      || crtl->calls_eh_return)
    {
      emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
      need_barrier_p = false;
    }

  /* Restore the stack pointer from the frame pointer if it may not
     be the same as the stack pointer.  */
  rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
  rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
  if (frame_pointer_needed
      && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
    /* If writeback is used when restoring callee-saves, the CFA
       is restored on the instruction doing the writeback.  */
    aarch64_add_offset (Pmode, stack_pointer_rtx,
			hard_frame_pointer_rtx,
			-callee_offset - below_hard_fp_saved_regs_size,
			tmp1_rtx, tmp0_rtx, callee_adjust == 0);
  else
    /* The case where we need to re-use the register here is very rare, so
       avoid the complicated condition and just always emit a move if the
       immediate doesn't fit.  */
    aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);

  /* Restore the vector registers before the predicate registers,
     so that we can use P4 as a temporary for big-endian SVE frames.  */
  aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
				callee_adjust != 0, &cfi_ops);
  aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
				false, &cfi_ops);
  if (maybe_ne (sve_callee_adjust, 0))
    aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
  aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
				R0_REGNUM, R30_REGNUM,
				callee_adjust != 0, &cfi_ops);

  if (need_barrier_p)
    emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));

  if (callee_adjust != 0)
    aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);

  if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
    {
      /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
      insn = get_last_insn ();
      rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
      REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
      RTX_FRAME_RELATED_P (insn) = 1;
      cfi_ops = NULL;
    }

  /* Liveness of EP0_REGNUM cannot be trusted across function calls either,
     so restrict the emit_move optimization to leaf functions.  */
  aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
		  (!can_inherit_p || !crtl->is_leaf
		   || df_regs_ever_live_p (EP0_REGNUM)));

  if (cfi_ops)
    {
      /* Emit delayed restores and reset the CFA to be SP.  */
      insn = get_last_insn ();
      cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
      REG_NOTES (insn) = cfi_ops;
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  /* We prefer to emit the combined return/authenticate instruction RETAA,
     however there are three cases in which we must instead emit an explicit
     authentication instruction.

	1) Sibcalls don't return in a normal way, so if we're about to call
	   one we must authenticate.

	2) The RETAA instruction is not available before ARMv8.3-A, so if we
	   are generating code for !TARGET_ARMV8_3 we can't use it and must
	   explicitly authenticate.

	3) On an eh_return path we make extra stack adjustments to update the
	   canonical frame address to be the exception handler's CFA.  We want
	   to authenticate using the CFA of the function which calls
	   eh_return.  */
  if (aarch64_return_address_signing_enabled ()
      && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
    {
      switch (aarch64_ra_sign_key)
	{
	case AARCH64_KEY_A:
	  insn = emit_insn (gen_autiasp ());
	  break;
	case AARCH64_KEY_B:
	  insn = emit_insn (gen_autibsp ());
	  break;
	default:
	  gcc_unreachable ();
	}
      add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  /* Stack adjustment for exception handler.  */
  if (crtl->calls_eh_return && !for_sibcall)
    {
      /* We need to unwind the stack by the offset computed by
	 EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
	 to be SP; letting the CFA move during this adjustment
	 is just as correct as retaining the CFA from the body
	 of the function.  Therefore, do nothing special.  */
      emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
    }

  emit_use (gen_rtx_REG (DImode, LR_REGNUM));
  if (!for_sibcall)
    emit_jump_insn (ret_rtx);
}
/* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
   normally or return to a previous frame after unwinding.

   An EH return uses a single shared return sequence.  The epilogue is
   exactly like a normal epilogue except that it has an extra input
   register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
   that must be applied after the frame has been destroyed.  An extra label
   is inserted before the epilogue which initializes this register to zero,
   and this is the entry point for a normal return.

   An actual EH return updates the return address, initializes the stack
   adjustment and jumps directly into the epilogue (bypassing the zeroing
   of the adjustment).  Since the return address is typically saved on the
   stack when a function makes a call, the saved LR must be updated outside
   the epilogue.

   This poses problems as the store is generated well before the epilogue,
   so the offset of LR is not known yet.  Also optimizations will remove the
   store as it appears dead, even after the epilogue is generated (as the
   base or offset for loading LR is different in many cases).

   To avoid these problems this implementation forces the frame pointer
   in eh_return functions so that the location of LR is fixed and known
   early.  It also marks the store volatile, so no optimization is permitted
   to remove the store.  */
rtx
aarch64_eh_return_handler_rtx (void)
{
  rtx tmp = gen_frame_mem (Pmode,
    plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));

  /* Mark the store volatile, so no optimization is permitted to remove it.  */
  MEM_VOLATILE_P (tmp) = true;
  return tmp;
}
/* Output code to add DELTA to the first argument, and then jump
   to FUNCTION.  Used for C++ multiple inheritance.  */
static void
aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
			 HOST_WIDE_INT delta,
			 HOST_WIDE_INT vcall_offset,
			 tree function)
{
  /* The this pointer is always in x0.  Note that this differs from
     Arm where the this pointer may be bumped to r1 if r0 is required
     to return a pointer to an aggregate.  On AArch64 a result value
     pointer will be in x8.  */
  int this_regno = R0_REGNUM;
  rtx this_rtx, temp0, temp1, addr, funexp;
  rtx_insn *insn;
  const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));

  if (aarch64_bti_enabled ())
    emit_insn (gen_bti_c ());

  reload_completed = 1;
  emit_note (NOTE_INSN_PROLOGUE_END);

  this_rtx = gen_rtx_REG (Pmode, this_regno);
  temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
  temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);

  if (vcall_offset == 0)
    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0,
			false);
  else
    {
      gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);

      addr = this_rtx;
      if (delta != 0)
	{
	  if (delta >= -256 && delta < 256)
	    addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
				       plus_constant (Pmode, this_rtx,
						      delta));
	  else
	    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
				temp1, temp0, false);
	}

      if (Pmode == ptr_mode)
	aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
      else
	aarch64_emit_move (temp0,
			   gen_rtx_ZERO_EXTEND (Pmode,
						gen_rtx_MEM (ptr_mode,
							     addr)));

      if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
	addr = plus_constant (Pmode, temp0, vcall_offset);
      else
	{
	  aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset),
					  true, Pmode);
	  addr = gen_rtx_PLUS (Pmode, temp0, temp1);
	}

      if (Pmode == ptr_mode)
	aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
      else
	aarch64_emit_move (temp1,
			   gen_rtx_SIGN_EXTEND (Pmode,
						gen_rtx_MEM (ptr_mode,
							     addr)));

      emit_insn (gen_add2_insn (this_rtx, temp1));
    }

  /* Generate a tail call to the target function.  */
  if (!TREE_USED (function))
    {
      assemble_external (function);
      TREE_USED (function) = 1;
    }
  funexp = XEXP (DECL_RTL (function), 0);
  funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
  rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
  insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
  SIBLING_CALL_P (insn) = 1;

  insn = get_insns ();
  shorten_branches (insn);

  assemble_start_function (thunk, fnname);
  final_start_function (insn, file, 1);
  final (insn, file, 1);
  final_end_function ();
  assemble_end_function (thunk, fnname);

  /* Stop pretending to be a post-reload pass.  */
  reload_completed = 0;
}
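/* As an illustration (assumed output, not taken from an actual compiler run):
   for a thunk with DELTA == 8 and VCALL_OFFSET == 0, the code above would
   typically assemble to something like

	add	x0, x0, 8
	b	<function>

   i.e. the this pointer in x0 is adjusted and control tail-calls FUNCTION.  */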
static bool
aarch64_tls_referenced_p (rtx x)
{
  if (!TARGET_HAVE_TLS)
    return false;
  subrtx_iterator::array_type array;
  FOR_EACH_SUBRTX (iter, array, x, ALL)
    {
      const_rtx x = *iter;
      if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
	return true;
      /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
	 TLS offsets, not real symbol references.  */
      if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
	iter.skip_subrtxes ();
    }
  return false;
}
/* Return true if val can be encoded as a 12-bit unsigned immediate with
   a left shift of 0 or 12 bits.  */
bool
aarch64_uimm12_shift (HOST_WIDE_INT val)
{
  return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
	  || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val);
}
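/* For example, 0x00000ABC (0xABC << 0) and 0x00ABC000 (0xABC << 12) both
   satisfy this test, whereas 0x0000ABC0 does not, because its set bits
   straddle the two 12-bit fields.  */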
/* Returns the nearest value to VAL that will fit as a 12-bit unsigned
   immediate that can be created with a left shift of 0 or 12.  */
static HOST_WIDE_INT
aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
{
  /* Check to see if the value fits in 24 bits, as that is the maximum we can
     handle correctly.  */
  gcc_assert ((val & 0xffffff) == val);

  if (((val & 0xfff) << 0) == val)
    return val;

  return val & (0xfff << 12);
}
/* Return true if val is an immediate that can be loaded into a
   register by a MOVZ instruction.  */
static bool
aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
{
  if (GET_MODE_SIZE (mode) > 4)
    {
      if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
	  || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
	return 1;
    }
  else
    {
      /* Ignore sign extension.  */
      val &= (HOST_WIDE_INT) 0xffffffff;
    }
  return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
	  || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
}
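/* For example, 0x0000000012340000 (0x1234 << 16) and 0x5678000000000000
   (0x5678 << 48) can each be materialized with a single
   "movz xN, #imm16, lsl #shift", whereas 0x12345678 cannot, since its set
   bits span two 16-bit fields.  */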
8361 X = (X & AND_VAL) | IOR_VAL;
8363 can be implemented using:
8365 MOVK X, #(IOR_VAL >> shift), LSL #shift
8367 Return the shift if so, otherwise return -1. */
8369 aarch64_movk_shift (const wide_int_ref
&and_val
,
8370 const wide_int_ref
&ior_val
)
8372 unsigned int precision
= and_val
.get_precision ();
8373 unsigned HOST_WIDE_INT mask
= 0xffff;
8374 for (unsigned int shift
= 0; shift
< precision
; shift
+= 16)
8376 if (and_val
== ~mask
&& (ior_val
& mask
) == ior_val
)
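/* For example, AND_VAL == 0xffffffffffff0000 (i.e. ~0xffff) together with
   IOR_VAL == 0x1234 yields shift 0, so the pair can be implemented as
   "movk xN, #0x1234, lsl #0"; if no aligned 16-bit field matches, -1 is
   returned and a MOVK cannot be used.  */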
/* VAL is a value with the inner mode of MODE.  Replicate it to fill a
   64-bit (DImode) integer.  */

static unsigned HOST_WIDE_INT
aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
{
  unsigned int size = GET_MODE_UNIT_PRECISION (mode);
  while (size < 64)
    {
      val &= (HOST_WIDE_INT_1U << size) - 1;
      val |= val << size;
      size *= 2;
    }
  return val;
}
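/* For example, replicating the QImode value 0xAB gives
   0xABABABABABABABAB, and replicating the HImode value 0x00FF gives
   0x00FF00FF00FF00FF.  */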
/* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */

static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
  {
    0x0000000100000001ull,
    0x0001000100010001ull,
    0x0101010101010101ull,
    0x1111111111111111ull,
    0x5555555555555555ull,
  };


/* Return true if val is a valid bitmask immediate.  */

bool
aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
{
  unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
  int bits;

  /* Check for a single sequence of one bits and return quickly if so.
     The special cases of all ones and all zeroes returns false.  */
  val = aarch64_replicate_bitmask_imm (val_in, mode);
  tmp = val + (val & -val);

  if (tmp == (tmp & -tmp))
    return (val + 1) > 1;

  /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
  if (mode == SImode)
    val = (val << 32) | (val & 0xffffffff);

  /* Invert if the immediate doesn't start with a zero bit - this means we
     only need to search for sequences of one bits.  */
  if (val & 1)
    val = ~val;

  /* Find the first set bit and set tmp to val with the first sequence of one
     bits removed.  Return success if there is a single sequence of ones.  */
  first_one = val & -val;
  tmp = val & (val + first_one);

  if (tmp == 0)
    return true;

  /* Find the next set bit and compute the difference in bit position.  */
  next_one = tmp & -tmp;
  bits = clz_hwi (first_one) - clz_hwi (next_one);
  mask = val ^ tmp;

  /* Check the bit position difference is a power of 2, and that the first
     sequence of one bits fits within 'bits' bits.  */
  if ((mask >> bits) != 0 || bits != (bits & -bits))
    return false;

  /* Check the sequence of one bits is repeated 64/bits times.  */
  return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
}
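/* For example, 0x0000ffff0000ffff (sixteen consecutive ones repeated in
   each 32-bit half) is accepted, as is 0x3f (a single run of ones), whereas
   0x1234 is rejected because its set bits do not form one contiguous run
   repeated across the value.  */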
/* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
   Assumed precondition: VAL_IN is not zero.  */

unsigned HOST_WIDE_INT
aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
{
  int lowest_bit_set = ctz_hwi (val_in);
  int highest_bit_set = floor_log2 (val_in);
  gcc_assert (val_in != 0);

  return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
	  (HOST_WIDE_INT_1U << lowest_bit_set));
}

/* Create constant where bits outside of lowest bit set to highest bit set
   are set to 1.  */

unsigned HOST_WIDE_INT
aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
{
  return val_in | ~aarch64_and_split_imm1 (val_in);
}
/* Return true if VAL_IN is a valid 'and' bitmask immediate.  */

bool
aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
{
  scalar_int_mode int_mode;
  if (!is_a <scalar_int_mode> (mode, &int_mode))
    return false;

  if (aarch64_bitmask_imm (val_in, int_mode))
    return false;

  if (aarch64_move_imm (val_in, int_mode))
    return false;

  unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);

  return aarch64_bitmask_imm (imm2, int_mode);
}
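/* A worked example (illustrative): a 32-bit AND with 0x00ffc00f cannot use a
   single bitmask immediate, but aarch64_and_split_imm1 produces the mask
   0x00ffffff (covering the lowest to highest set bit) and, viewed in 32 bits,
   aarch64_and_split_imm2 produces 0xffffc00f, which is a rotated contiguous
   run of ones and hence a valid bitmask immediate.  The operation can then be
   performed as two ANDs: (x & 0x00ffffff) & 0xffffc00f == x & 0x00ffc00f.  */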
/* Return true if val is an immediate that can be loaded into a
   register in a single instruction.  */
bool
aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
{
  scalar_int_mode int_mode;
  if (!is_a <scalar_int_mode> (mode, &int_mode))
    return false;

  if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
    return 1;
  return aarch64_bitmask_imm (val, int_mode);
}
static bool
aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
{
  rtx base, offset;

  if (GET_CODE (x) == HIGH)
    return true;

  /* There's no way to calculate VL-based values using relocations.  */
  subrtx_iterator::array_type array;
  FOR_EACH_SUBRTX (iter, array, x, ALL)
    if (GET_CODE (*iter) == CONST_POLY_INT)
      return true;

  split_const (x, &base, &offset);
  if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
    {
      if (aarch64_classify_symbol (base, INTVAL (offset))
	  != SYMBOL_FORCE_TO_MEM)
	return true;
      else
	/* Avoid generating a 64-bit relocation in ILP32; leave
	   to aarch64_expand_mov_immediate to handle it properly.  */
	return mode != ptr_mode;
    }

  return aarch64_tls_referenced_p (x);
}
/* Implement TARGET_CASE_VALUES_THRESHOLD.
   The expansion for a table switch is quite expensive due to the number
   of instructions, the table lookup and hard to predict indirect jump.
   When optimizing for speed, and -O3 enabled, use the per-core tuning if
   set, otherwise use tables for > 16 cases as a tradeoff between size and
   performance.  When optimizing for size, use the default setting.  */

static unsigned int
aarch64_case_values_threshold (void)
{
  /* Use the specified limit for the number of cases before using jump
     tables at higher optimization levels.  */
  if (optimize > 2
      && selected_cpu->tune->max_case_values != 0)
    return selected_cpu->tune->max_case_values;
  else
    return optimize_size ? default_case_values_threshold () : 17;
}
/* Return true if register REGNO is a valid index register.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

bool
aarch64_regno_ok_for_index_p (int regno, bool strict_p)
{
  if (!HARD_REGISTER_NUM_P (regno))
    {
      if (!strict_p)
	return true;

      if (!reg_renumber)
	return false;

      regno = reg_renumber[regno];
    }
  return GP_REGNUM_P (regno);
}
/* Return true if register REGNO is a valid base register for mode MODE.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

bool
aarch64_regno_ok_for_base_p (int regno, bool strict_p)
{
  if (!HARD_REGISTER_NUM_P (regno))
    {
      if (!strict_p)
	return true;

      if (!reg_renumber)
	return false;

      regno = reg_renumber[regno];
    }

  /* The fake registers will be eliminated to either the stack or
     hard frame pointer, both of which are usually valid base registers.
     Reload deals with the cases where the eliminated form isn't valid.  */
  return (GP_REGNUM_P (regno)
	  || regno == SP_REGNUM
	  || regno == FRAME_POINTER_REGNUM
	  || regno == ARG_POINTER_REGNUM);
}
/* Return true if X is a valid base register for mode MODE.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

static bool
aarch64_base_register_rtx_p (rtx x, bool strict_p)
{
  if (!strict_p
      && GET_CODE (x) == SUBREG
      && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
    x = SUBREG_REG (x);

  return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
}
/* Return true if address offset is a valid index.  If it is, fill in INFO
   appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */

static bool
aarch64_classify_index (struct aarch64_address_info *info, rtx x,
			machine_mode mode, bool strict_p)
{
  enum aarch64_address_type type;
  rtx index;
  int shift;

  /* (reg:P) */
  if ((REG_P (x) || GET_CODE (x) == SUBREG)
      && GET_MODE (x) == Pmode)
    {
      type = ADDRESS_REG_REG;
      index = x;
      shift = 0;
    }
  /* (sign_extend:DI (reg:SI)) */
  else if ((GET_CODE (x) == SIGN_EXTEND
	    || GET_CODE (x) == ZERO_EXTEND)
	   && GET_MODE (x) == DImode
	   && GET_MODE (XEXP (x, 0)) == SImode)
    {
      type = (GET_CODE (x) == SIGN_EXTEND)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (x, 0);
      shift = 0;
    }
  /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
  else if (GET_CODE (x) == MULT
	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
	   && GET_MODE (XEXP (x, 0)) == DImode
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = exact_log2 (INTVAL (XEXP (x, 1)));
    }
  /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
  else if (GET_CODE (x) == ASHIFT
	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
	   && GET_MODE (XEXP (x, 0)) == DImode
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = INTVAL (XEXP (x, 1));
    }
  /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
  else if ((GET_CODE (x) == SIGN_EXTRACT
	    || GET_CODE (x) == ZERO_EXTRACT)
	   && GET_MODE (x) == DImode
	   && GET_CODE (XEXP (x, 0)) == MULT
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
	   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
    {
      type = (GET_CODE (x) == SIGN_EXTRACT)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
      if (INTVAL (XEXP (x, 1)) != 32 + shift
	  || INTVAL (XEXP (x, 2)) != 0)
	shift = -1;
    }
  /* (and:DI (mult:DI (reg:DI) (const_int scale))
     (const_int 0xffffffff<<shift)) */
  else if (GET_CODE (x) == AND
	   && GET_MODE (x) == DImode
	   && GET_CODE (XEXP (x, 0)) == MULT
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
      if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
	shift = -1;
    }
  /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
  else if ((GET_CODE (x) == SIGN_EXTRACT
	    || GET_CODE (x) == ZERO_EXTRACT)
	   && GET_MODE (x) == DImode
	   && GET_CODE (XEXP (x, 0)) == ASHIFT
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
	   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
    {
      type = (GET_CODE (x) == SIGN_EXTRACT)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = INTVAL (XEXP (XEXP (x, 0), 1));
      if (INTVAL (XEXP (x, 1)) != 32 + shift
	  || INTVAL (XEXP (x, 2)) != 0)
	shift = -1;
    }
  /* (and:DI (ashift:DI (reg:DI) (const_int shift))
     (const_int 0xffffffff<<shift)) */
  else if (GET_CODE (x) == AND
	   && GET_MODE (x) == DImode
	   && GET_CODE (XEXP (x, 0)) == ASHIFT
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = INTVAL (XEXP (XEXP (x, 0), 1));
      if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
	shift = -1;
    }
  /* (mult:P (reg:P) (const_int scale)) */
  else if (GET_CODE (x) == MULT
	   && GET_MODE (x) == Pmode
	   && GET_MODE (XEXP (x, 0)) == Pmode
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_REG;
      index = XEXP (x, 0);
      shift = exact_log2 (INTVAL (XEXP (x, 1)));
    }
  /* (ashift:P (reg:P) (const_int shift)) */
  else if (GET_CODE (x) == ASHIFT
	   && GET_MODE (x) == Pmode
	   && GET_MODE (XEXP (x, 0)) == Pmode
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_REG;
      index = XEXP (x, 0);
      shift = INTVAL (XEXP (x, 1));
    }
  else
    return false;

  if (!strict_p
      && GET_CODE (index) == SUBREG
      && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
    index = SUBREG_REG (index);

  if (aarch64_sve_data_mode_p (mode))
    {
      if (type != ADDRESS_REG_REG
	  || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
	return false;
    }
  else
    {
      if (shift != 0
	  && !(IN_RANGE (shift, 1, 3)
	       && known_eq (1 << shift, GET_MODE_SIZE (mode))))
	return false;
    }

  if (REG_P (index)
      && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
    {
      info->type = type;
      info->offset = index;
      info->shift = shift;
      return true;
    }

  return false;
}
/* Return true if MODE is one of the modes for which we
   support LDP/STP operations.  */

static bool
aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
{
  return mode == SImode || mode == DImode
	 || mode == SFmode || mode == DFmode
	 || (aarch64_vector_mode_supported_p (mode)
	     && (known_eq (GET_MODE_SIZE (mode), 8)
		 || (known_eq (GET_MODE_SIZE (mode), 16)
		     && (aarch64_tune_params.extra_tuning_flags
			 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
}
/* Return true if REGNO is a virtual pointer register, or an eliminable
   "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
   include stack_pointer or hard_frame_pointer.  */
static bool
virt_or_elim_regno_p (unsigned regno)
{
  return ((regno >= FIRST_VIRTUAL_REGISTER
	   && regno <= LAST_VIRTUAL_POINTER_REGISTER)
	  || regno == FRAME_POINTER_REGNUM
	  || regno == ARG_POINTER_REGNUM);
}
/* Return true if X is a valid address of type TYPE for machine mode MODE.
   If it is, fill in INFO appropriately.  STRICT_P is true if
   REG_OK_STRICT is in effect.  */

bool
aarch64_classify_address (struct aarch64_address_info *info,
			  rtx x, machine_mode mode, bool strict_p,
			  aarch64_addr_query_type type)
{
  enum rtx_code code = GET_CODE (x);
  rtx op0, op1;
  poly_int64 offset;

  HOST_WIDE_INT const_size;

  /* Whether a vector mode is partial doesn't affect address legitimacy.
     Partial vectors like VNx8QImode allow the same indexed addressing
     mode and MUL VL addressing mode as full vectors like VNx16QImode;
     in both cases, MUL VL counts multiples of GET_MODE_SIZE.  */
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  vec_flags &= ~VEC_PARTIAL;

  /* On BE, we use load/store pair for all large int mode load/stores.
     TI/TFmode may also use a load/store pair.  */
  bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
  bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
			    || type == ADDR_QUERY_LDP_STP_N
			    || mode == TImode
			    || mode == TFmode
			    || (BYTES_BIG_ENDIAN && advsimd_struct_p));

  /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
     corresponds to the actual size of the memory being loaded/stored and the
     mode of the corresponding addressing mode is half of that.  */
  if (type == ADDR_QUERY_LDP_STP_N
      && known_eq (GET_MODE_SIZE (mode), 16))
    mode = DFmode;

  bool allow_reg_index_p = (!load_store_pair_p
			    && (known_lt (GET_MODE_SIZE (mode), 16)
				|| vec_flags == VEC_ADVSIMD
				|| vec_flags & VEC_SVE_DATA));

  /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
     [Rn, #offset, MUL VL].  */
  if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
      && (code != REG && code != PLUS))
    return false;

  /* On LE, for AdvSIMD, don't support anything other than POST_INC or
     REG addressing.  */
  if (advsimd_struct_p
      && !BYTES_BIG_ENDIAN
      && (code != POST_INC && code != REG))
    return false;

  gcc_checking_assert (GET_MODE (x) == VOIDmode
		       || SCALAR_INT_MODE_P (GET_MODE (x)));

  switch (code)
    {
    case REG:
    case SUBREG:
      info->type = ADDRESS_REG_IMM;
      info->base = x;
      info->offset = const0_rtx;
      info->const_offset = 0;
      return aarch64_base_register_rtx_p (x, strict_p);

    case PLUS:
      op0 = XEXP (x, 0);
      op1 = XEXP (x, 1);

      if (! strict_p
	  && REG_P (op0)
	  && virt_or_elim_regno_p (REGNO (op0))
	  && poly_int_rtx_p (op1, &offset))
	{
	  info->type = ADDRESS_REG_IMM;
	  info->base = op0;
	  info->offset = op1;
	  info->const_offset = offset;

	  return true;
	}

      if (maybe_ne (GET_MODE_SIZE (mode), 0)
	  && aarch64_base_register_rtx_p (op0, strict_p)
	  && poly_int_rtx_p (op1, &offset))
	{
	  info->type = ADDRESS_REG_IMM;
	  info->base = op0;
	  info->offset = op1;
	  info->const_offset = offset;

	  /* TImode and TFmode values are allowed in both pairs of X
	     registers and individual Q registers.  The available
	     address modes are:
	     X,X: 7-bit signed scaled offset
	     Q:	  9-bit signed offset
	     We conservatively require an offset representable in either mode.
	     When performing the check for pairs of X registers i.e. LDP/STP
	     pass down DImode since that is the natural size of the LDP/STP
	     instruction memory accesses.  */
	  if (mode == TImode || mode == TFmode)
	    return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
		    && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
			|| offset_12bit_unsigned_scaled_p (mode, offset)));

	  /* A 7bit offset check because OImode will emit a ldp/stp
	     instruction (only big endian will get here).
	     For ldp/stp instructions, the offset is scaled for the size of a
	     single element of the pair.  */
	  if (mode == OImode)
	    return aarch64_offset_7bit_signed_scaled_p (TImode, offset);

	  /* Three 9/12 bit offsets checks because CImode will emit three
	     ldr/str instructions (only big endian will get here).  */
	  if (mode == CImode)
	    return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
		    && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
							       offset + 32)
			|| offset_12bit_unsigned_scaled_p (V16QImode,
							   offset + 32)));

	  /* Two 7bit offsets checks because XImode will emit two ldp/stp
	     instructions (only big endian will get here).  */
	  if (mode == XImode)
	    return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
		    && aarch64_offset_7bit_signed_scaled_p (TImode,
							    offset + 32));

	  /* Make "m" use the LD1 offset range for SVE data modes, so
	     that pre-RTL optimizers like ivopts will work to that
	     instead of the wider LDR/STR range.  */
	  if (vec_flags == VEC_SVE_DATA)
	    return (type == ADDR_QUERY_M
		    ? offset_4bit_signed_scaled_p (mode, offset)
		    : offset_9bit_signed_scaled_p (mode, offset));

	  if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
	    {
	      poly_int64 end_offset = (offset
				       + GET_MODE_SIZE (mode)
				       - BYTES_PER_SVE_VECTOR);
	      return (type == ADDR_QUERY_M
		      ? offset_4bit_signed_scaled_p (mode, offset)
		      : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
			 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
							 end_offset)));
	    }

	  if (vec_flags == VEC_SVE_PRED)
	    return offset_9bit_signed_scaled_p (mode, offset);

	  if (load_store_pair_p)
	    return ((known_eq (GET_MODE_SIZE (mode), 4)
		     || known_eq (GET_MODE_SIZE (mode), 8)
		     || known_eq (GET_MODE_SIZE (mode), 16))
		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
	  else
	    return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
		    || offset_12bit_unsigned_scaled_p (mode, offset));
	}

      if (allow_reg_index_p)
	{
	  /* Look for base + (scaled/extended) index register.  */
	  if (aarch64_base_register_rtx_p (op0, strict_p)
	      && aarch64_classify_index (info, op1, mode, strict_p))
	    {
	      info->base = op0;
	      return true;
	    }
	  if (aarch64_base_register_rtx_p (op1, strict_p)
	      && aarch64_classify_index (info, op0, mode, strict_p))
	    {
	      info->base = op1;
	      return true;
	    }
	}

      return false;

    case POST_INC:
    case POST_DEC:
    case PRE_INC:
    case PRE_DEC:
      info->type = ADDRESS_REG_WB;
      info->base = XEXP (x, 0);
      info->offset = NULL_RTX;
      return aarch64_base_register_rtx_p (info->base, strict_p);

    case POST_MODIFY:
    case PRE_MODIFY:
      info->type = ADDRESS_REG_WB;
      info->base = XEXP (x, 0);
      if (GET_CODE (XEXP (x, 1)) == PLUS
	  && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
	  && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
	  && aarch64_base_register_rtx_p (info->base, strict_p))
	{
	  info->offset = XEXP (XEXP (x, 1), 1);
	  info->const_offset = offset;

	  /* TImode and TFmode values are allowed in both pairs of X
	     registers and individual Q registers.  The available
	     address modes are:
	     X,X: 7-bit signed scaled offset
	     Q:	  9-bit signed offset
	     We conservatively require an offset representable in
	     either mode.  */
	  if (mode == TImode || mode == TFmode)
	    return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
		    && aarch64_offset_9bit_signed_unscaled_p (mode, offset));

	  if (load_store_pair_p)
	    return ((known_eq (GET_MODE_SIZE (mode), 4)
		     || known_eq (GET_MODE_SIZE (mode), 8)
		     || known_eq (GET_MODE_SIZE (mode), 16))
		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
	  else
	    return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
	}
      return false;

    case CONST:
    case SYMBOL_REF:
    case LABEL_REF:
      /* load literal: pc-relative constant pool entry.  Only supported
	 for SI mode or larger.  */
      info->type = ADDRESS_SYMBOLIC;

      if (!load_store_pair_p
	  && GET_MODE_SIZE (mode).is_constant (&const_size)
	  && const_size >= 4)
	{
	  rtx sym, addend;

	  split_const (x, &sym, &addend);
	  return ((GET_CODE (sym) == LABEL_REF
		   || (GET_CODE (sym) == SYMBOL_REF
		       && CONSTANT_POOL_ADDRESS_P (sym)
		       && aarch64_pcrelative_literal_loads)));
	}
      return false;

    case LO_SUM:
      info->type = ADDRESS_LO_SUM;
      info->base = XEXP (x, 0);
      info->offset = XEXP (x, 1);
      if (allow_reg_index_p
	  && aarch64_base_register_rtx_p (info->base, strict_p))
	{
	  rtx sym, offs;
	  split_const (info->offset, &sym, &offs);
	  if (GET_CODE (sym) == SYMBOL_REF
	      && (aarch64_classify_symbol (sym, INTVAL (offs))
		  == SYMBOL_SMALL_ABSOLUTE))
	    {
	      /* The symbol and offset must be aligned to the access size.  */
	      unsigned int align;

	      if (CONSTANT_POOL_ADDRESS_P (sym))
		align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
	      else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
		{
		  tree exp = SYMBOL_REF_DECL (sym);
		  align = TYPE_ALIGN (TREE_TYPE (exp));
		  align = aarch64_constant_alignment (exp, align);
		}
	      else if (SYMBOL_REF_DECL (sym))
		align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
	      else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
		       && SYMBOL_REF_BLOCK (sym) != NULL)
		align = SYMBOL_REF_BLOCK (sym)->alignment;
	      else
		align = BITS_PER_UNIT;

	      poly_int64 ref_size = GET_MODE_SIZE (mode);
	      if (known_eq (ref_size, 0))
		ref_size = GET_MODE_SIZE (DImode);

	      return (multiple_p (INTVAL (offs), ref_size)
		      && multiple_p (align / BITS_PER_UNIT, ref_size));
	    }
	}
      return false;

    default:
      return false;
    }
}
/* Return true if the address X is valid for a PRFM instruction.
   STRICT_P is true if we should do strict checking with
   aarch64_classify_address.  */

bool
aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
{
  struct aarch64_address_info addr;

  /* PRFM accepts the same addresses as DImode...  */
  bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
  if (!res)
    return false;

  /* ... except writeback forms.  */
  return addr.type != ADDRESS_REG_WB;
}
bool
aarch64_symbolic_address_p (rtx x)
{
  rtx offset;

  split_const (x, &x, &offset);
  return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
}
/* Classify the base of symbolic expression X.  */

enum aarch64_symbol_type
aarch64_classify_symbolic_expression (rtx x)
{
  rtx offset;

  split_const (x, &x, &offset);
  return aarch64_classify_symbol (x, INTVAL (offset));
}
/* Return TRUE if X is a legitimate address for accessing memory in
   mode MODE.  */
static bool
aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
{
  struct aarch64_address_info addr;

  return aarch64_classify_address (&addr, x, mode, strict_p);
}
/* Return TRUE if X is a legitimate address of type TYPE for accessing
   memory in mode MODE.  STRICT_P is true if REG_OK_STRICT is in effect.  */
bool
aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
			      aarch64_addr_query_type type)
{
  struct aarch64_address_info addr;

  return aarch64_classify_address (&addr, x, mode, strict_p, type);
}
/* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT.  */

static bool
aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
					 poly_int64 orig_offset,
					 machine_mode mode)
{
  HOST_WIDE_INT size;
  if (GET_MODE_SIZE (mode).is_constant (&size))
    {
      HOST_WIDE_INT const_offset, second_offset;

      /* A general SVE offset is A * VQ + B.  Remove the A component from
	 coefficient 0 in order to get the constant B.  */
      const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];

      /* Split an out-of-range address displacement into a base and
	 offset.  Use 4KB range for 1- and 2-byte accesses and a 16KB
	 range otherwise to increase opportunities for sharing the base
	 address of different sizes.  Unaligned accesses use the signed
	 9-bit range, TImode/TFmode use the intersection of signed
	 scaled 7-bit and signed 9-bit offset.  */
      if (mode == TImode || mode == TFmode)
	second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
      else if ((const_offset & (size - 1)) != 0)
	second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
      else
	second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);

      if (second_offset == 0 || known_eq (orig_offset, second_offset))
	return false;

      /* Split the offset into second_offset and the rest.  */
      *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
      *offset2 = gen_int_mode (second_offset, Pmode);
      return true;
    }
  else
    {
      /* Get the mode we should use as the basis of the range.  For structure
	 modes this is the mode of one vector.  */
      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
      machine_mode step_mode
	= (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;

      /* Get the "mul vl" multiplier we'd like to use.  */
      HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
      HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
      if (vec_flags & VEC_SVE_DATA)
	/* LDR supports a 9-bit range, but the move patterns for
	   structure modes require all vectors to be in range of the
	   same base.  The simplest way of accommodating that while still
	   promoting reuse of anchor points between different modes is
	   to use an 8-bit range unconditionally.  */
	vnum = ((vnum + 128) & 255) - 128;
      else
	/* Predicates are only handled singly, so we might as well use
	   the full range.  */
	vnum = ((vnum + 256) & 511) - 256;
      if (vnum == 0)
	return false;

      /* Convert the "mul vl" multiplier into a byte offset.  */
      poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
      if (known_eq (second_offset, orig_offset))
	return false;

      /* Split the offset into second_offset and the rest.  */
      *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
      *offset2 = gen_int_mode (second_offset, Pmode);
      return true;
    }
}
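/* For example (illustrative), a byte access at constant offset 0x10010 is
   out of range for a single addressing mode; the code above splits it as
   0x10000 + 0x10, so that 0x10000 can be added to the base register once
   and the 0x10 residual fits in the immediate field of the load or store.  */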
/* Return the binary representation of floating point constant VALUE in
   INTVAL.  If the value cannot be converted, return false without setting
   INTVAL.  The conversion is done in the given MODE.  */
bool
aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
{

  /* We make a general exception for 0.  */
  if (aarch64_float_const_zero_rtx_p (value))
    {
      *intval = 0;
      return true;
    }

  scalar_float_mode mode;
  if (GET_CODE (value) != CONST_DOUBLE
      || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
      || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
      /* Only support up to DF mode.  */
      || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
    return false;

  unsigned HOST_WIDE_INT ival = 0;

  long res[2];
  real_to_target (res,
		  CONST_DOUBLE_REAL_VALUE (value),
		  REAL_MODE_FORMAT (mode));

  if (mode == DFmode)
    {
      int order = BYTES_BIG_ENDIAN ? 1 : 0;
      ival = zext_hwi (res[order], 32);
      ival |= (zext_hwi (res[1 - order], 32) << 32);
    }
  else
    ival = zext_hwi (res[0], 32);

  *intval = ival;
  return true;
}
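/* For instance, the DFmode constant 1.0 has the IEEE double bit pattern
   0x3ff0000000000000; since that is 0x3ff0 << 48 it can be built with a
   single MOVZ and then transferred to an FP register with FMOV, which is
   the kind of sequence the cost check below compares against a literal
   load.  */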
/* Return TRUE if rtx X is an immediate constant that can be moved using a
   single MOV(+MOVK) followed by an FMOV.  */
bool
aarch64_float_const_rtx_p (rtx x)
{
  machine_mode mode = GET_MODE (x);
  if (mode == VOIDmode)
    return false;

  /* Determine whether it's cheaper to write float constants as
     mov/movk pairs over ldr/adrp pairs.  */
  unsigned HOST_WIDE_INT ival;

  if (GET_CODE (x) == CONST_DOUBLE
      && SCALAR_FLOAT_MODE_P (mode)
      && aarch64_reinterpret_float_as_int (x, &ival))
    {
      scalar_int_mode imode = (mode == HFmode
			       ? SImode
			       : int_mode_for_mode (mode).require ());
      int num_instr = aarch64_internal_mov_immediate
			(NULL_RTX, gen_int_mode (ival, imode), false, imode);
      return num_instr < 3;
    }

  return false;
}
/* Return TRUE if rtx X is immediate constant 0.0.  */
bool
aarch64_float_const_zero_rtx_p (rtx x)
{
  if (GET_MODE (x) == VOIDmode)
    return false;

  if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
    return !HONOR_SIGNED_ZEROS (GET_MODE (x));
  return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
}
/* Return TRUE if rtx X is immediate constant that fits in a single
   MOVI immediate operation.  */
bool
aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
{
  machine_mode vmode;
  scalar_int_mode imode;
  unsigned HOST_WIDE_INT ival;

  if (GET_CODE (x) == CONST_DOUBLE
      && SCALAR_FLOAT_MODE_P (mode))
    {
      if (!aarch64_reinterpret_float_as_int (x, &ival))
	return false;

      /* We make a general exception for 0.  */
      if (aarch64_float_const_zero_rtx_p (x))
	return true;

      imode = int_mode_for_mode (mode).require ();
    }
  else if (GET_CODE (x) == CONST_INT
	   && is_a <scalar_int_mode> (mode, &imode))
    ival = INTVAL (x);
  else
    return false;

   /* use a 64 bit mode for everything except for DI/DF mode, where we use
     a 128 bit vector mode.  */
  int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;

  vmode = aarch64_simd_container_mode (imode, width);
  rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);

  return aarch64_simd_valid_immediate (v_op, NULL);
}
/* Return the fixed registers used for condition codes.  */
static bool
aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
{
  *p1 = CC_REGNUM;
  *p2 = INVALID_REGNUM;
  return true;
}
/* This function is used by the call expanders of the machine description.
   RESULT is the register in which the result is returned.  It's NULL for
   "call" and "sibcall".
   MEM is the location of the function call.
   CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
   SIBCALL indicates whether this function call is a normal call or a
   sibling call; a different insn pattern is generated accordingly.  */

void
aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
{
  rtx call, callee, tmp;
  rtvec vec;
  machine_mode mode;

  gcc_assert (MEM_P (mem));
  callee = XEXP (mem, 0);
  mode = GET_MODE (callee);
  gcc_assert (mode == Pmode);

  /* Decide if we should generate indirect calls by loading the
     address of the callee into a register before performing
     the branch-and-link.  */
  if (SYMBOL_REF_P (callee)
      ? (aarch64_is_long_call_p (callee)
	 || aarch64_is_noplt_call_p (callee))
      : !REG_P (callee))
    XEXP (mem, 0) = force_reg (mode, callee);

  call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);

  if (result != NULL_RTX)
    call = gen_rtx_SET (result, call);

  if (sibcall)
    tmp = ret_rtx;
  else
    tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));

  gcc_assert (CONST_INT_P (callee_abi));
  callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
			       UNSPEC_CALLEE_ABI);

  vec = gen_rtvec (3, call, callee_abi, tmp);
  call = gen_rtx_PARALLEL (VOIDmode, vec);

  aarch64_emit_call_insn (call);
}
/* Emit call insn with PAT and do aarch64-specific handling.  */

void
aarch64_emit_call_insn (rtx pat)
{
  rtx insn = emit_call_insn (pat);

  rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
  clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
  clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
}
machine_mode
aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
{
  machine_mode mode_x = GET_MODE (x);
  rtx_code code_x = GET_CODE (x);

  /* All floating point compares return CCFP if it is an equality
     comparison, and CCFPE otherwise.  */
  if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
    {
      switch (code)
	{
	case EQ:
	case NE:
	case UNORDERED:
	case ORDERED:
	case UNLT:
	case UNLE:
	case UNGT:
	case UNGE:
	case UNEQ:
	  return CCFPmode;

	case LT:
	case LE:
	case GT:
	case GE:
	case LTGT:
	  return CCFPEmode;

	default:
	  gcc_unreachable ();
	}
    }

  /* Equality comparisons of short modes against zero can be performed
     using the TST instruction with the appropriate bitmask.  */
  if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
      && (code == EQ || code == NE)
      && (mode_x == HImode || mode_x == QImode))
    return CC_NZmode;

  /* Similarly, comparisons of zero_extends from shorter modes can
     be performed using an ANDS with an immediate mask.  */
  if (y == const0_rtx && code_x == ZERO_EXTEND
      && (mode_x == SImode || mode_x == DImode)
      && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
      && (code == EQ || code == NE))
    return CC_NZmode;

  if ((mode_x == SImode || mode_x == DImode)
      && y == const0_rtx
      && (code == EQ || code == NE || code == LT || code == GE)
      && (code_x == PLUS || code_x == MINUS || code_x == AND
	  || code_x == NEG
	  || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
	      && CONST_INT_P (XEXP (x, 2)))))
    return CC_NZmode;

  /* A compare with a shifted operand.  Because of canonicalization,
     the comparison will have to be swapped when we emit the assembly
     code.  */
  if ((mode_x == SImode || mode_x == DImode)
      && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
      && (code_x == ASHIFT || code_x == ASHIFTRT
	  || code_x == LSHIFTRT
	  || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
    return CC_SWPmode;

  /* Similarly for a negated operand, but we can only do this for
     equalities.  */
  if ((mode_x == SImode || mode_x == DImode)
      && (REG_P (y) || GET_CODE (y) == SUBREG)
      && (code == EQ || code == NE)
      && GET_CODE (x) == NEG)
    return CC_Zmode;

  /* A test for unsigned overflow from an addition.  */
  if ((mode_x == DImode || mode_x == TImode)
      && (code == LTU || code == GEU)
      && code_x == PLUS
      && rtx_equal_p (XEXP (x, 0), y))
    return CC_Cmode;

  /* A test for unsigned overflow from an add with carry.  */
  if ((mode_x == DImode || mode_x == TImode)
      && (code == LTU || code == GEU)
      && code_x == PLUS
      && CONST_SCALAR_INT_P (y)
      && (rtx_mode_t (y, mode_x)
	  == (wi::shwi (1, mode_x)
	      << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
    return CC_ADCmode;

  /* A test for signed overflow.  */
  if ((mode_x == DImode || mode_x == TImode)
      && code == NE
      && code_x == PLUS
      && GET_CODE (y) == SIGN_EXTEND)
    return CC_Vmode;

  /* For everything else, return CCmode.  */
  return CCmode;
}
, enum rtx_code
);
9552 aarch64_get_condition_code (rtx x
)
9554 machine_mode mode
= GET_MODE (XEXP (x
, 0));
9555 enum rtx_code comp_code
= GET_CODE (x
);
9557 if (GET_MODE_CLASS (mode
) != MODE_CC
)
9558 mode
= SELECT_CC_MODE (comp_code
, XEXP (x
, 0), XEXP (x
, 1));
9559 return aarch64_get_condition_code_1 (mode
, comp_code
);
9563 aarch64_get_condition_code_1 (machine_mode mode
, enum rtx_code comp_code
)
9571 case GE
: return AARCH64_GE
;
9572 case GT
: return AARCH64_GT
;
9573 case LE
: return AARCH64_LS
;
9574 case LT
: return AARCH64_MI
;
9575 case NE
: return AARCH64_NE
;
9576 case EQ
: return AARCH64_EQ
;
9577 case ORDERED
: return AARCH64_VC
;
9578 case UNORDERED
: return AARCH64_VS
;
9579 case UNLT
: return AARCH64_LT
;
9580 case UNLE
: return AARCH64_LE
;
9581 case UNGT
: return AARCH64_HI
;
9582 case UNGE
: return AARCH64_PL
;
9590 case NE
: return AARCH64_NE
;
9591 case EQ
: return AARCH64_EQ
;
9592 case GE
: return AARCH64_GE
;
9593 case GT
: return AARCH64_GT
;
9594 case LE
: return AARCH64_LE
;
9595 case LT
: return AARCH64_LT
;
9596 case GEU
: return AARCH64_CS
;
9597 case GTU
: return AARCH64_HI
;
9598 case LEU
: return AARCH64_LS
;
9599 case LTU
: return AARCH64_CC
;
9607 case NE
: return AARCH64_NE
;
9608 case EQ
: return AARCH64_EQ
;
9609 case GE
: return AARCH64_LE
;
9610 case GT
: return AARCH64_LT
;
9611 case LE
: return AARCH64_GE
;
9612 case LT
: return AARCH64_GT
;
9613 case GEU
: return AARCH64_LS
;
9614 case GTU
: return AARCH64_CC
;
9615 case LEU
: return AARCH64_CS
;
9616 case LTU
: return AARCH64_HI
;
9624 case NE
: return AARCH64_NE
; /* = any */
9625 case EQ
: return AARCH64_EQ
; /* = none */
9626 case GE
: return AARCH64_PL
; /* = nfrst */
9627 case LT
: return AARCH64_MI
; /* = first */
9628 case GEU
: return AARCH64_CS
; /* = nlast */
9629 case GTU
: return AARCH64_HI
; /* = pmore */
9630 case LEU
: return AARCH64_LS
; /* = plast */
9631 case LTU
: return AARCH64_CC
; /* = last */
9639 case NE
: return AARCH64_NE
;
9640 case EQ
: return AARCH64_EQ
;
9641 case GE
: return AARCH64_PL
;
9642 case LT
: return AARCH64_MI
;
9650 case NE
: return AARCH64_NE
;
9651 case EQ
: return AARCH64_EQ
;
9659 case LTU
: return AARCH64_CS
;
9660 case GEU
: return AARCH64_CC
;
9668 case GEU
: return AARCH64_CS
;
9669 case LTU
: return AARCH64_CC
;
9677 case NE
: return AARCH64_VS
;
9678 case EQ
: return AARCH64_VC
;
9691 aarch64_const_vec_all_same_in_range_p (rtx x
,
9692 HOST_WIDE_INT minval
,
9693 HOST_WIDE_INT maxval
)
9696 return (const_vec_duplicate_p (x
, &elt
)
9697 && CONST_INT_P (elt
)
9698 && IN_RANGE (INTVAL (elt
), minval
, maxval
));
9702 aarch64_const_vec_all_same_int_p (rtx x
, HOST_WIDE_INT val
)
9704 return aarch64_const_vec_all_same_in_range_p (x
, val
, val
);
9707 /* Return true if VEC is a constant in which every element is in the range
9708 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
9711 aarch64_const_vec_all_in_range_p (rtx vec
,
9712 HOST_WIDE_INT minval
,
9713 HOST_WIDE_INT maxval
)
9715 if (GET_CODE (vec
) != CONST_VECTOR
9716 || GET_MODE_CLASS (GET_MODE (vec
)) != MODE_VECTOR_INT
)
9720 if (!CONST_VECTOR_STEPPED_P (vec
))
9721 nunits
= const_vector_encoded_nelts (vec
);
9722 else if (!CONST_VECTOR_NUNITS (vec
).is_constant (&nunits
))
9725 for (int i
= 0; i
< nunits
; i
++)
9727 rtx vec_elem
= CONST_VECTOR_ELT (vec
, i
);
9728 if (!CONST_INT_P (vec_elem
)
9729 || !IN_RANGE (INTVAL (vec_elem
), minval
, maxval
))
#define AARCH64_CC_V 1
#define AARCH64_CC_C (1 << 1)
#define AARCH64_CC_Z (1 << 2)
#define AARCH64_CC_N (1 << 3)

/* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
static const int aarch64_nzcv_codes[] =
{
  0,		/* EQ, Z == 1.  */
  AARCH64_CC_Z,	/* NE, Z == 0.  */
  0,		/* CS, C == 1.  */
  AARCH64_CC_C,	/* CC, C == 0.  */
  0,		/* MI, N == 1.  */
  AARCH64_CC_N,	/* PL, N == 0.  */
  0,		/* VS, V == 1.  */
  AARCH64_CC_V,	/* VC, V == 0.  */
  0,		/* HI, C == 1 && Z == 0.  */
  AARCH64_CC_C,	/* LS, !(C == 1 && Z == 0).  */
  AARCH64_CC_V,	/* GE, N == V.  */
  0,		/* LT, N != V.  */
  AARCH64_CC_Z,	/* GT, Z == 0 && N == V.  */
  0,		/* LE, !(Z == 0 && N == V).  */
  0,		/* AL, Any.  */
  0		/* NV, Any.  */
};
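/* Illustrative example: aarch64_nzcv_codes[AARCH64_NE] is AARCH64_CC_Z,
   i.e. 4, since NE holds exactly when Z == 0; this is the value printed
   by the 'k' operand code below for a conditional compare.  */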
9762 /* Print floating-point vector immediate operand X to F, negating it
9763 first if NEGATE is true. Return true on success, false if it isn't
9764 a constant we can handle. */
9767 aarch64_print_vector_float_operand (FILE *f
, rtx x
, bool negate
)
9771 if (!const_vec_duplicate_p (x
, &elt
))
9774 REAL_VALUE_TYPE r
= *CONST_DOUBLE_REAL_VALUE (elt
);
9776 r
= real_value_negate (&r
);
9778 /* Handle the SVE single-bit immediates specially, since they have a
9779 fixed form in the assembly syntax. */
9780 if (real_equal (&r
, &dconst0
))
9781 asm_fprintf (f
, "0.0");
9782 else if (real_equal (&r
, &dconst2
))
9783 asm_fprintf (f
, "2.0");
9784 else if (real_equal (&r
, &dconst1
))
9785 asm_fprintf (f
, "1.0");
9786 else if (real_equal (&r
, &dconsthalf
))
9787 asm_fprintf (f
, "0.5");
9790 const int buf_size
= 20;
9791 char float_buf
[buf_size
] = {'\0'};
9792 real_to_decimal_for_mode (float_buf
, &r
, buf_size
, buf_size
,
9794 asm_fprintf (f
, "%s", float_buf
);
/* Return the equivalent letter for size.  */
static char
sizetochar (int size)
{
  switch (size)
    {
    case 64: return 'd';
    case 32: return 's';
    case 16: return 'h';
    case 8 : return 'b';
    default: gcc_unreachable ();
    }
}
/* Print operand X to file F in a target specific manner according to CODE.
   The acceptable formatting commands given by CODE are:
     'c':		An integer or symbol address without a preceding #
			sign.
     'C':		Take the duplicated element in a vector constant
			and print it in hex.
     'D':		Take the duplicated element in a vector constant
			and print it as an unsigned integer, in decimal.
     'e':		Print the sign/zero-extend size as a character 8->b,
			16->h, 32->w.  Can also be used for masks:
			0xff->b, 0xffff->h, 0xffffffff->w.
     'I':		If the operand is a duplicated vector constant,
			replace it with the duplicated scalar.  If the
			operand is then a floating-point constant, replace
			it with the integer bit representation.  Print the
			transformed constant as a signed decimal number.
     'p':		Prints N such that 2^N == X (X must be power of 2 and
			const_int).
     'P':		Print the number of non-zero bits in X (a const_int).
     'H':		Print the higher numbered register of a pair (TImode)
			register.
     'm':		Print a condition (eq, ne, etc).
     'M':		Same as 'm', but invert condition.
     'N':		Take the duplicated element in a vector constant
			and print the negative of it in decimal.
     'b/h/s/d/q':	Print a scalar FP/SIMD register name.
     'S/T/U/V':		Print a FP/SIMD register name for a register list.
			The register printed is the FP/SIMD register name
			of X + 0/1/2/3 for S/T/U/V.
     'R':		Print a scalar Integer/FP/SIMD register name + 1.
     'X':		Print bottom 16 bits of integer constant in hex.
     'w/x':		Print a general register name or the zero register
			(32-bit or 64-bit).
     '0':		Print a normal operand, if it's a general register,
			then we assume DImode.
     'k':		Print NZCV for conditional compare instructions.
     'A':		Output address constant representing the first
			argument of X, specifying a relocation offset
			if appropriate.
     'L':		Output constant address specified by X
			with a relocation offset if appropriate.
     'G':		Prints address of X, specifying a PC relative
			relocation mode if appropriate.
     'y':		Output address of LDP or STP - this is used for
			some LDP/STPs which don't use a PARALLEL in their
			pattern (so the mode needs to be adjusted).
     'z':		Output address of a typical LDP or STP.  */
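/* A few illustrative expansions (assuming operand 0 is the register x3 and
   operand 1 is the integer constant 16): "%w0" prints "w3", "%x0" prints
   "x3", "%p1" prints "4" (2^4 == 16), "%P1" prints "1" (one set bit), and
   "%X1" prints "0x10".  */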
9863 aarch64_print_operand (FILE *f
, rtx x
, int code
)
9869 switch (GET_CODE (x
))
9872 fprintf (f
, HOST_WIDE_INT_PRINT_DEC
, INTVAL (x
));
9876 output_addr_const (f
, x
);
9880 if (GET_CODE (XEXP (x
, 0)) == PLUS
9881 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
)
9883 output_addr_const (f
, x
);
9889 output_operand_lossage ("unsupported operand for code '%c'", code
);
9895 x
= unwrap_const_vec_duplicate (x
);
9896 if (!CONST_INT_P (x
))
9898 output_operand_lossage ("invalid operand for '%%%c'", code
);
9902 HOST_WIDE_INT val
= INTVAL (x
);
9903 if ((val
& ~7) == 8 || val
== 0xff)
9905 else if ((val
& ~7) == 16 || val
== 0xffff)
9907 else if ((val
& ~7) == 32 || val
== 0xffffffff)
9911 output_operand_lossage ("invalid operand for '%%%c'", code
);
9921 if (!CONST_INT_P (x
) || (n
= exact_log2 (INTVAL (x
))) < 0)
9923 output_operand_lossage ("invalid operand for '%%%c'", code
);
9927 asm_fprintf (f
, "%d", n
);
9932 if (!CONST_INT_P (x
))
9934 output_operand_lossage ("invalid operand for '%%%c'", code
);
9938 asm_fprintf (f
, "%u", popcount_hwi (INTVAL (x
)));
9942 if (x
== const0_rtx
)
9944 asm_fprintf (f
, "xzr");
9948 if (!REG_P (x
) || !GP_REGNUM_P (REGNO (x
) + 1))
9950 output_operand_lossage ("invalid operand for '%%%c'", code
);
9954 asm_fprintf (f
, "%s", reg_names
[REGNO (x
) + 1]);
9959 x
= aarch64_bit_representation (unwrap_const_vec_duplicate (x
));
9960 if (CONST_INT_P (x
))
9961 asm_fprintf (f
, "%wd", INTVAL (x
));
9964 output_operand_lossage ("invalid operand for '%%%c'", code
);
9974 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
9975 if (x
== const_true_rtx
)
9982 if (!COMPARISON_P (x
))
9984 output_operand_lossage ("invalid operand for '%%%c'", code
);
9988 cond_code
= aarch64_get_condition_code (x
);
9989 gcc_assert (cond_code
>= 0);
9991 cond_code
= AARCH64_INVERSE_CONDITION_CODE (cond_code
);
9992 if (GET_MODE (XEXP (x
, 0)) == CC_NZCmode
)
9993 fputs (aarch64_sve_condition_codes
[cond_code
], f
);
9995 fputs (aarch64_condition_codes
[cond_code
], f
);
10000 if (!const_vec_duplicate_p (x
, &elt
))
10002 output_operand_lossage ("invalid vector constant");
10006 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
10007 asm_fprintf (f
, "%wd", -INTVAL (elt
));
10008 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
10009 && aarch64_print_vector_float_operand (f
, x
, true))
10013 output_operand_lossage ("invalid vector constant");
10023 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
10025 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
10028 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - V0_REGNUM
);
10035 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
10037 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
10040 asm_fprintf (f
, "%c%d",
10041 aarch64_sve_data_mode_p (GET_MODE (x
)) ? 'z' : 'v',
10042 REGNO (x
) - V0_REGNUM
+ (code
- 'S'));
10046 if (REG_P (x
) && FP_REGNUM_P (REGNO (x
)))
10047 asm_fprintf (f
, "q%d", REGNO (x
) - V0_REGNUM
+ 1);
10048 else if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
10049 asm_fprintf (f
, "x%d", REGNO (x
) - R0_REGNUM
+ 1);
10051 output_operand_lossage ("incompatible register operand for '%%%c'",
10056 if (!CONST_INT_P (x
))
10058 output_operand_lossage ("invalid operand for '%%%c'", code
);
10061 asm_fprintf (f
, "0x%wx", UINTVAL (x
) & 0xffff);
10066 /* Print a replicated constant in hex. */
10067 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
10069 output_operand_lossage ("invalid operand for '%%%c'", code
);
10072 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
10073 asm_fprintf (f
, "0x%wx", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
10079 /* Print a replicated constant in decimal, treating it as
10081 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
10083 output_operand_lossage ("invalid operand for '%%%c'", code
);
10086 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
10087 asm_fprintf (f
, "%wd", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
10093 if (x
== const0_rtx
10094 || (CONST_DOUBLE_P (x
) && aarch64_float_const_zero_rtx_p (x
)))
10096 asm_fprintf (f
, "%czr", code
);
10100 if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
10102 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - R0_REGNUM
);
10106 if (REG_P (x
) && REGNO (x
) == SP_REGNUM
)
10108 asm_fprintf (f
, "%ssp", code
== 'w' ? "w" : "");
10117 output_operand_lossage ("missing operand");
10121 switch (GET_CODE (x
))
10124 if (aarch64_sve_data_mode_p (GET_MODE (x
)))
10126 if (REG_NREGS (x
) == 1)
10127 asm_fprintf (f
, "z%d", REGNO (x
) - V0_REGNUM
);
10131 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x
)));
10132 asm_fprintf (f
, "{z%d.%c - z%d.%c}",
10133 REGNO (x
) - V0_REGNUM
, suffix
,
10134 END_REGNO (x
) - V0_REGNUM
- 1, suffix
);
10138 asm_fprintf (f
, "%s", reg_names
[REGNO (x
)]);
10142 output_address (GET_MODE (x
), XEXP (x
, 0));
10147 output_addr_const (asm_out_file
, x
);
10151 asm_fprintf (f
, "%wd", INTVAL (x
));
10155 if (!VECTOR_MODE_P (GET_MODE (x
)))
10157 output_addr_const (asm_out_file
, x
);
10163 if (!const_vec_duplicate_p (x
, &elt
))
10165 output_operand_lossage ("invalid vector constant");
10169 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
10170 asm_fprintf (f
, "%wd", INTVAL (elt
));
10171 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
10172 && aarch64_print_vector_float_operand (f
, x
, false))
10176 output_operand_lossage ("invalid vector constant");
10182 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
10183 be getting CONST_DOUBLEs holding integers. */
10184 gcc_assert (GET_MODE (x
) != VOIDmode
);
10185 if (aarch64_float_const_zero_rtx_p (x
))
10190 else if (aarch64_float_const_representable_p (x
))
10192 #define buf_size 20
10193 char float_buf
[buf_size
] = {'\0'};
10194 real_to_decimal_for_mode (float_buf
,
10195 CONST_DOUBLE_REAL_VALUE (x
),
10196 buf_size
, buf_size
,
10198 asm_fprintf (asm_out_file
, "%s", float_buf
);
10202 output_operand_lossage ("invalid constant");
10205 output_operand_lossage ("invalid operand");
10211 if (GET_CODE (x
) == HIGH
)
10214 switch (aarch64_classify_symbolic_expression (x
))
10216 case SYMBOL_SMALL_GOT_4G
:
10217 asm_fprintf (asm_out_file
, ":got:");
10220 case SYMBOL_SMALL_TLSGD
:
10221 asm_fprintf (asm_out_file
, ":tlsgd:");
10224 case SYMBOL_SMALL_TLSDESC
:
10225 asm_fprintf (asm_out_file
, ":tlsdesc:");
10228 case SYMBOL_SMALL_TLSIE
:
10229 asm_fprintf (asm_out_file
, ":gottprel:");
10232 case SYMBOL_TLSLE24
:
10233 asm_fprintf (asm_out_file
, ":tprel:");
10236 case SYMBOL_TINY_GOT
:
10237 gcc_unreachable ();
10243 output_addr_const (asm_out_file
, x
);
10247 switch (aarch64_classify_symbolic_expression (x
))
10249 case SYMBOL_SMALL_GOT_4G
:
10250 asm_fprintf (asm_out_file
, ":lo12:");
10253 case SYMBOL_SMALL_TLSGD
:
10254 asm_fprintf (asm_out_file
, ":tlsgd_lo12:");
10257 case SYMBOL_SMALL_TLSDESC
:
10258 asm_fprintf (asm_out_file
, ":tlsdesc_lo12:");
10261 case SYMBOL_SMALL_TLSIE
:
10262 asm_fprintf (asm_out_file
, ":gottprel_lo12:");
10265 case SYMBOL_TLSLE12
:
10266 asm_fprintf (asm_out_file
, ":tprel_lo12:");
10269 case SYMBOL_TLSLE24
:
10270 asm_fprintf (asm_out_file
, ":tprel_lo12_nc:");
10273 case SYMBOL_TINY_GOT
:
10274 asm_fprintf (asm_out_file
, ":got:");
10277 case SYMBOL_TINY_TLSIE
:
10278 asm_fprintf (asm_out_file
, ":gottprel:");
10284 output_addr_const (asm_out_file
, x
);
10288 switch (aarch64_classify_symbolic_expression (x
))
10290 case SYMBOL_TLSLE24
:
10291 asm_fprintf (asm_out_file
, ":tprel_hi12:");
10296 output_addr_const (asm_out_file
, x
);
10301 HOST_WIDE_INT cond_code
;
10303 if (!CONST_INT_P (x
))
10305 output_operand_lossage ("invalid operand for '%%%c'", code
);
10309 cond_code
= INTVAL (x
);
10310 gcc_assert (cond_code
>= 0 && cond_code
<= AARCH64_NV
);
10311 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
]);
10318 machine_mode mode
= GET_MODE (x
);
10320 if (GET_CODE (x
) != MEM
10321 || (code
== 'y' && maybe_ne (GET_MODE_SIZE (mode
), 16)))
10323 output_operand_lossage ("invalid operand for '%%%c'", code
);
10327 if (!aarch64_print_address_internal (f
, mode
, XEXP (x
, 0),
10329 ? ADDR_QUERY_LDP_STP_N
10330 : ADDR_QUERY_LDP_STP
))
10331 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
10336 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
/* Print address 'x' of a memory access with mode 'mode'.
   'type' is the aarch64_addr_query_type context required by
   aarch64_classify_address: e.g. ADDR_QUERY_LDP_STP for an LDP/STP operand
   rather than a normal memory access.  */
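/* Some illustrative outputs: a plain base register prints as "[x0]", a base
   plus immediate as "[x1, 16]", a sign-extended index register with a shift
   as "[x2, w3, sxtw 2]", and a pre-modify writeback form as "[sp, 16]!".  */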
10345 aarch64_print_address_internal (FILE *f
, machine_mode mode
, rtx x
,
10346 aarch64_addr_query_type type
)
10348 struct aarch64_address_info addr
;
10349 unsigned int size
, vec_flags
;
10351 /* Check all addresses are Pmode - including ILP32. */
10352 if (GET_MODE (x
) != Pmode
10353 && (!CONST_INT_P (x
)
10354 || trunc_int_for_mode (INTVAL (x
), Pmode
) != INTVAL (x
)))
10356 output_operand_lossage ("invalid address mode");
10360 if (aarch64_classify_address (&addr
, x
, mode
, true, type
))
10363 case ADDRESS_REG_IMM
:
10364 if (known_eq (addr
.const_offset
, 0))
10366 asm_fprintf (f
, "[%s]", reg_names
[REGNO (addr
.base
)]);
10370 vec_flags
= aarch64_classify_vector_mode (mode
);
10371 if (vec_flags
& VEC_ANY_SVE
)
10374 = exact_div (addr
.const_offset
,
10375 aarch64_vl_bytes (mode
, vec_flags
)).to_constant ();
10376 asm_fprintf (f
, "[%s, #%wd, mul vl]",
10377 reg_names
[REGNO (addr
.base
)], vnum
);
10381 asm_fprintf (f
, "[%s, %wd]", reg_names
[REGNO (addr
.base
)],
10382 INTVAL (addr
.offset
));
10385 case ADDRESS_REG_REG
:
10386 if (addr
.shift
== 0)
10387 asm_fprintf (f
, "[%s, %s]", reg_names
[REGNO (addr
.base
)],
10388 reg_names
[REGNO (addr
.offset
)]);
10390 asm_fprintf (f
, "[%s, %s, lsl %u]", reg_names
[REGNO (addr
.base
)],
10391 reg_names
[REGNO (addr
.offset
)], addr
.shift
);
10394 case ADDRESS_REG_UXTW
:
10395 if (addr
.shift
== 0)
10396 asm_fprintf (f
, "[%s, w%d, uxtw]", reg_names
[REGNO (addr
.base
)],
10397 REGNO (addr
.offset
) - R0_REGNUM
);
10399 asm_fprintf (f
, "[%s, w%d, uxtw %u]", reg_names
[REGNO (addr
.base
)],
10400 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
10403 case ADDRESS_REG_SXTW
:
10404 if (addr
.shift
== 0)
10405 asm_fprintf (f
, "[%s, w%d, sxtw]", reg_names
[REGNO (addr
.base
)],
10406 REGNO (addr
.offset
) - R0_REGNUM
);
10408 asm_fprintf (f
, "[%s, w%d, sxtw %u]", reg_names
[REGNO (addr
.base
)],
10409 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
10412 case ADDRESS_REG_WB
:
10413 /* Writeback is only supported for fixed-width modes. */
10414 size
= GET_MODE_SIZE (mode
).to_constant ();
10415 switch (GET_CODE (x
))
10418 asm_fprintf (f
, "[%s, %d]!", reg_names
[REGNO (addr
.base
)], size
);
10421 asm_fprintf (f
, "[%s], %d", reg_names
[REGNO (addr
.base
)], size
);
10424 asm_fprintf (f
, "[%s, -%d]!", reg_names
[REGNO (addr
.base
)], size
);
10427 asm_fprintf (f
, "[%s], -%d", reg_names
[REGNO (addr
.base
)], size
);
10430 asm_fprintf (f
, "[%s, %wd]!", reg_names
[REGNO (addr
.base
)],
10431 INTVAL (addr
.offset
));
10434 asm_fprintf (f
, "[%s], %wd", reg_names
[REGNO (addr
.base
)],
10435 INTVAL (addr
.offset
));
10442 case ADDRESS_LO_SUM
:
10443 asm_fprintf (f
, "[%s, #:lo12:", reg_names
[REGNO (addr
.base
)]);
10444 output_addr_const (f
, addr
.offset
);
10445 asm_fprintf (f
, "]");
10448 case ADDRESS_SYMBOLIC
:
10449 output_addr_const (f
, x
);
10456 /* Print address 'x' of a memory access with mode 'mode'. */
10458 aarch64_print_operand_address (FILE *f
, machine_mode mode
, rtx x
)
10460 if (!aarch64_print_address_internal (f
, mode
, x
, ADDR_QUERY_ANY
))
10461 output_addr_const (f
, x
);
10465 aarch64_label_mentioned_p (rtx x
)
10470 if (GET_CODE (x
) == LABEL_REF
)
10473 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
10474 referencing instruction, but they are constant offsets, not
10476 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
10479 fmt
= GET_RTX_FORMAT (GET_CODE (x
));
10480 for (i
= GET_RTX_LENGTH (GET_CODE (x
)) - 1; i
>= 0; i
--)
10486 for (j
= XVECLEN (x
, i
) - 1; j
>= 0; j
--)
10487 if (aarch64_label_mentioned_p (XVECEXP (x
, i
, j
)))
10490 else if (fmt
[i
] == 'e' && aarch64_label_mentioned_p (XEXP (x
, i
)))
10497 /* Implement REGNO_REG_CLASS. */
10500 aarch64_regno_regclass (unsigned regno
)
10502 if (GP_REGNUM_P (regno
))
10503 return GENERAL_REGS
;
10505 if (regno
== SP_REGNUM
)
10508 if (regno
== FRAME_POINTER_REGNUM
10509 || regno
== ARG_POINTER_REGNUM
)
10510 return POINTER_REGS
;
10512 if (FP_REGNUM_P (regno
))
10513 return (FP_LO8_REGNUM_P (regno
) ? FP_LO8_REGS
10514 : FP_LO_REGNUM_P (regno
) ? FP_LO_REGS
: FP_REGS
);
10516 if (PR_REGNUM_P (regno
))
10517 return PR_LO_REGNUM_P (regno
) ? PR_LO_REGS
: PR_HI_REGS
;
10519 if (regno
== FFR_REGNUM
|| regno
== FFRT_REGNUM
)
/* OFFSET is an address offset for mode MODE, which has SIZE bytes.
   If OFFSET is out of range, return an offset of an anchor point
   that is in range.  Return 0 otherwise.  */

static HOST_WIDE_INT
aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
		       machine_mode mode)
{
  /* Does it look like we'll need a 16-byte load/store-pair operation?  */
  if (size > 16)
    return (offset + 0x400) & ~0x7f0;

  /* For offsets that aren't a multiple of the access size, the limit is
     -256...255.  */
  if (offset & (size - 1))
    {
      /* BLKmode typically uses LDP of X-registers.  */
      if (mode == BLKmode)
	return (offset + 512) & ~0x3ff;
      return (offset + 0x100) & ~0x1ff;
    }

  /* Small negative offsets are supported.  */
  if (IN_RANGE (offset, -256, 0))
    return 0;

  if (mode == TImode || mode == TFmode)
    return (offset + 0x100) & ~0x1ff;

  /* Use 12-bit offset by access size.  */
  return offset & (~0xfff * size);
}
10559 aarch64_legitimize_address (rtx x
, rtx
/* orig_x */, machine_mode mode
)
10561 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
10562 where mask is selected by alignment and size of the offset.
10563 We try to pick as large a range for the offset as possible to
10564 maximize the chance of a CSE. However, for aligned addresses
10565 we limit the range to 4k so that structures with different sized
10566 elements are likely to use the same base. We need to be careful
10567 not to split a CONST for some forms of address expression, otherwise
10568 it will generate sub-optimal code. */
10570 if (GET_CODE (x
) == PLUS
&& CONST_INT_P (XEXP (x
, 1)))
10572 rtx base
= XEXP (x
, 0);
10573 rtx offset_rtx
= XEXP (x
, 1);
10574 HOST_WIDE_INT offset
= INTVAL (offset_rtx
);
10576 if (GET_CODE (base
) == PLUS
)
10578 rtx op0
= XEXP (base
, 0);
10579 rtx op1
= XEXP (base
, 1);
10581 /* Force any scaling into a temp for CSE. */
10582 op0
= force_reg (Pmode
, op0
);
10583 op1
= force_reg (Pmode
, op1
);
10585 /* Let the pointer register be in op0. */
10586 if (REG_POINTER (op1
))
10587 std::swap (op0
, op1
);
10589 /* If the pointer is virtual or frame related, then we know that
10590 virtual register instantiation or register elimination is going
10591 to apply a second constant. We want the two constants folded
10592 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
10593 if (virt_or_elim_regno_p (REGNO (op0
)))
10595 base
= expand_binop (Pmode
, add_optab
, op0
, offset_rtx
,
10596 NULL_RTX
, true, OPTAB_DIRECT
);
10597 return gen_rtx_PLUS (Pmode
, base
, op1
);
10600 /* Otherwise, in order to encourage CSE (and thence loop strength
10601 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
10602 base
= expand_binop (Pmode
, add_optab
, op0
, op1
,
10603 NULL_RTX
, true, OPTAB_DIRECT
);
10604 x
= gen_rtx_PLUS (Pmode
, base
, offset_rtx
);
10607 HOST_WIDE_INT size
;
10608 if (GET_MODE_SIZE (mode
).is_constant (&size
))
10610 HOST_WIDE_INT base_offset
= aarch64_anchor_offset (offset
, size
,
10612 if (base_offset
!= 0)
10614 base
= plus_constant (Pmode
, base
, base_offset
);
10615 base
= force_operand (base
, NULL_RTX
);
10616 return plus_constant (Pmode
, base
, offset
- base_offset
);
10625 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED
, rtx x
,
10626 reg_class_t rclass
,
10628 secondary_reload_info
*sri
)
10630 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
10631 LDR and STR. See the comment at the head of aarch64-sve.md for
10632 more details about the big-endian handling. */
10633 if (reg_class_subset_p (rclass
, FP_REGS
)
10634 && !((REG_P (x
) && HARD_REGISTER_P (x
))
10635 || aarch64_simd_valid_immediate (x
, NULL
))
10636 && mode
!= VNx16QImode
)
10638 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
10639 if ((vec_flags
& VEC_SVE_DATA
)
10640 && ((vec_flags
& VEC_PARTIAL
) || BYTES_BIG_ENDIAN
))
10642 sri
->icode
= CODE_FOR_aarch64_sve_reload_mem
;
10647 /* If we have to disable direct literal pool loads and stores because the
10648 function is too big, then we need a scratch register. */
10649 if (MEM_P (x
) && GET_CODE (x
) == SYMBOL_REF
&& CONSTANT_POOL_ADDRESS_P (x
)
10650 && (SCALAR_FLOAT_MODE_P (GET_MODE (x
))
10651 || targetm
.vector_mode_supported_p (GET_MODE (x
)))
10652 && !aarch64_pcrelative_literal_loads
)
10654 sri
->icode
= code_for_aarch64_reload_movcp (mode
, DImode
);
10658 /* Without the TARGET_SIMD instructions we cannot move a Q register
10659 to a Q register directly. We need a scratch. */
10660 if (REG_P (x
) && (mode
== TFmode
|| mode
== TImode
) && mode
== GET_MODE (x
)
10661 && FP_REGNUM_P (REGNO (x
)) && !TARGET_SIMD
10662 && reg_class_subset_p (rclass
, FP_REGS
))
10664 sri
->icode
= code_for_aarch64_reload_mov (mode
);
10668 /* A TFmode or TImode memory access should be handled via an FP_REGS
10669 because AArch64 has richer addressing modes for LDR/STR instructions
10670 than LDP/STP instructions. */
10671 if (TARGET_FLOAT
&& rclass
== GENERAL_REGS
10672 && known_eq (GET_MODE_SIZE (mode
), 16) && MEM_P (x
))
10675 if (rclass
== FP_REGS
&& (mode
== TImode
|| mode
== TFmode
) && CONSTANT_P(x
))
10676 return GENERAL_REGS
;
10682 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED
, const int to
)
10684 gcc_assert (from
== ARG_POINTER_REGNUM
|| from
== FRAME_POINTER_REGNUM
);
10686 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
10687 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
10688 if (frame_pointer_needed
)
10689 return to
== HARD_FRAME_POINTER_REGNUM
;
10694 aarch64_initial_elimination_offset (unsigned from
, unsigned to
)
10696 if (to
== HARD_FRAME_POINTER_REGNUM
)
10698 if (from
== ARG_POINTER_REGNUM
)
10699 return cfun
->machine
->frame
.hard_fp_offset
;
10701 if (from
== FRAME_POINTER_REGNUM
)
10702 return cfun
->machine
->frame
.hard_fp_offset
10703 - cfun
->machine
->frame
.locals_offset
;
10706 if (to
== STACK_POINTER_REGNUM
)
10708 if (from
== FRAME_POINTER_REGNUM
)
10709 return cfun
->machine
->frame
.frame_size
10710 - cfun
->machine
->frame
.locals_offset
;
10713 return cfun
->machine
->frame
.frame_size
;
10716 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
10720 aarch64_return_addr (int count
, rtx frame ATTRIBUTE_UNUSED
)
10724 return get_hard_reg_initial_val (Pmode
, LR_REGNUM
);
10729 aarch64_asm_trampoline_template (FILE *f
)
10734 if (aarch64_bti_enabled ())
10736 asm_fprintf (f
, "\thint\t34 // bti c\n");
10743 asm_fprintf (f
, "\tldr\tw%d, .+%d\n", IP1_REGNUM
- R0_REGNUM
, offset1
);
10744 asm_fprintf (f
, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM
- R0_REGNUM
,
10749 asm_fprintf (f
, "\tldr\t%s, .+%d\n", reg_names
[IP1_REGNUM
], offset1
);
10750 asm_fprintf (f
, "\tldr\t%s, .+%d\n", reg_names
[STATIC_CHAIN_REGNUM
],
10753 asm_fprintf (f
, "\tbr\t%s\n", reg_names
[IP1_REGNUM
]);
  /* The trampoline needs an extra padding instruction.  If BTI is
     enabled, the padding instruction is replaced by the BTI
     instruction at the beginning.  */
  if (!aarch64_bti_enabled ())
    assemble_aligned_integer (4, const0_rtx);
10761 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
10762 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
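/* The template emitted above is therefore, roughly: an optional BTI landing
   pad, two literal loads (into IP1 and the static chain register), a "br"
   through IP1, a word of padding when BTI is disabled, and two pointer-sized
   data slots that aarch64_trampoline_init below fills with the target
   address and the static chain value.  */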
10766 aarch64_trampoline_init (rtx m_tramp
, tree fndecl
, rtx chain_value
)
10768 rtx fnaddr
, mem
, a_tramp
;
10769 const int tramp_code_sz
= 16;
10771 /* Don't need to copy the trailing D-words, we fill those in below. */
10772 emit_block_move (m_tramp
, assemble_trampoline_template (),
10773 GEN_INT (tramp_code_sz
), BLOCK_OP_NORMAL
);
10774 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
);
10775 fnaddr
= XEXP (DECL_RTL (fndecl
), 0);
10776 if (GET_MODE (fnaddr
) != ptr_mode
)
10777 fnaddr
= convert_memory_address (ptr_mode
, fnaddr
);
10778 emit_move_insn (mem
, fnaddr
);
10780 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
+ POINTER_BYTES
);
10781 emit_move_insn (mem
, chain_value
);
10783 /* XXX We should really define a "clear_cache" pattern and use
10784 gen_clear_cache(). */
10785 a_tramp
= XEXP (m_tramp
, 0);
10786 emit_library_call (gen_rtx_SYMBOL_REF (Pmode
, "__clear_cache"),
10787 LCT_NORMAL
, VOIDmode
, a_tramp
, ptr_mode
,
10788 plus_constant (ptr_mode
, a_tramp
, TRAMPOLINE_SIZE
),
10792 static unsigned char
10793 aarch64_class_max_nregs (reg_class_t regclass
, machine_mode mode
)
10795 /* ??? Logically we should only need to provide a value when
10796 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
10797 can hold MODE, but at the moment we need to handle all modes.
10798 Just ignore any runtime parts for registers that can't store them. */
10799 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
10800 unsigned int nregs
, vec_flags
;
10803 case TAILCALL_ADDR_REGS
:
10807 case POINTER_AND_FP_REGS
:
10811 vec_flags
= aarch64_classify_vector_mode (mode
);
10812 if ((vec_flags
& VEC_SVE_DATA
)
10813 && constant_multiple_p (GET_MODE_SIZE (mode
),
10814 aarch64_vl_bytes (mode
, vec_flags
), &nregs
))
10816 return (vec_flags
& VEC_ADVSIMD
10817 ? CEIL (lowest_size
, UNITS_PER_VREG
)
10818 : CEIL (lowest_size
, UNITS_PER_WORD
));
10824 case PR_AND_FFR_REGS
:
10833 gcc_unreachable ();
10837 aarch64_preferred_reload_class (rtx x
, reg_class_t regclass
)
10839 if (regclass
== POINTER_REGS
)
10840 return GENERAL_REGS
;
10842 if (regclass
== STACK_REG
)
10845 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x
)), POINTER_REGS
))
  /* Register elimination can result in a request for
     SP+constant->FP_REGS.  We cannot support such operations which
     use SP as source and an FP_REG as destination, so reject out
     of hand in that case.  */
10855 if (! reg_class_subset_p (regclass
, GENERAL_REGS
) && GET_CODE (x
) == PLUS
)
10857 rtx lhs
= XEXP (x
, 0);
10859 /* Look through a possible SUBREG introduced by ILP32. */
10860 if (GET_CODE (lhs
) == SUBREG
)
10861 lhs
= SUBREG_REG (lhs
);
10863 gcc_assert (REG_P (lhs
));
10864 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs
)),
10873 aarch64_asm_output_labelref (FILE* f
, const char *name
)
10875 asm_fprintf (f
, "%U%s", name
);
10879 aarch64_elf_asm_constructor (rtx symbol
, int priority
)
10881 if (priority
== DEFAULT_INIT_PRIORITY
)
10882 default_ctor_section_asm_out_constructor (symbol
, priority
);
10886 /* While priority is known to be in range [0, 65535], so 18 bytes
10887 would be enough, the compiler might not know that. To avoid
10888 -Wformat-truncation false positive, use a larger size. */
10890 snprintf (buf
, sizeof (buf
), ".init_array.%.5u", priority
);
10891 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
10892 switch_to_section (s
);
10893 assemble_align (POINTER_SIZE
);
10894 assemble_aligned_integer (POINTER_BYTES
, symbol
);
10899 aarch64_elf_asm_destructor (rtx symbol
, int priority
)
10901 if (priority
== DEFAULT_INIT_PRIORITY
)
10902 default_dtor_section_asm_out_destructor (symbol
, priority
);
10906 /* While priority is known to be in range [0, 65535], so 18 bytes
10907 would be enough, the compiler might not know that. To avoid
10908 -Wformat-truncation false positive, use a larger size. */
10910 snprintf (buf
, sizeof (buf
), ".fini_array.%.5u", priority
);
10911 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
10912 switch_to_section (s
);
10913 assemble_align (POINTER_SIZE
);
10914 assemble_aligned_integer (POINTER_BYTES
, symbol
);
10919 aarch64_output_casesi (rtx
*operands
)
10923 rtx diff_vec
= PATTERN (NEXT_INSN (as_a
<rtx_insn
*> (operands
[2])));
10925 static const char *const patterns
[4][2] =
10928 "ldrb\t%w3, [%0,%w1,uxtw]",
10929 "add\t%3, %4, %w3, sxtb #2"
10932 "ldrh\t%w3, [%0,%w1,uxtw #1]",
10933 "add\t%3, %4, %w3, sxth #2"
10936 "ldr\t%w3, [%0,%w1,uxtw #2]",
10937 "add\t%3, %4, %w3, sxtw #2"
10939 /* We assume that DImode is only generated when not optimizing and
10940 that we don't really need 64-bit address offsets. That would
10941 imply an object file with 8GB of code in a single function! */
10943 "ldr\t%w3, [%0,%w1,uxtw #2]",
10944 "add\t%3, %4, %w3, sxtw #2"
10948 gcc_assert (GET_CODE (diff_vec
) == ADDR_DIFF_VEC
);
10950 scalar_int_mode mode
= as_a
<scalar_int_mode
> (GET_MODE (diff_vec
));
10951 index
= exact_log2 (GET_MODE_SIZE (mode
));
10953 gcc_assert (index
>= 0 && index
<= 3);
  /* Need to implement table size reduction, by changing the code below.  */
10956 output_asm_insn (patterns
[index
][0], operands
);
10957 ASM_GENERATE_INTERNAL_LABEL (label
, "Lrtx", CODE_LABEL_NUMBER (operands
[2]));
10958 snprintf (buf
, sizeof (buf
),
10959 "adr\t%%4, %s", targetm
.strip_name_encoding (label
));
10960 output_asm_insn (buf
, operands
);
10961 output_asm_insn (patterns
[index
][1], operands
);
10962 output_asm_insn ("br\t%3", operands
);
10963 assemble_label (asm_out_file
, label
);
/* Return size in bits of an arithmetic operand which is shifted/scaled and
   masked such that it is suitable for a UXTB, UXTH, or UXTW extend
   operator.  */

int
aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
{
  if (shift >= 0 && shift <= 3)
    {
      int size;
      for (size = 8; size <= 32; size *= 2)
	{
	  HOST_WIDE_INT bits = ((HOST_WIDE_INT) 1U << size) - 1;
	  if (mask == bits << shift)
	    return size;
	}
    }
  return 0;
}
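/* For example (illustrative): aarch64_uxt_size (2, 0x3fc) returns 8, since
   0x3fc == 0xff << 2, i.e. a byte mask applied to an operand scaled by 4;
   aarch64_uxt_size (0, 0xffff) returns 16.  */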
/* Constant pools are per function only when PC relative
   literal loads are true or we are in the large memory
   model.  */

static inline bool
aarch64_can_use_per_function_literal_pools_p (void)
{
  return (aarch64_pcrelative_literal_loads
	  || aarch64_cmodel == AARCH64_CMODEL_LARGE);
}
11000 aarch64_use_blocks_for_constant_p (machine_mode
, const_rtx
)
11002 /* We can't use blocks for constants when we're using a per-function
11004 return !aarch64_can_use_per_function_literal_pools_p ();
11007 /* Select appropriate section for constants depending
11008 on where we place literal pools. */
11011 aarch64_select_rtx_section (machine_mode mode
,
11013 unsigned HOST_WIDE_INT align
)
11015 if (aarch64_can_use_per_function_literal_pools_p ())
11016 return function_section (current_function_decl
);
11018 return default_elf_select_rtx_section (mode
, x
, align
);
11021 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
11023 aarch64_asm_output_pool_epilogue (FILE *f
, const char *, tree
,
11024 HOST_WIDE_INT offset
)
11026 /* When using per-function literal pools, we must ensure that any code
11027 section is aligned to the minimal instruction length, lest we get
11028 errors from the assembler re "unaligned instructions". */
11029 if ((offset
& 3) && aarch64_can_use_per_function_literal_pools_p ())
11030 ASM_OUTPUT_ALIGN (f
, 2);
11035 /* Helper function for rtx cost calculation. Strip a shift expression
11036 from X. Returns the inner operand if successful, or the original
11037 expression on failure. */
11039 aarch64_strip_shift (rtx x
)
11043 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
11044 we can convert both to ROR during final output. */
11045 if ((GET_CODE (op
) == ASHIFT
11046 || GET_CODE (op
) == ASHIFTRT
11047 || GET_CODE (op
) == LSHIFTRT
11048 || GET_CODE (op
) == ROTATERT
11049 || GET_CODE (op
) == ROTATE
)
11050 && CONST_INT_P (XEXP (op
, 1)))
11051 return XEXP (op
, 0);
11053 if (GET_CODE (op
) == MULT
11054 && CONST_INT_P (XEXP (op
, 1))
11055 && ((unsigned) exact_log2 (INTVAL (XEXP (op
, 1)))) < 64)
11056 return XEXP (op
, 0);
11061 /* Helper function for rtx cost calculation. Strip an extend
11062 expression from X. Returns the inner operand if successful, or the
11063 original expression on failure. We deal with a number of possible
11064 canonicalization variations here. If STRIP_SHIFT is true, then
11065 we can strip off a shift also. */
11067 aarch64_strip_extend (rtx x
, bool strip_shift
)
11069 scalar_int_mode mode
;
11072 if (!is_a
<scalar_int_mode
> (GET_MODE (op
), &mode
))
11075 /* Zero and sign extraction of a widened value. */
11076 if ((GET_CODE (op
) == ZERO_EXTRACT
|| GET_CODE (op
) == SIGN_EXTRACT
)
11077 && XEXP (op
, 2) == const0_rtx
11078 && GET_CODE (XEXP (op
, 0)) == MULT
11079 && aarch64_is_extend_from_extract (mode
, XEXP (XEXP (op
, 0), 1),
11081 return XEXP (XEXP (op
, 0), 0);
11083 /* It can also be represented (for zero-extend) as an AND with an
11085 if (GET_CODE (op
) == AND
11086 && GET_CODE (XEXP (op
, 0)) == MULT
11087 && CONST_INT_P (XEXP (XEXP (op
, 0), 1))
11088 && CONST_INT_P (XEXP (op
, 1))
11089 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op
, 0), 1))),
11090 INTVAL (XEXP (op
, 1))) != 0)
11091 return XEXP (XEXP (op
, 0), 0);
11093 /* Now handle extended register, as this may also have an optional
11094 left shift by 1..4. */
11096 && GET_CODE (op
) == ASHIFT
11097 && CONST_INT_P (XEXP (op
, 1))
11098 && ((unsigned HOST_WIDE_INT
) INTVAL (XEXP (op
, 1))) <= 4)
11101 if (GET_CODE (op
) == ZERO_EXTEND
11102 || GET_CODE (op
) == SIGN_EXTEND
)
11111 /* Return true iff CODE is a shift supported in combination
11112 with arithmetic instructions. */
11115 aarch64_shift_p (enum rtx_code code
)
11117 return code
== ASHIFT
|| code
== ASHIFTRT
|| code
== LSHIFTRT
;
11121 /* Return true iff X is a cheap shift without a sign extend. */
11124 aarch64_cheap_mult_shift_p (rtx x
)
11131 if (!(aarch64_tune_params
.extra_tuning_flags
11132 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
))
11135 if (GET_CODE (op0
) == SIGN_EXTEND
)
11138 if (GET_CODE (x
) == ASHIFT
&& CONST_INT_P (op1
)
11139 && UINTVAL (op1
) <= 4)
11142 if (GET_CODE (x
) != MULT
|| !CONST_INT_P (op1
))
11145 HOST_WIDE_INT l2
= exact_log2 (INTVAL (op1
));
11147 if (l2
> 0 && l2
<= 4)
11153 /* Helper function for rtx cost calculation. Calculate the cost of
11154 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
11155 Return the calculated cost of the expression, recursing manually in to
11156 operands where needed. */
11159 aarch64_rtx_mult_cost (rtx x
, enum rtx_code code
, int outer
, bool speed
)
11162 const struct cpu_cost_table
*extra_cost
11163 = aarch64_tune_params
.insn_extra_cost
;
11165 bool compound_p
= (outer
== PLUS
|| outer
== MINUS
);
11166 machine_mode mode
= GET_MODE (x
);
11168 gcc_checking_assert (code
== MULT
);
11173 if (VECTOR_MODE_P (mode
))
11174 mode
= GET_MODE_INNER (mode
);
11176 /* Integer multiply/fma. */
11177 if (GET_MODE_CLASS (mode
) == MODE_INT
)
11179 /* The multiply will be canonicalized as a shift, cost it as such. */
11180 if (aarch64_shift_p (GET_CODE (x
))
11181 || (CONST_INT_P (op1
)
11182 && exact_log2 (INTVAL (op1
)) > 0))
11184 bool is_extend
= GET_CODE (op0
) == ZERO_EXTEND
11185 || GET_CODE (op0
) == SIGN_EXTEND
;
11190 /* If the shift is considered cheap,
11191 then don't add any cost. */
11192 if (aarch64_cheap_mult_shift_p (x
))
11194 else if (REG_P (op1
))
11195 /* ARITH + shift-by-register. */
11196 cost
+= extra_cost
->alu
.arith_shift_reg
;
11197 else if (is_extend
)
11198 /* ARITH + extended register. We don't have a cost field
11199 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
11200 cost
+= extra_cost
->alu
.extend_arith
;
11202 /* ARITH + shift-by-immediate. */
11203 cost
+= extra_cost
->alu
.arith_shift
;
11206 /* LSL (immediate). */
11207 cost
+= extra_cost
->alu
.shift
;
11210 /* Strip extends as we will have costed them in the case above. */
11212 op0
= aarch64_strip_extend (op0
, true);
11214 cost
+= rtx_cost (op0
, VOIDmode
, code
, 0, speed
);
11219 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
11220 compound and let the below cases handle it. After all, MNEG is a
11221 special-case alias of MSUB. */
11222 if (GET_CODE (op0
) == NEG
)
11224 op0
= XEXP (op0
, 0);
11228 /* Integer multiplies or FMAs have zero/sign extending variants. */
11229 if ((GET_CODE (op0
) == ZERO_EXTEND
11230 && GET_CODE (op1
) == ZERO_EXTEND
)
11231 || (GET_CODE (op0
) == SIGN_EXTEND
11232 && GET_CODE (op1
) == SIGN_EXTEND
))
11234 cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, MULT
, 0, speed
);
11235 cost
+= rtx_cost (XEXP (op1
, 0), VOIDmode
, MULT
, 1, speed
);
11240 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
11241 cost
+= extra_cost
->mult
[0].extend_add
;
11243 /* MUL/SMULL/UMULL. */
11244 cost
+= extra_cost
->mult
[0].extend
;
11250 /* This is either an integer multiply or a MADD. In both cases
11251 we want to recurse and cost the operands. */
11252 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
11253 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
11259 cost
+= extra_cost
->mult
[mode
== DImode
].add
;
11262 cost
+= extra_cost
->mult
[mode
== DImode
].simple
;
11271 /* Floating-point FMA/FMUL can also support negations of the
11272 operands, unless the rounding mode is upward or downward in
     which case FNMUL is different from FMUL with operand negation.  */
11274 bool neg0
= GET_CODE (op0
) == NEG
;
11275 bool neg1
= GET_CODE (op1
) == NEG
;
11276 if (compound_p
|| !flag_rounding_math
|| (neg0
&& neg1
))
11279 op0
= XEXP (op0
, 0);
11281 op1
= XEXP (op1
, 0);
11285 /* FMADD/FNMADD/FNMSUB/FMSUB. */
11286 cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
11289 cost
+= extra_cost
->fp
[mode
== DFmode
].mult
;
11292 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
11293 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
11299 aarch64_address_cost (rtx x
,
11301 addr_space_t as ATTRIBUTE_UNUSED
,
11304 enum rtx_code c
= GET_CODE (x
);
11305 const struct cpu_addrcost_table
*addr_cost
= aarch64_tune_params
.addr_cost
;
11306 struct aarch64_address_info info
;
11310 if (!aarch64_classify_address (&info
, x
, mode
, false))
11312 if (GET_CODE (x
) == CONST
|| GET_CODE (x
) == SYMBOL_REF
)
11314 /* This is a CONST or SYMBOL ref which will be split
11315 in a different way depending on the code model in use.
11316 Cost it through the generic infrastructure. */
11317 int cost_symbol_ref
= rtx_cost (x
, Pmode
, MEM
, 1, speed
);
11318 /* Divide through by the cost of one instruction to
11319 bring it to the same units as the address costs. */
11320 cost_symbol_ref
/= COSTS_N_INSNS (1);
11321 /* The cost is then the cost of preparing the address,
11322 followed by an immediate (possibly 0) offset. */
11323 return cost_symbol_ref
+ addr_cost
->imm_offset
;
11327 /* This is most likely a jump table from a case
11329 return addr_cost
->register_offset
;
11335 case ADDRESS_LO_SUM
:
11336 case ADDRESS_SYMBOLIC
:
11337 case ADDRESS_REG_IMM
:
11338 cost
+= addr_cost
->imm_offset
;
11341 case ADDRESS_REG_WB
:
11342 if (c
== PRE_INC
|| c
== PRE_DEC
|| c
== PRE_MODIFY
)
11343 cost
+= addr_cost
->pre_modify
;
11344 else if (c
== POST_INC
|| c
== POST_DEC
|| c
== POST_MODIFY
)
11345 cost
+= addr_cost
->post_modify
;
11347 gcc_unreachable ();
11351 case ADDRESS_REG_REG
:
11352 cost
+= addr_cost
->register_offset
;
11355 case ADDRESS_REG_SXTW
:
11356 cost
+= addr_cost
->register_sextend
;
11359 case ADDRESS_REG_UXTW
:
11360 cost
+= addr_cost
->register_zextend
;
11364 gcc_unreachable ();
11368 if (info
.shift
> 0)
11370 /* For the sake of calculating the cost of the shifted register
11371 component, we can treat same sized modes in the same way. */
11372 if (known_eq (GET_MODE_BITSIZE (mode
), 16))
11373 cost
+= addr_cost
->addr_scale_costs
.hi
;
11374 else if (known_eq (GET_MODE_BITSIZE (mode
), 32))
11375 cost
+= addr_cost
->addr_scale_costs
.si
;
11376 else if (known_eq (GET_MODE_BITSIZE (mode
), 64))
11377 cost
+= addr_cost
->addr_scale_costs
.di
;
11379 /* We can't tell, or this is a 128-bit vector. */
11380 cost
+= addr_cost
->addr_scale_costs
.ti
;
11386 /* Return the cost of a branch. If SPEED_P is true then the compiler is
11387 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
11391 aarch64_branch_cost (bool speed_p
, bool predictable_p
)
11393 /* When optimizing for speed, use the cost of unpredictable branches. */
11394 const struct cpu_branch_cost
*branch_costs
=
11395 aarch64_tune_params
.branch_costs
;
11397 if (!speed_p
|| predictable_p
)
11398 return branch_costs
->predictable
;
11400 return branch_costs
->unpredictable
;
11403 /* Return true if the RTX X in mode MODE is a zero or sign extract
11404 usable in an ADD or SUB (extended register) instruction. */
11406 aarch64_rtx_arith_op_extract_p (rtx x
, scalar_int_mode mode
)
11408 /* Catch add with a sign extract.
11409 This is add_<optab><mode>_multp2. */
11410 if (GET_CODE (x
) == SIGN_EXTRACT
11411 || GET_CODE (x
) == ZERO_EXTRACT
)
11413 rtx op0
= XEXP (x
, 0);
11414 rtx op1
= XEXP (x
, 1);
11415 rtx op2
= XEXP (x
, 2);
11417 if (GET_CODE (op0
) == MULT
11418 && CONST_INT_P (op1
)
11419 && op2
== const0_rtx
11420 && CONST_INT_P (XEXP (op0
, 1))
11421 && aarch64_is_extend_from_extract (mode
,
11428 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
11430 else if (GET_CODE (x
) == SIGN_EXTEND
11431 || GET_CODE (x
) == ZERO_EXTEND
)
11432 return REG_P (XEXP (x
, 0));
11438 aarch64_frint_unspec_p (unsigned int u
)
11442 case UNSPEC_FRINTZ
:
11443 case UNSPEC_FRINTP
:
11444 case UNSPEC_FRINTM
:
11445 case UNSPEC_FRINTA
:
11446 case UNSPEC_FRINTN
:
11447 case UNSPEC_FRINTX
:
11448 case UNSPEC_FRINTI
:
/* Return true iff X is an rtx that will match an extr instruction
   i.e. as described in the *extr<mode>5_insn family of patterns.
   OP0 and OP1 will be set to the operands of the shifts involved
   on success and will be NULL_RTX otherwise.  */

static bool
aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
{
  rtx op0, op1;
  scalar_int_mode mode;
  if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
    return false;

  *res_op0 = NULL_RTX;
  *res_op1 = NULL_RTX;

  if (GET_CODE (x) != IOR)
    return false;

  op0 = XEXP (x, 0);
  op1 = XEXP (x, 1);

  if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
      || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
    {
      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
      if (GET_CODE (op1) == ASHIFT)
	std::swap (op0, op1);

      if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
	return false;

      unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
      unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));

      if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
	  && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
	{
	  *res_op0 = XEXP (op0, 0);
	  *res_op1 = XEXP (op1, 0);
	  return true;
	}
    }

  return false;
}
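/* Illustrative example in SImode: (ior (ashift X 10) (lshiftrt Y 22)) has
   shift amounts summing to 32, the mode bitsize, so the function succeeds
   with *RES_OP0 = X and *RES_OP1 = Y (the matching pattern then uses the
   right-shift amount, 22, as the EXTR immediate).  */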
11503 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
11504 storing it in *COST. Result is true if the total cost of the operation
11505 has now been calculated. */
11507 aarch64_if_then_else_costs (rtx op0
, rtx op1
, rtx op2
, int *cost
, bool speed
)
11511 enum rtx_code cmpcode
;
11512 const struct cpu_cost_table
*extra_cost
11513 = aarch64_tune_params
.insn_extra_cost
;
11515 if (COMPARISON_P (op0
))
11517 inner
= XEXP (op0
, 0);
11518 comparator
= XEXP (op0
, 1);
11519 cmpcode
= GET_CODE (op0
);
11524 comparator
= const0_rtx
;
11528 if (GET_CODE (op1
) == PC
|| GET_CODE (op2
) == PC
)
11530 /* Conditional branch. */
11531 if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
11535 if (cmpcode
== NE
|| cmpcode
== EQ
)
11537 if (comparator
== const0_rtx
)
11539 /* TBZ/TBNZ/CBZ/CBNZ. */
11540 if (GET_CODE (inner
) == ZERO_EXTRACT
)
11542 *cost
+= rtx_cost (XEXP (inner
, 0), VOIDmode
,
11543 ZERO_EXTRACT
, 0, speed
);
11546 *cost
+= rtx_cost (inner
, VOIDmode
, cmpcode
, 0, speed
);
11550 if (register_operand (inner
, VOIDmode
)
11551 && aarch64_imm24 (comparator
, VOIDmode
))
11553 /* SUB and SUBS. */
11554 *cost
+= COSTS_N_INSNS (2);
11556 *cost
+= extra_cost
->alu
.arith
* 2;
11560 else if (cmpcode
== LT
|| cmpcode
== GE
)
11563 if (comparator
== const0_rtx
)
11568 else if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
11571 if (GET_CODE (op1
) == COMPARE
)
11573 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
11574 if (XEXP (op1
, 1) == const0_rtx
)
11578 machine_mode mode
= GET_MODE (XEXP (op1
, 0));
11579 const struct cpu_cost_table
*extra_cost
11580 = aarch64_tune_params
.insn_extra_cost
;
11582 if (GET_MODE_CLASS (mode
) == MODE_INT
)
11583 *cost
+= extra_cost
->alu
.arith
;
11585 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
11590 /* It's a conditional operation based on the status flags,
11591 so it must be some flavor of CSEL. */
11593 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
11594 if (GET_CODE (op1
) == NEG
11595 || GET_CODE (op1
) == NOT
11596 || (GET_CODE (op1
) == PLUS
&& XEXP (op1
, 1) == const1_rtx
))
11597 op1
= XEXP (op1
, 0);
11598 else if (GET_CODE (op1
) == ZERO_EXTEND
&& GET_CODE (op2
) == ZERO_EXTEND
)
11600 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
11601 op1
= XEXP (op1
, 0);
11602 op2
= XEXP (op2
, 0);
11605 *cost
+= rtx_cost (op1
, VOIDmode
, IF_THEN_ELSE
, 1, speed
);
11606 *cost
+= rtx_cost (op2
, VOIDmode
, IF_THEN_ELSE
, 2, speed
);
11610 /* We don't know what this is, cost all operands. */
11614 /* Check whether X is a bitfield operation of the form shift + extend that
11615 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
11616 operand to which the bitfield operation is applied. Otherwise return
11620 aarch64_extend_bitfield_pattern_p (rtx x
)
11622 rtx_code outer_code
= GET_CODE (x
);
11623 machine_mode outer_mode
= GET_MODE (x
);
11625 if (outer_code
!= ZERO_EXTEND
&& outer_code
!= SIGN_EXTEND
11626 && outer_mode
!= SImode
&& outer_mode
!= DImode
)
11629 rtx inner
= XEXP (x
, 0);
11630 rtx_code inner_code
= GET_CODE (inner
);
11631 machine_mode inner_mode
= GET_MODE (inner
);
11634 switch (inner_code
)
11637 if (CONST_INT_P (XEXP (inner
, 1))
11638 && (inner_mode
== QImode
|| inner_mode
== HImode
))
11639 op
= XEXP (inner
, 0);
11642 if (outer_code
== ZERO_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
11643 && (inner_mode
== QImode
|| inner_mode
== HImode
))
11644 op
= XEXP (inner
, 0);
11647 if (outer_code
== SIGN_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
11648 && (inner_mode
== QImode
|| inner_mode
== HImode
))
11649 op
= XEXP (inner
, 0);
/* Return true if the mask and a shift amount from an RTX of the form
   (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
   mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */

bool
aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
				    rtx shft_amnt)
{
  return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
	 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
	 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
	 && (INTVAL (mask)
	     & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
}
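/* Illustrative example in SImode: a mask of 0x7f8 with a shift amount of 3
   is accepted, since 0x7f8 >> 3 == 0xff is contiguous from bit 0 and no
   mask bits lie below the shift, so (x << 3) & 0x7f8 maps to one UBFIZ.  */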
/* Return true if the masks and a shift amount from an RTX of the form
   ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
   a BFI instruction of mode MODE.  See *arch64_bfi patterns.  */

bool
aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
				   unsigned HOST_WIDE_INT mask1,
				   unsigned HOST_WIDE_INT shft_amnt,
				   unsigned HOST_WIDE_INT mask2)
{
  unsigned HOST_WIDE_INT t;

  /* Verify that there is no overlap in what bits are set in the two masks.  */
  if (mask1 != ~mask2)
    return false;

  /* Verify that mask2 is not all zeros or ones.  */
  if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
    return false;

  /* The shift amount should always be less than the mode size.  */
  gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));

  /* Verify that the mask being shifted is contiguous and would be in the
     least significant bits after shifting by shft_amnt.  */
  t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
  return (t == (t & -t));
}
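/* Illustrative example in SImode: mask1 == 0xffffffffffff0000 (i.e. ~0xffff
   as an unsigned HOST_WIDE_INT), shft_amnt == 0 and mask2 == 0xffff satisfy
   every check above: the masks are complementary, mask2 is neither 0 nor all
   ones, and mask2 + 1 == 0x10000 has a single bit set, so the insertion can
   use one BFI.  */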
11702 /* Calculate the cost of calculating X, storing it in *COST. Result
11703 is true if the total cost of the operation has now been calculated. */
11705 aarch64_rtx_costs (rtx x
, machine_mode mode
, int outer ATTRIBUTE_UNUSED
,
11706 int param ATTRIBUTE_UNUSED
, int *cost
, bool speed
)
11709 const struct cpu_cost_table
*extra_cost
11710 = aarch64_tune_params
.insn_extra_cost
;
11711 int code
= GET_CODE (x
);
11712 scalar_int_mode int_mode
;
11714 /* By default, assume that everything has equivalent cost to the
11715 cheapest instruction. Any additional costs are applied as a delta
11716 above this default. */
11717 *cost
= COSTS_N_INSNS (1);
11722 /* The cost depends entirely on the operands to SET. */
11724 op0
= SET_DEST (x
);
11727 switch (GET_CODE (op0
))
11732 rtx address
= XEXP (op0
, 0);
11733 if (VECTOR_MODE_P (mode
))
11734 *cost
+= extra_cost
->ldst
.storev
;
11735 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
11736 *cost
+= extra_cost
->ldst
.store
;
11737 else if (mode
== SFmode
)
11738 *cost
+= extra_cost
->ldst
.storef
;
11739 else if (mode
== DFmode
)
11740 *cost
+= extra_cost
->ldst
.stored
;
11743 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
11747 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
11751 if (! REG_P (SUBREG_REG (op0
)))
11752 *cost
+= rtx_cost (SUBREG_REG (op0
), VOIDmode
, SET
, 0, speed
);
11754 /* Fall through. */
11756 /* The cost is one per vector-register copied. */
11757 if (VECTOR_MODE_P (GET_MODE (op0
)) && REG_P (op1
))
11759 int nregs
= aarch64_hard_regno_nregs (V0_REGNUM
, GET_MODE (op0
));
11760 *cost
= COSTS_N_INSNS (nregs
);
11762 /* const0_rtx is in general free, but we will use an
11763 instruction to set a register to 0. */
11764 else if (REG_P (op1
) || op1
== const0_rtx
)
11766 /* The cost is 1 per register copied. */
11767 int nregs
= aarch64_hard_regno_nregs (R0_REGNUM
, GET_MODE (op0
));
11768 *cost
= COSTS_N_INSNS (nregs
);
11771 /* Cost is just the cost of the RHS of the set. */
11772 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
11777 /* Bit-field insertion. Strip any redundant widening of
11778 the RHS to meet the width of the target. */
11779 if (GET_CODE (op1
) == SUBREG
)
11780 op1
= SUBREG_REG (op1
);
11781 if ((GET_CODE (op1
) == ZERO_EXTEND
11782 || GET_CODE (op1
) == SIGN_EXTEND
)
11783 && CONST_INT_P (XEXP (op0
, 1))
11784 && is_a
<scalar_int_mode
> (GET_MODE (XEXP (op1
, 0)), &int_mode
)
11785 && GET_MODE_BITSIZE (int_mode
) >= INTVAL (XEXP (op0
, 1)))
11786 op1
= XEXP (op1
, 0);
11788 if (CONST_INT_P (op1
))
11790 /* MOV immediate is assumed to always be cheap. */
11791 *cost
= COSTS_N_INSNS (1);
11797 *cost
+= extra_cost
->alu
.bfi
;
11798 *cost
+= rtx_cost (op1
, VOIDmode
, (enum rtx_code
) code
, 1, speed
);
11804 /* We can't make sense of this, assume default cost. */
11805 *cost
= COSTS_N_INSNS (1);
11811 /* If an instruction can incorporate a constant within the
11812 instruction, the instruction's expression avoids calling
11813 rtx_cost() on the constant. If rtx_cost() is called on a
11814 constant, then it is usually because the constant must be
11815 moved into a register by one or more instructions.
11817 The exception is constant 0, which can be expressed
11818 as XZR/WZR and is therefore free. The exception to this is
11819 if we have (set (reg) (const0_rtx)) in which case we must cost
11820 the move. However, we can catch that when we cost the SET, so
11821 we don't need to consider that here. */
11822 if (x
== const0_rtx
)
11826 /* To an approximation, building any other constant is
11827 proportionally expensive to the number of instructions
11828 required to build that constant. This is true whether we
11829 are compiling for SPEED or otherwise. */
11830 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
11831 int_mode
= word_mode
;
11832 *cost
= COSTS_N_INSNS (aarch64_internal_mov_immediate
11833 (NULL_RTX
, x
, false, int_mode
));
      /* First determine number of instructions to do the move
         as an integer constant.  */
      if (!aarch64_float_const_representable_p (x)
          && !aarch64_can_const_movi_rtx_p (x, mode)
          && aarch64_float_const_rtx_p (x))
        {
          unsigned HOST_WIDE_INT ival;
          bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
          gcc_assert (succeed);

          scalar_int_mode imode = (mode == HFmode
                                   ? SImode
                                   : int_mode_for_mode (mode).require ());
          int ncost = aarch64_internal_mov_immediate
                (NULL_RTX, gen_int_mode (ival, imode), false, imode);
          *cost += COSTS_N_INSNS (ncost);
        }

          /* mov[df,sf]_aarch64.  */
          if (aarch64_float_const_representable_p (x))
            /* FMOV (scalar immediate).  */
            *cost += extra_cost->fp[mode == DFmode].fpconst;
          else if (!aarch64_float_const_zero_rtx_p (x))
            {
              /* This will be a load from memory.  */
              if (mode == DFmode)
                *cost += extra_cost->ldst.loadd;
              else
                *cost += extra_cost->ldst.loadf;
            }

          /* Otherwise this is +0.0.  We get this using MOVI d0, #0
             or MOV v0.s[0], wzr - neither of which are modeled by the
             cost tables.  Just use the default cost.  */
          /* For loads we want the base cost of a load, plus an
             approximation for the additional cost of the addressing
             mode.  */
          rtx address = XEXP (x, 0);
          if (VECTOR_MODE_P (mode))
            *cost += extra_cost->ldst.loadv;
          else if (GET_MODE_CLASS (mode) == MODE_INT)
            *cost += extra_cost->ldst.load;
          else if (mode == SFmode)
            *cost += extra_cost->ldst.loadf;
          else if (mode == DFmode)
            *cost += extra_cost->ldst.loadd;

          *cost +=
            COSTS_N_INSNS (aarch64_address_cost (address, mode,
                                                 0, speed));
11908 if (VECTOR_MODE_P (mode
))
11913 *cost
+= extra_cost
->vect
.alu
;
11918 if (GET_MODE_CLASS (mode
) == MODE_INT
)
11920 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
11921 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
11924 *cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, NEG
, 0, speed
);
11928 /* Cost this as SUB wzr, X. */
11929 op0
= CONST0_RTX (mode
);
11934 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
11936 /* Support (neg(fma...)) as a single instruction only if
11937 sign of zeros is unimportant. This matches the decision
11938 making in aarch64.md. */
11939 if (GET_CODE (op0
) == FMA
&& !HONOR_SIGNED_ZEROS (GET_MODE (op0
)))
11942 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
11945 if (GET_CODE (op0
) == MULT
)
11948 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
11953 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
11963 if (VECTOR_MODE_P (mode
))
11964 *cost
+= extra_cost
->vect
.alu
;
11966 *cost
+= extra_cost
->alu
.clz
;
11972 *cost
= COSTS_N_INSNS (2);
11975 *cost
+= extra_cost
->alu
.clz
+ extra_cost
->alu
.rev
;
11982 if (op1
== const0_rtx
11983 && GET_CODE (op0
) == AND
)
11986 mode
= GET_MODE (op0
);
11990 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
)
11992 /* TODO: A write to the CC flags possibly costs extra, this
11993 needs encoding in the cost tables. */
11995 mode
= GET_MODE (op0
);
11997 if (GET_CODE (op0
) == AND
)
12003 if (GET_CODE (op0
) == PLUS
)
12005 /* ADDS (and CMN alias). */
12010 if (GET_CODE (op0
) == MINUS
)
12017 if (GET_CODE (op0
) == ZERO_EXTRACT
&& op1
== const0_rtx
12018 && GET_MODE (x
) == CC_NZmode
&& CONST_INT_P (XEXP (op0
, 1))
12019 && CONST_INT_P (XEXP (op0
, 2)))
12021 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
12022 Handle it here directly rather than going to cost_logic
12023 since we know the immediate generated for the TST is valid
12024 so we can avoid creating an intermediate rtx for it only
12025 for costing purposes. */
12027 *cost
+= extra_cost
->alu
.logical
;
12029 *cost
+= rtx_cost (XEXP (op0
, 0), GET_MODE (op0
),
12030 ZERO_EXTRACT
, 0, speed
);
12034 if (GET_CODE (op1
) == NEG
)
12038 *cost
+= extra_cost
->alu
.arith
;
12040 *cost
+= rtx_cost (op0
, mode
, COMPARE
, 0, speed
);
12041 *cost
+= rtx_cost (XEXP (op1
, 0), mode
, NEG
, 1, speed
);
12047 Compare can freely swap the order of operands, and
12048 canonicalization puts the more complex operation first.
12049 But the integer MINUS logic expects the shift/extend
12050 operation in op1. */
12052 || (GET_CODE (op0
) == SUBREG
&& REG_P (SUBREG_REG (op0
)))))
12060 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_FLOAT
)
12064 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
12066 if (CONST_DOUBLE_P (op1
) && aarch64_float_const_zero_rtx_p (op1
))
12068 *cost
+= rtx_cost (op0
, VOIDmode
, COMPARE
, 0, speed
);
12069 /* FCMP supports constant 0.0 for no extra cost. */
12075 if (VECTOR_MODE_P (mode
))
12077 /* Vector compare. */
12079 *cost
+= extra_cost
->vect
.alu
;
12081 if (aarch64_float_const_zero_rtx_p (op1
))
12083 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
12097 *cost
+= rtx_cost (op0
, mode
, MINUS
, 0, speed
);
12099 /* Detect valid immediates. */
12100 if ((GET_MODE_CLASS (mode
) == MODE_INT
12101 || (GET_MODE_CLASS (mode
) == MODE_CC
12102 && GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
))
12103 && CONST_INT_P (op1
)
12104 && aarch64_uimm12_shift (INTVAL (op1
)))
12107 /* SUB(S) (immediate). */
12108 *cost
+= extra_cost
->alu
.arith
;
12112 /* Look for SUB (extended register). */
12113 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
12114 && aarch64_rtx_arith_op_extract_p (op1
, int_mode
))
12117 *cost
+= extra_cost
->alu
.extend_arith
;
12119 op1
= aarch64_strip_extend (op1
, true);
12120 *cost
+= rtx_cost (op1
, VOIDmode
,
12121 (enum rtx_code
) GET_CODE (op1
), 0, speed
);
12125 rtx new_op1
= aarch64_strip_extend (op1
, false);
12127 /* Cost this as an FMA-alike operation. */
12128 if ((GET_CODE (new_op1
) == MULT
12129 || aarch64_shift_p (GET_CODE (new_op1
)))
12130 && code
!= COMPARE
)
12132 *cost
+= aarch64_rtx_mult_cost (new_op1
, MULT
,
12133 (enum rtx_code
) code
,
12138 *cost
+= rtx_cost (new_op1
, VOIDmode
, MINUS
, 1, speed
);
12142 if (VECTOR_MODE_P (mode
))
12145 *cost
+= extra_cost
->vect
.alu
;
12147 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
12150 *cost
+= extra_cost
->alu
.arith
;
12152 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
12155 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
12169 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
12170 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
12173 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, PLUS
, 0, speed
);
12174 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
12178 if (GET_MODE_CLASS (mode
) == MODE_INT
12179 && (aarch64_plus_immediate (op1
, mode
)
12180 || aarch64_sve_addvl_addpl_immediate (op1
, mode
)))
12182 *cost
+= rtx_cost (op0
, mode
, PLUS
, 0, speed
);
12185 /* ADD (immediate). */
12186 *cost
+= extra_cost
->alu
.arith
;
12190 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
12192 /* Look for ADD (extended register). */
12193 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
12194 && aarch64_rtx_arith_op_extract_p (op0
, int_mode
))
12197 *cost
+= extra_cost
->alu
.extend_arith
;
12199 op0
= aarch64_strip_extend (op0
, true);
12200 *cost
+= rtx_cost (op0
, VOIDmode
,
12201 (enum rtx_code
) GET_CODE (op0
), 0, speed
);
12205 /* Strip any extend, leave shifts behind as we will
12206 cost them through mult_cost. */
12207 new_op0
= aarch64_strip_extend (op0
, false);
12209 if (GET_CODE (new_op0
) == MULT
12210 || aarch64_shift_p (GET_CODE (new_op0
)))
12212 *cost
+= aarch64_rtx_mult_cost (new_op0
, MULT
, PLUS
,
12217 *cost
+= rtx_cost (new_op0
, VOIDmode
, PLUS
, 0, speed
);
12221 if (VECTOR_MODE_P (mode
))
12224 *cost
+= extra_cost
->vect
.alu
;
12226 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
12229 *cost
+= extra_cost
->alu
.arith
;
12231 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
12234 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
12241 *cost
= COSTS_N_INSNS (1);
12245 if (VECTOR_MODE_P (mode
))
12246 *cost
+= extra_cost
->vect
.alu
;
12248 *cost
+= extra_cost
->alu
.rev
;
12253 if (aarch_rev16_p (x
))
12255 *cost
= COSTS_N_INSNS (1);
12259 if (VECTOR_MODE_P (mode
))
12260 *cost
+= extra_cost
->vect
.alu
;
12262 *cost
+= extra_cost
->alu
.rev
;
12267 if (aarch64_extr_rtx_p (x
, &op0
, &op1
))
12269 *cost
+= rtx_cost (op0
, mode
, IOR
, 0, speed
);
12270 *cost
+= rtx_cost (op1
, mode
, IOR
, 1, speed
);
12272 *cost
+= extra_cost
->alu
.shift
;
12276 /* Fall through. */
12283 if (VECTOR_MODE_P (mode
))
12286 *cost
+= extra_cost
->vect
.alu
;
12291 && GET_CODE (op0
) == MULT
12292 && CONST_INT_P (XEXP (op0
, 1))
12293 && CONST_INT_P (op1
)
12294 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0
, 1))),
12295 INTVAL (op1
)) != 0)
12297 /* This is a UBFM/SBFM. */
12298 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, ZERO_EXTRACT
, 0, speed
);
12300 *cost
+= extra_cost
->alu
.bfx
;
12304 if (is_int_mode (mode
, &int_mode
))
12306 if (CONST_INT_P (op1
))
12308 /* We have a mask + shift version of a UBFIZ
12309 i.e. the *andim_ashift<mode>_bfiz pattern. */
12310 if (GET_CODE (op0
) == ASHIFT
12311 && aarch64_mask_and_shift_for_ubfiz_p (int_mode
, op1
,
12314 *cost
+= rtx_cost (XEXP (op0
, 0), int_mode
,
12315 (enum rtx_code
) code
, 0, speed
);
12317 *cost
+= extra_cost
->alu
.bfx
;
12321 else if (aarch64_bitmask_imm (INTVAL (op1
), int_mode
))
12323 /* We possibly get the immediate for free, this is not
12325 *cost
+= rtx_cost (op0
, int_mode
,
12326 (enum rtx_code
) code
, 0, speed
);
12328 *cost
+= extra_cost
->alu
.logical
;
12337 /* Handle ORN, EON, or BIC. */
12338 if (GET_CODE (op0
) == NOT
)
12339 op0
= XEXP (op0
, 0);
12341 new_op0
= aarch64_strip_shift (op0
);
12343 /* If we had a shift on op0 then this is a logical-shift-
12344 by-register/immediate operation. Otherwise, this is just
12345 a logical operation. */
12348 if (new_op0
!= op0
)
12350 /* Shift by immediate. */
12351 if (CONST_INT_P (XEXP (op0
, 1)))
12352 *cost
+= extra_cost
->alu
.log_shift
;
12354 *cost
+= extra_cost
->alu
.log_shift_reg
;
12357 *cost
+= extra_cost
->alu
.logical
;
12360 /* In both cases we want to cost both operands. */
12361 *cost
+= rtx_cost (new_op0
, int_mode
, (enum rtx_code
) code
,
12363 *cost
+= rtx_cost (op1
, int_mode
, (enum rtx_code
) code
,
12373 op0
= aarch64_strip_shift (x
);
12375 if (VECTOR_MODE_P (mode
))
12378 *cost
+= extra_cost
->vect
.alu
;
12382 /* MVN-shifted-reg. */
12385 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
12388 *cost
+= extra_cost
->alu
.log_shift
;
12392 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
12393 Handle the second form here taking care that 'a' in the above can
12395 else if (GET_CODE (op0
) == XOR
)
12397 rtx newop0
= XEXP (op0
, 0);
12398 rtx newop1
= XEXP (op0
, 1);
12399 rtx op0_stripped
= aarch64_strip_shift (newop0
);
12401 *cost
+= rtx_cost (newop1
, mode
, (enum rtx_code
) code
, 1, speed
);
12402 *cost
+= rtx_cost (op0_stripped
, mode
, XOR
, 0, speed
);
12406 if (op0_stripped
!= newop0
)
12407 *cost
+= extra_cost
->alu
.log_shift
;
12409 *cost
+= extra_cost
->alu
.logical
;
12416 *cost
+= extra_cost
->alu
.logical
;
12423 /* If a value is written in SI mode, then zero extended to DI
12424 mode, the operation will in general be free as a write to
12425 a 'w' register implicitly zeroes the upper bits of an 'x'
12426 register. However, if this is
12428 (set (reg) (zero_extend (reg)))
12430 we must cost the explicit register move. */
12432 && GET_MODE (op0
) == SImode
12435 int op_cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, 0, speed
);
12437 /* If OP_COST is non-zero, then the cost of the zero extend
12438 is effectively the cost of the inner operation. Otherwise
12439 we have a MOV instruction and we take the cost from the MOV
12440 itself. This is true independently of whether we are
12441 optimizing for space or time. */
12447 else if (MEM_P (op0
))
12449 /* All loads can zero extend to any size for free. */
12450 *cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, param
, speed
);
12454 op0
= aarch64_extend_bitfield_pattern_p (x
);
12457 *cost
+= rtx_cost (op0
, mode
, ZERO_EXTEND
, 0, speed
);
12459 *cost
+= extra_cost
->alu
.bfx
;
12465 if (VECTOR_MODE_P (mode
))
12468 *cost
+= extra_cost
->vect
.alu
;
12472 /* We generate an AND instead of UXTB/UXTH. */
12473 *cost
+= extra_cost
->alu
.logical
;
12479 if (MEM_P (XEXP (x
, 0)))
12484 rtx address
= XEXP (XEXP (x
, 0), 0);
12485 *cost
+= extra_cost
->ldst
.load_sign_extend
;
12488 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
12494 op0
= aarch64_extend_bitfield_pattern_p (x
);
12497 *cost
+= rtx_cost (op0
, mode
, SIGN_EXTEND
, 0, speed
);
12499 *cost
+= extra_cost
->alu
.bfx
;
12505 if (VECTOR_MODE_P (mode
))
12506 *cost
+= extra_cost
->vect
.alu
;
12508 *cost
+= extra_cost
->alu
.extend
;
12516 if (CONST_INT_P (op1
))
12520 if (VECTOR_MODE_P (mode
))
12522 /* Vector shift (immediate). */
12523 *cost
+= extra_cost
->vect
.alu
;
              /* LSL (immediate), UBFM, UBFIZ and friends.  These are all
                 aliases.  */
                *cost += extra_cost->alu.shift;
12533 /* We can incorporate zero/sign extend for free. */
12534 if (GET_CODE (op0
) == ZERO_EXTEND
12535 || GET_CODE (op0
) == SIGN_EXTEND
)
12536 op0
= XEXP (op0
, 0);
12538 *cost
+= rtx_cost (op0
, VOIDmode
, ASHIFT
, 0, speed
);
12543 if (VECTOR_MODE_P (mode
))
12546 /* Vector shift (register). */
12547 *cost
+= extra_cost
->vect
.alu
;
12553 *cost
+= extra_cost
->alu
.shift_reg
;
12555 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
12556 && CONST_INT_P (XEXP (op1
, 1))
12557 && known_eq (INTVAL (XEXP (op1
, 1)),
12558 GET_MODE_BITSIZE (mode
) - 1))
12560 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
12561 /* We already demanded XEXP (op1, 0) to be REG_P, so
12562 don't recurse into it. */
12566 return false; /* All arguments need to be in registers. */
12576 if (CONST_INT_P (op1
))
12578 /* ASR (immediate) and friends. */
12581 if (VECTOR_MODE_P (mode
))
12582 *cost
+= extra_cost
->vect
.alu
;
12584 *cost
+= extra_cost
->alu
.shift
;
12587 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
12592 if (VECTOR_MODE_P (mode
))
12595 /* Vector shift (register). */
12596 *cost
+= extra_cost
->vect
.alu
;
12601 /* ASR (register) and friends. */
12602 *cost
+= extra_cost
->alu
.shift_reg
;
12604 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
12605 && CONST_INT_P (XEXP (op1
, 1))
12606 && known_eq (INTVAL (XEXP (op1
, 1)),
12607 GET_MODE_BITSIZE (mode
) - 1))
12609 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
12610 /* We already demanded XEXP (op1, 0) to be REG_P, so
12611 don't recurse into it. */
12615 return false; /* All arguments need to be in registers. */
12620 if (aarch64_cmodel
== AARCH64_CMODEL_LARGE
12621 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
)
12625 *cost
+= extra_cost
->ldst
.load
;
12627 else if (aarch64_cmodel
== AARCH64_CMODEL_SMALL
12628 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_PIC
)
12630 /* ADRP, followed by ADD. */
12631 *cost
+= COSTS_N_INSNS (1);
12633 *cost
+= 2 * extra_cost
->alu
.arith
;
12635 else if (aarch64_cmodel
== AARCH64_CMODEL_TINY
12636 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
12640 *cost
+= extra_cost
->alu
.arith
;
12645 /* One extra load instruction, after accessing the GOT. */
12646 *cost
+= COSTS_N_INSNS (1);
12648 *cost
+= extra_cost
->ldst
.load
;
12654 /* ADRP/ADD (immediate). */
12656 *cost
+= extra_cost
->alu
.arith
;
12664 if (VECTOR_MODE_P (mode
))
12665 *cost
+= extra_cost
->vect
.alu
;
12667 *cost
+= extra_cost
->alu
.bfx
;
12670 /* We can trust that the immediates used will be correct (there
12671 are no by-register forms), so we need only cost op0. */
12672 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (enum rtx_code
) code
, 0, speed
);
12676 *cost
+= aarch64_rtx_mult_cost (x
, MULT
, 0, speed
);
12677 /* aarch64_rtx_mult_cost always handles recursion to its
12682 /* We can expand signed mod by power of 2 using a NEGS, two parallel
12683 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
12684 an unconditional negate. This case should only ever be reached through
12685 the set_smod_pow2_cheap check in expmed.c. */
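      /* As an illustration (register numbers chosen arbitrarily), a signed
         "x % 4" is expected to expand to something along the lines of

             negs  w1, w0
             and   w0, w0, 3
             and   w1, w1, 3
             csneg w0, w0, w1, mi

         i.e. four instructions, which is why the baseline cost below is
         reset to COSTS_N_INSNS (4).  */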
12686 if (CONST_INT_P (XEXP (x
, 1))
12687 && exact_log2 (INTVAL (XEXP (x
, 1))) > 0
12688 && (mode
== SImode
|| mode
== DImode
))
12690 /* We expand to 4 instructions. Reset the baseline. */
12691 *cost
= COSTS_N_INSNS (4);
12694 *cost
+= 2 * extra_cost
->alu
.logical
12695 + 2 * extra_cost
->alu
.arith
;
12700 /* Fall-through. */
      /* Slightly prefer UMOD over SMOD.  */
      if (VECTOR_MODE_P (mode))
        *cost += extra_cost->vect.alu;
      else if (GET_MODE_CLASS (mode) == MODE_INT)
        *cost += (extra_cost->mult[mode == DImode].add
                  + extra_cost->mult[mode == DImode].idiv
                  + (code == MOD ? 1 : 0));
12712 return false; /* All arguments need to be in registers. */
      if (VECTOR_MODE_P (mode))
        *cost += extra_cost->vect.alu;
      else if (GET_MODE_CLASS (mode) == MODE_INT)
        /* There is no integer SQRT, so only DIV and UDIV can get
           here.  */
        *cost += (extra_cost->mult[mode == DImode].idiv
                  /* Slightly prefer UDIV over SDIV.  */
                  + (code == DIV ? 1 : 0));
      else
        *cost += extra_cost->fp[mode == DFmode].div;

      return false;  /* All arguments need to be in registers.  */
12733 return aarch64_if_then_else_costs (XEXP (x
, 0), XEXP (x
, 1),
12734 XEXP (x
, 2), cost
, speed
);
12747 return false; /* All arguments must be in registers. */
12756 if (VECTOR_MODE_P (mode
))
12757 *cost
+= extra_cost
->vect
.alu
;
12759 *cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
12762 /* FMSUB, FNMADD, and FNMSUB are free. */
12763 if (GET_CODE (op0
) == NEG
)
12764 op0
= XEXP (op0
, 0);
12766 if (GET_CODE (op2
) == NEG
)
12767 op2
= XEXP (op2
, 0);
12769 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
12770 and the by-element operand as operand 0. */
12771 if (GET_CODE (op1
) == NEG
)
12772 op1
= XEXP (op1
, 0);
12774 /* Catch vector-by-element operations. The by-element operand can
12775 either be (vec_duplicate (vec_select (x))) or just
12776 (vec_select (x)), depending on whether we are multiplying by
12777 a vector or a scalar.
12779 Canonicalization is not very good in these cases, FMA4 will put the
12780 by-element operand as operand 0, FNMA4 will have it as operand 1. */
12781 if (GET_CODE (op0
) == VEC_DUPLICATE
)
12782 op0
= XEXP (op0
, 0);
12783 else if (GET_CODE (op1
) == VEC_DUPLICATE
)
12784 op1
= XEXP (op1
, 0);
12786 if (GET_CODE (op0
) == VEC_SELECT
)
12787 op0
= XEXP (op0
, 0);
12788 else if (GET_CODE (op1
) == VEC_SELECT
)
12789 op1
= XEXP (op1
, 0);
12791 /* If the remaining parameters are not registers,
12792 get the cost to put them into registers. */
12793 *cost
+= rtx_cost (op0
, mode
, FMA
, 0, speed
);
12794 *cost
+= rtx_cost (op1
, mode
, FMA
, 1, speed
);
12795 *cost
+= rtx_cost (op2
, mode
, FMA
, 2, speed
);
    case UNSIGNED_FLOAT:
        *cost += extra_cost->fp[mode == DFmode].fromint;

    case FLOAT_EXTEND:
      if (VECTOR_MODE_P (mode))
        {
          /* Vector truncate.  */
          *cost += extra_cost->vect.alu;
        }
      else
        *cost += extra_cost->fp[mode == DFmode].widen;

    case FLOAT_TRUNCATE:
      if (VECTOR_MODE_P (mode))
        {
          /* Vector conversion.  */
          *cost += extra_cost->vect.alu;
        }
      else
        *cost += extra_cost->fp[mode == DFmode].narrow;
12833 /* Strip the rounding part. They will all be implemented
12834 by the fcvt* family of instructions anyway. */
12835 if (GET_CODE (x
) == UNSPEC
)
12837 unsigned int uns_code
= XINT (x
, 1);
12839 if (uns_code
== UNSPEC_FRINTA
12840 || uns_code
== UNSPEC_FRINTM
12841 || uns_code
== UNSPEC_FRINTN
12842 || uns_code
== UNSPEC_FRINTP
12843 || uns_code
== UNSPEC_FRINTZ
)
12844 x
= XVECEXP (x
, 0, 0);
12849 if (VECTOR_MODE_P (mode
))
12850 *cost
+= extra_cost
->vect
.alu
;
12852 *cost
+= extra_cost
->fp
[GET_MODE (x
) == DFmode
].toint
;
12855 /* We can combine fmul by a power of 2 followed by a fcvt into a single
12856 fixed-point fcvt. */
12857 if (GET_CODE (x
) == MULT
12858 && ((VECTOR_MODE_P (mode
)
12859 && aarch64_vec_fpconst_pow_of_2 (XEXP (x
, 1)) > 0)
12860 || aarch64_fpconst_pow_of_2 (XEXP (x
, 1)) > 0))
12862 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (rtx_code
) code
,
12867 *cost
+= rtx_cost (x
, VOIDmode
, (enum rtx_code
) code
, 0, speed
);
12871 if (VECTOR_MODE_P (mode
))
12873 /* ABS (vector). */
12875 *cost
+= extra_cost
->vect
.alu
;
12877 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
12881 /* FABD, which is analogous to FADD. */
12882 if (GET_CODE (op0
) == MINUS
)
12884 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, MINUS
, 0, speed
);
12885 *cost
+= rtx_cost (XEXP (op0
, 1), mode
, MINUS
, 1, speed
);
12887 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
12891 /* Simple FABS is analogous to FNEG. */
12893 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
12897 /* Integer ABS will either be split to
12898 two arithmetic instructions, or will be an ABS
12899 (scalar), which we don't model. */
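          /* For illustration (register choice is arbitrary), the two-insn
             split for an integer ABS is typically of the form

                 cmp   w0, 0
                 csneg w0, w0, w0, ge

             which is what the COSTS_N_INSNS (2) below accounts for.  */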
12900 *cost
= COSTS_N_INSNS (2);
12902 *cost
+= 2 * extra_cost
->alu
.arith
;
12910 if (VECTOR_MODE_P (mode
))
12911 *cost
+= extra_cost
->vect
.alu
;
12914 /* FMAXNM/FMINNM/FMAX/FMIN.
12915 TODO: This may not be accurate for all implementations, but
12916 we do not model this in the cost tables. */
12917 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
12923 /* The floating point round to integer frint* instructions. */
12924 if (aarch64_frint_unspec_p (XINT (x
, 1)))
12927 *cost
+= extra_cost
->fp
[mode
== DFmode
].roundint
;
12932 if (XINT (x
, 1) == UNSPEC_RBIT
)
12935 *cost
+= extra_cost
->alu
.rev
;
12943 /* Decompose <su>muldi3_highpart. */
12944 if (/* (truncate:DI */
12947 && GET_MODE (XEXP (x
, 0)) == TImode
12948 && GET_CODE (XEXP (x
, 0)) == LSHIFTRT
12950 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == MULT
12951 /* (ANY_EXTEND:TI (reg:DI))
12952 (ANY_EXTEND:TI (reg:DI))) */
12953 && ((GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == ZERO_EXTEND
12954 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == ZERO_EXTEND
)
12955 || (GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == SIGN_EXTEND
12956 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == SIGN_EXTEND
))
12957 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0)) == DImode
12958 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0)) == DImode
12959 /* (const_int 64) */
12960 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
12961 && UINTVAL (XEXP (XEXP (x
, 0), 1)) == 64)
12965 *cost
+= extra_cost
->mult
[mode
== DImode
].extend
;
12966 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0),
12967 mode
, MULT
, 0, speed
);
12968 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0),
12969 mode
, MULT
, 1, speed
);
12973 /* Fall through. */
12979 && flag_aarch64_verbose_cost
)
12980 fprintf (dump_file
,
12981 "\nFailed to cost RTX. Assuming default cost.\n");
12986 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
12987 calculated for X. This cost is stored in *COST. Returns true
12988 if the total cost of X was calculated. */
12990 aarch64_rtx_costs_wrapper (rtx x
, machine_mode mode
, int outer
,
12991 int param
, int *cost
, bool speed
)
12993 bool result
= aarch64_rtx_costs (x
, mode
, outer
, param
, cost
, speed
);
12996 && flag_aarch64_verbose_cost
)
12998 print_rtl_single (dump_file
, x
);
12999 fprintf (dump_file
, "\n%s cost: %d (%s)\n",
13000 speed
? "Hot" : "Cold",
13001 *cost
, result
? "final" : "partial");
static int
aarch64_register_move_cost (machine_mode mode,
                            reg_class_t from_i, reg_class_t to_i)
{
  enum reg_class from = (enum reg_class) from_i;
  enum reg_class to = (enum reg_class) to_i;
  const struct cpu_regmove_cost *regmove_cost
    = aarch64_tune_params.regmove_cost;

  /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
  if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
    to = GENERAL_REGS;

  if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
    from = GENERAL_REGS;

  /* Make RDFFR very expensive.  In particular, if we know that the FFR
     contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
     as a way of obtaining a PTRUE.  */
  if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
      && hard_reg_set_subset_p (reg_class_contents[from_i],
                                reg_class_contents[FFR_REGS]))

  /* Moving between GPR and stack cost is the same as GP2GP.  */
  if ((from == GENERAL_REGS && to == STACK_REG)
      || (to == GENERAL_REGS && from == STACK_REG))
    return regmove_cost->GP2GP;

  /* To/From the stack register, we move via the gprs.  */
  if (to == STACK_REG || from == STACK_REG)
    return aarch64_register_move_cost (mode, from, GENERAL_REGS)
            + aarch64_register_move_cost (mode, GENERAL_REGS, to);

  if (known_eq (GET_MODE_SIZE (mode), 16))
    {
      /* 128-bit operations on general registers require 2 instructions.  */
      if (from == GENERAL_REGS && to == GENERAL_REGS)
        return regmove_cost->GP2GP * 2;
      else if (from == GENERAL_REGS)
        return regmove_cost->GP2FP * 2;
      else if (to == GENERAL_REGS)
        return regmove_cost->FP2GP * 2;

      /* When AdvSIMD instructions are disabled it is not possible to move
         a 128-bit value directly between Q registers.  This is handled in
         secondary reload.  A general register is used as a scratch to move
         the upper DI value and the lower DI value is moved directly,
         hence the cost is the sum of three moves.  */
      if (!TARGET_SIMD)
        return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;

      return regmove_cost->FP2FP;
    }

  if (from == GENERAL_REGS && to == GENERAL_REGS)
    return regmove_cost->GP2GP;
  else if (from == GENERAL_REGS)
    return regmove_cost->GP2FP;
  else if (to == GENERAL_REGS)
    return regmove_cost->FP2GP;

  return regmove_cost->FP2FP;
}
aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
                          reg_class_t rclass ATTRIBUTE_UNUSED,
                          bool in ATTRIBUTE_UNUSED)
{
  return aarch64_tune_params.memmov_cost;
}
/* Implement TARGET_INIT_BUILTINS.  */
static void
aarch64_init_builtins ()
{
  aarch64_general_init_builtins ();
  aarch64_sve::init_builtins ();
}

/* Implement TARGET_FOLD_BUILTIN.  */
static tree
aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
{
  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  tree type = TREE_TYPE (TREE_TYPE (fndecl));
  switch (code & AARCH64_BUILTIN_CLASS)
    {
    case AARCH64_BUILTIN_GENERAL:
      return aarch64_general_fold_builtin (subcode, type, nargs, args);

    case AARCH64_BUILTIN_SVE:

    }
  gcc_unreachable ();
}
13106 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
13108 aarch64_gimple_fold_builtin (gimple_stmt_iterator
*gsi
)
13110 gcall
*stmt
= as_a
<gcall
*> (gsi_stmt (*gsi
));
13111 tree fndecl
= gimple_call_fndecl (stmt
);
13112 unsigned int code
= DECL_MD_FUNCTION_CODE (fndecl
);
13113 unsigned int subcode
= code
>> AARCH64_BUILTIN_SHIFT
;
13114 gimple
*new_stmt
= NULL
;
13115 switch (code
& AARCH64_BUILTIN_CLASS
)
13117 case AARCH64_BUILTIN_GENERAL
:
13118 new_stmt
= aarch64_general_gimple_fold_builtin (subcode
, stmt
);
13121 case AARCH64_BUILTIN_SVE
:
13122 new_stmt
= aarch64_sve::gimple_fold_builtin (subcode
, gsi
, stmt
);
13129 gsi_replace (gsi
, new_stmt
, true);
13133 /* Implement TARGET_EXPAND_BUILTIN. */
13135 aarch64_expand_builtin (tree exp
, rtx target
, rtx
, machine_mode
, int ignore
)
13137 tree fndecl
= TREE_OPERAND (CALL_EXPR_FN (exp
), 0);
13138 unsigned int code
= DECL_MD_FUNCTION_CODE (fndecl
);
13139 unsigned int subcode
= code
>> AARCH64_BUILTIN_SHIFT
;
13140 switch (code
& AARCH64_BUILTIN_CLASS
)
13142 case AARCH64_BUILTIN_GENERAL
:
13143 return aarch64_general_expand_builtin (subcode
, exp
, target
, ignore
);
13145 case AARCH64_BUILTIN_SVE
:
13146 return aarch64_sve::expand_builtin (subcode
, exp
, target
);
13148 gcc_unreachable ();
13151 /* Implement TARGET_BUILTIN_DECL. */
13153 aarch64_builtin_decl (unsigned int code
, bool initialize_p
)
13155 unsigned int subcode
= code
>> AARCH64_BUILTIN_SHIFT
;
13156 switch (code
& AARCH64_BUILTIN_CLASS
)
13158 case AARCH64_BUILTIN_GENERAL
:
13159 return aarch64_general_builtin_decl (subcode
, initialize_p
);
13161 case AARCH64_BUILTIN_SVE
:
13162 return aarch64_sve::builtin_decl (subcode
, initialize_p
);
13164 gcc_unreachable ();
/* Return true if it is safe and beneficial to use the approximate rsqrt optabs
   to optimize 1.0/sqrt.  */

static bool
use_rsqrt_p (machine_mode mode)
{
  return (!flag_trapping_math
          && flag_unsafe_math_optimizations
          && ((aarch64_tune_params.approx_modes->recip_sqrt
               & AARCH64_APPROX_MODE (mode))
              || flag_mrecip_low_precision_sqrt));
}
13180 /* Function to decide when to use the approximate reciprocal square root
13184 aarch64_builtin_reciprocal (tree fndecl
)
13186 machine_mode mode
= TYPE_MODE (TREE_TYPE (fndecl
));
13188 if (!use_rsqrt_p (mode
))
13190 unsigned int code
= DECL_MD_FUNCTION_CODE (fndecl
);
13191 unsigned int subcode
= code
>> AARCH64_BUILTIN_SHIFT
;
13192 switch (code
& AARCH64_BUILTIN_CLASS
)
13194 case AARCH64_BUILTIN_GENERAL
:
13195 return aarch64_general_builtin_rsqrt (subcode
);
13197 case AARCH64_BUILTIN_SVE
:
13200 gcc_unreachable ();
/* Emit code to perform the floating-point operation:

     DST = SRC1 * SRC2

   where all three operands are already known to be registers.
   If the operation is an SVE one, PTRUE is a suitable all-true
   predicate.  */

static void
aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
{
  if (ptrue)
    emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
                                 dst, ptrue, src1, src2,
                                 gen_int_mode (SVE_RELAXED_GP, SImode)));
  else
    emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
}
/* Emit instruction sequence to compute either the approximate square root
   or its approximate reciprocal, depending on the flag RECP, and return
   whether the sequence was emitted or not.  */
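/* Rough sketch of the math in the loop below: FRSQRTE produces an initial
   estimate x0 of 1/sqrt(d), and each FRSQRTS step performs one
   Newton-Raphson refinement

       x_{n+1} = x_n * (3 - d * x_n * x_n) / 2

   where the x_n * x_n product is formed by aarch64_emit_mult and FRSQRTS
   itself computes (3 - d * e) / 2.  Two refinement steps are used for
   single precision and three for double.  */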
13227 aarch64_emit_approx_sqrt (rtx dst
, rtx src
, bool recp
)
13229 machine_mode mode
= GET_MODE (dst
);
13231 if (GET_MODE_INNER (mode
) == HFmode
)
13233 gcc_assert (!recp
);
13239 if (!(flag_mlow_precision_sqrt
13240 || (aarch64_tune_params
.approx_modes
->sqrt
13241 & AARCH64_APPROX_MODE (mode
))))
13244 if (!flag_finite_math_only
13245 || flag_trapping_math
13246 || !flag_unsafe_math_optimizations
13247 || optimize_function_for_size_p (cfun
))
13251 /* Caller assumes we cannot fail. */
13252 gcc_assert (use_rsqrt_p (mode
));
13255 if (aarch64_sve_mode_p (mode
))
13256 pg
= aarch64_ptrue_reg (aarch64_sve_pred_mode (mode
));
13257 machine_mode mmsk
= (VECTOR_MODE_P (mode
)
13258 ? related_int_vector_mode (mode
).require ()
13259 : int_mode_for_mode (mode
).require ());
13260 rtx xmsk
= NULL_RTX
;
13263 /* When calculating the approximate square root, compare the
13264 argument with 0.0 and create a mask. */
13265 rtx zero
= CONST0_RTX (mode
);
13268 xmsk
= gen_reg_rtx (GET_MODE (pg
));
13269 rtx hint
= gen_int_mode (SVE_KNOWN_PTRUE
, SImode
);
13270 emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE
, mode
,
13271 xmsk
, pg
, hint
, src
, zero
));
13275 xmsk
= gen_reg_rtx (mmsk
);
13276 emit_insn (gen_rtx_SET (xmsk
,
13278 gen_rtx_EQ (mmsk
, src
, zero
))));
13282 /* Estimate the approximate reciprocal square root. */
13283 rtx xdst
= gen_reg_rtx (mode
);
13284 emit_insn (gen_aarch64_rsqrte (mode
, xdst
, src
));
13286 /* Iterate over the series twice for SF and thrice for DF. */
13287 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
13289 /* Optionally iterate over the series once less for faster performance
13290 while sacrificing the accuracy. */
13291 if ((recp
&& flag_mrecip_low_precision_sqrt
)
13292 || (!recp
&& flag_mlow_precision_sqrt
))
13295 /* Iterate over the series to calculate the approximate reciprocal square
13297 rtx x1
= gen_reg_rtx (mode
);
13298 while (iterations
--)
13300 rtx x2
= gen_reg_rtx (mode
);
13301 aarch64_emit_mult (x2
, pg
, xdst
, xdst
);
13303 emit_insn (gen_aarch64_rsqrts (mode
, x1
, src
, x2
));
13305 if (iterations
> 0)
13306 aarch64_emit_mult (xdst
, pg
, xdst
, x1
);
13312 /* Multiply nonzero source values by the corresponding intermediate
13313 result elements, so that the final calculation is the approximate
13314 square root rather than its reciprocal. Select a zero result for
13315 zero source values, to avoid the Inf * 0 -> NaN that we'd get
13317 emit_insn (gen_cond (UNSPEC_COND_FMUL
, mode
,
13318 xdst
, xmsk
, xdst
, src
, CONST0_RTX (mode
)));
13321 /* Qualify the approximate reciprocal square root when the
13322 argument is 0.0 by squashing the intermediary result to 0.0. */
13323 rtx xtmp
= gen_reg_rtx (mmsk
);
13324 emit_set_insn (xtmp
, gen_rtx_AND (mmsk
, gen_rtx_NOT (mmsk
, xmsk
),
13325 gen_rtx_SUBREG (mmsk
, xdst
, 0)));
13326 emit_move_insn (xdst
, gen_rtx_SUBREG (mode
, xtmp
, 0));
13328 /* Calculate the approximate square root. */
13329 aarch64_emit_mult (xdst
, pg
, xdst
, src
);
13333 /* Finalize the approximation. */
13334 aarch64_emit_mult (dst
, pg
, xdst
, x1
);
/* Emit the instruction sequence to compute the approximation for the division
   of NUM by DEN in QUO and return whether the sequence was emitted or not.  */
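/* Rough sketch of the math used below: FRECPE yields an initial estimate
   x0 of 1/DEN, and each FRECPS step is one Newton-Raphson refinement

       x_{n+1} = x_n * (2 - DEN * x_n)

   since FRECPS computes (2 - a * b).  The quotient is then formed by
   multiplying the refined estimate by NUM (folded into the final
   refinement multiply), with the multiply by NUM skipped when NUM
   is 1.0.  */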
13343 aarch64_emit_approx_div (rtx quo
, rtx num
, rtx den
)
13345 machine_mode mode
= GET_MODE (quo
);
13347 if (GET_MODE_INNER (mode
) == HFmode
)
13350 bool use_approx_division_p
= (flag_mlow_precision_div
13351 || (aarch64_tune_params
.approx_modes
->division
13352 & AARCH64_APPROX_MODE (mode
)));
13354 if (!flag_finite_math_only
13355 || flag_trapping_math
13356 || !flag_unsafe_math_optimizations
13357 || optimize_function_for_size_p (cfun
)
13358 || !use_approx_division_p
)
13361 if (!TARGET_SIMD
&& VECTOR_MODE_P (mode
))
13365 if (aarch64_sve_mode_p (mode
))
13366 pg
= aarch64_ptrue_reg (aarch64_sve_pred_mode (mode
));
13368 /* Estimate the approximate reciprocal. */
13369 rtx xrcp
= gen_reg_rtx (mode
);
13370 emit_insn (gen_aarch64_frecpe (mode
, xrcp
, den
));
13372 /* Iterate over the series twice for SF and thrice for DF. */
13373 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
13375 /* Optionally iterate over the series less for faster performance,
13376 while sacrificing the accuracy. The default is 2 for DF and 1 for SF. */
13377 if (flag_mlow_precision_div
)
13378 iterations
= (GET_MODE_INNER (mode
) == DFmode
13379 ? aarch64_double_recp_precision
13380 : aarch64_float_recp_precision
);
13382 /* Iterate over the series to calculate the approximate reciprocal. */
13383 rtx xtmp
= gen_reg_rtx (mode
);
13384 while (iterations
--)
13386 emit_insn (gen_aarch64_frecps (mode
, xtmp
, xrcp
, den
));
13388 if (iterations
> 0)
13389 aarch64_emit_mult (xrcp
, pg
, xrcp
, xtmp
);
13392 if (num
!= CONST1_RTX (mode
))
13394 /* As the approximate reciprocal of DEN is already calculated, only
13395 calculate the approximate division when NUM is not 1.0. */
13396 rtx xnum
= force_reg (mode
, num
);
13397 aarch64_emit_mult (xrcp
, pg
, xrcp
, xnum
);
13400 /* Finalize the approximation. */
13401 aarch64_emit_mult (quo
, pg
, xrcp
, xtmp
);
13405 /* Return the number of instructions that can be issued per cycle. */
13407 aarch64_sched_issue_rate (void)
13409 return aarch64_tune_params
.issue_rate
;
13412 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
13414 aarch64_sched_variable_issue (FILE *, int, rtx_insn
*insn
, int more
)
13416 if (DEBUG_INSN_P (insn
))
13419 rtx_code code
= GET_CODE (PATTERN (insn
));
13420 if (code
== USE
|| code
== CLOBBER
)
13423 if (get_attr_type (insn
) == TYPE_NO_INSN
)
13430 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
13432 int issue_rate
= aarch64_sched_issue_rate ();
13434 return issue_rate
> 1 && !sched_fusion
? issue_rate
: 0;
13438 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
13439 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
13440 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
13443 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn
*insn
,
13446 return autopref_multipass_dfa_lookahead_guard (insn
, ready_index
);
13450 /* Vectorizer cost model target hooks. */
13452 /* Implement targetm.vectorize.builtin_vectorization_cost. */
13454 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost
,
13456 int misalign ATTRIBUTE_UNUSED
)
13459 const cpu_vector_cost
*costs
= aarch64_tune_params
.vec_costs
;
13462 if (vectype
!= NULL
)
13463 fp
= FLOAT_TYPE_P (vectype
);
13465 switch (type_of_cost
)
13468 return fp
? costs
->scalar_fp_stmt_cost
: costs
->scalar_int_stmt_cost
;
13471 return costs
->scalar_load_cost
;
13474 return costs
->scalar_store_cost
;
13477 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
13480 return costs
->vec_align_load_cost
;
13483 return costs
->vec_store_cost
;
13485 case vec_to_scalar
:
13486 return costs
->vec_to_scalar_cost
;
13488 case scalar_to_vec
:
13489 return costs
->scalar_to_vec_cost
;
13491 case unaligned_load
:
13492 case vector_gather_load
:
13493 return costs
->vec_unalign_load_cost
;
13495 case unaligned_store
:
13496 case vector_scatter_store
:
13497 return costs
->vec_unalign_store_cost
;
13499 case cond_branch_taken
:
13500 return costs
->cond_taken_branch_cost
;
13502 case cond_branch_not_taken
:
13503 return costs
->cond_not_taken_branch_cost
;
13506 return costs
->vec_permute_cost
;
13508 case vec_promote_demote
:
13509 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
13511 case vec_construct
:
13512 elements
= estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype
));
13513 return elements
/ 2 + 1;
13516 gcc_unreachable ();
/* Return true if STMT_INFO extends the result of a load.  */
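/* For example (types chosen purely for illustration), in a pair of
   statements such as

       short_var = *ptr;
       int_var = (int) short_var;

   the conversion statement counts as extending the result of a load,
   provided the first statement is a data reference.  */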
static bool
aarch64_extending_load_p (stmt_vec_info stmt_info)
{
  gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
  if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
    return false;

  tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
  tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
  tree rhs_type = TREE_TYPE (rhs);
  if (!INTEGRAL_TYPE_P (lhs_type)
      || !INTEGRAL_TYPE_P (rhs_type)
      || TYPE_PRECISION (lhs_type) <= TYPE_PRECISION (rhs_type))
    return false;

  stmt_vec_info def_stmt_info = stmt_info->vinfo->lookup_def (rhs);
  return (def_stmt_info
          && STMT_VINFO_DATA_REF (def_stmt_info)
          && DR_IS_READ (STMT_VINFO_DATA_REF (def_stmt_info)));
}
13542 /* Return true if STMT_INFO is an integer truncation. */
13544 aarch64_integer_truncation_p (stmt_vec_info stmt_info
)
13546 gassign
*assign
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
13547 if (!assign
|| !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign
)))
13550 tree lhs_type
= TREE_TYPE (gimple_assign_lhs (assign
));
13551 tree rhs_type
= TREE_TYPE (gimple_assign_rhs1 (assign
));
13552 return (INTEGRAL_TYPE_P (lhs_type
)
13553 && INTEGRAL_TYPE_P (rhs_type
)
13554 && TYPE_PRECISION (lhs_type
) < TYPE_PRECISION (rhs_type
));
13557 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
13558 for STMT_INFO, which has cost kind KIND. Adjust the cost as necessary
13559 for SVE targets. */
13560 static unsigned int
13561 aarch64_sve_adjust_stmt_cost (vect_cost_for_stmt kind
, stmt_vec_info stmt_info
,
13562 unsigned int stmt_cost
)
13564 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
13565 vector register size or number of units. Integer promotions of this
13566 type therefore map to SXT[BHW] or UXT[BHW].
13568 Most loads have extending forms that can do the sign or zero extension
13569 on the fly. Optimistically assume that a load followed by an extension
13570 will fold to this form during combine, and that the extension therefore
13572 if (kind
== vector_stmt
&& aarch64_extending_load_p (stmt_info
))
13575 /* For similar reasons, vector_stmt integer truncations are a no-op,
13576 because we can just ignore the unused upper bits of the source. */
13577 if (kind
== vector_stmt
&& aarch64_integer_truncation_p (stmt_info
))
13583 /* Implement targetm.vectorize.add_stmt_cost. */
13585 aarch64_add_stmt_cost (void *data
, int count
, enum vect_cost_for_stmt kind
,
13586 struct _stmt_vec_info
*stmt_info
, int misalign
,
13587 enum vect_cost_model_location where
)
13589 unsigned *cost
= (unsigned *) data
;
13590 unsigned retval
= 0;
13592 if (flag_vect_cost_model
)
13594 tree vectype
= stmt_info
? stmt_vectype (stmt_info
) : NULL_TREE
;
13596 aarch64_builtin_vectorization_cost (kind
, vectype
, misalign
);
13598 if (stmt_info
&& vectype
&& aarch64_sve_mode_p (TYPE_MODE (vectype
)))
13599 stmt_cost
= aarch64_sve_adjust_stmt_cost (kind
, stmt_info
, stmt_cost
);
13601 /* Statements in an inner loop relative to the loop being
13602 vectorized are weighted more heavily. The value here is
13603 arbitrary and could potentially be improved with analysis. */
13604 if (where
== vect_body
&& stmt_info
&& stmt_in_inner_loop_p (stmt_info
))
13605 count
*= 50; /* FIXME */
13607 retval
= (unsigned) (count
* stmt_cost
);
13608 cost
[where
] += retval
;
13614 static void initialize_aarch64_code_model (struct gcc_options
*);
13616 /* Parse the TO_PARSE string and put the architecture struct that it
13617 selects into RES and the architectural features into ISA_FLAGS.
13618 Return an aarch64_parse_opt_result describing the parse result.
13619 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
13620 When the TO_PARSE string contains an invalid extension,
13621 a copy of the string is created and stored to INVALID_EXTENSION. */
13623 static enum aarch64_parse_opt_result
13624 aarch64_parse_arch (const char *to_parse
, const struct processor
**res
,
13625 uint64_t *isa_flags
, std::string
*invalid_extension
)
13628 const struct processor
*arch
;
13631 ext
= strchr (to_parse
, '+');
13634 len
= ext
- to_parse
;
13636 len
= strlen (to_parse
);
13639 return AARCH64_PARSE_MISSING_ARG
;
13642 /* Loop through the list of supported ARCHes to find a match. */
13643 for (arch
= all_architectures
; arch
->name
!= NULL
; arch
++)
13645 if (strlen (arch
->name
) == len
13646 && strncmp (arch
->name
, to_parse
, len
) == 0)
13648 uint64_t isa_temp
= arch
->flags
;
13652 /* TO_PARSE string contains at least one extension. */
13653 enum aarch64_parse_opt_result ext_res
13654 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
13656 if (ext_res
!= AARCH64_PARSE_OK
)
13659 /* Extension parsing was successful. Confirm the result
13660 arch and ISA flags. */
13662 *isa_flags
= isa_temp
;
13663 return AARCH64_PARSE_OK
;
13667 /* ARCH name not found in list. */
13668 return AARCH64_PARSE_INVALID_ARG
;
13671 /* Parse the TO_PARSE string and put the result tuning in RES and the
13672 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
13673 describing the parse result. If there is an error parsing, RES and
13674 ISA_FLAGS are left unchanged.
13675 When the TO_PARSE string contains an invalid extension,
13676 a copy of the string is created and stored to INVALID_EXTENSION. */
13678 static enum aarch64_parse_opt_result
13679 aarch64_parse_cpu (const char *to_parse
, const struct processor
**res
,
13680 uint64_t *isa_flags
, std::string
*invalid_extension
)
13683 const struct processor
*cpu
;
13686 ext
= strchr (to_parse
, '+');
13689 len
= ext
- to_parse
;
13691 len
= strlen (to_parse
);
13694 return AARCH64_PARSE_MISSING_ARG
;
13697 /* Loop through the list of supported CPUs to find a match. */
13698 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
13700 if (strlen (cpu
->name
) == len
&& strncmp (cpu
->name
, to_parse
, len
) == 0)
13702 uint64_t isa_temp
= cpu
->flags
;
13707 /* TO_PARSE string contains at least one extension. */
13708 enum aarch64_parse_opt_result ext_res
13709 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
13711 if (ext_res
!= AARCH64_PARSE_OK
)
          /* Extension parsing was successful.  Confirm the result
             cpu and ISA flags.  */
13717 *isa_flags
= isa_temp
;
13718 return AARCH64_PARSE_OK
;
13722 /* CPU name not found in list. */
13723 return AARCH64_PARSE_INVALID_ARG
;
13726 /* Parse the TO_PARSE string and put the cpu it selects into RES.
13727 Return an aarch64_parse_opt_result describing the parse result.
13728 If the parsing fails the RES does not change. */
13730 static enum aarch64_parse_opt_result
13731 aarch64_parse_tune (const char *to_parse
, const struct processor
**res
)
13733 const struct processor
*cpu
;
13735 /* Loop through the list of supported CPUs to find a match. */
13736 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
13738 if (strcmp (cpu
->name
, to_parse
) == 0)
13741 return AARCH64_PARSE_OK
;
13745 /* CPU name not found in list. */
13746 return AARCH64_PARSE_INVALID_ARG
;
13749 /* Parse TOKEN, which has length LENGTH to see if it is an option
13750 described in FLAG. If it is, return the index bit for that fusion type.
13751 If not, error (printing OPTION_NAME) and return zero. */
13753 static unsigned int
13754 aarch64_parse_one_option_token (const char *token
,
13756 const struct aarch64_flag_desc
*flag
,
13757 const char *option_name
)
13759 for (; flag
->name
!= NULL
; flag
++)
13761 if (length
== strlen (flag
->name
)
13762 && !strncmp (flag
->name
, token
, length
))
13766 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name
, token
);
13770 /* Parse OPTION which is a comma-separated list of flags to enable.
13771 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
13772 default state we inherit from the CPU tuning structures. OPTION_NAME
13773 gives the top-level option we are parsing in the -moverride string,
13774 for use in error messages. */
13776 static unsigned int
13777 aarch64_parse_boolean_options (const char *option
,
13778 const struct aarch64_flag_desc
*flags
,
13779 unsigned int initial_state
,
13780 const char *option_name
)
13782 const char separator
= '.';
13783 const char* specs
= option
;
13784 const char* ntoken
= option
;
13785 unsigned int found_flags
= initial_state
;
13787 while ((ntoken
= strchr (specs
, separator
)))
13789 size_t token_length
= ntoken
- specs
;
13790 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
13794 /* If we find "none" (or, for simplicity's sake, an error) anywhere
13795 in the token stream, reset the supported operations. So:
13797 adrp+add.cmp+branch.none.adrp+add
13799 would have the result of turning on only adrp+add fusion. */
13803 found_flags
|= token_ops
;
13807 /* We ended with a comma, print something. */
13810 error ("%s string ill-formed\n", option_name
);
13814 /* We still have one more token to parse. */
13815 size_t token_length
= strlen (specs
);
13816 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
13823 found_flags
|= token_ops
;
13824 return found_flags
;
13827 /* Support for overriding instruction fusion. */
13830 aarch64_parse_fuse_string (const char *fuse_string
,
13831 struct tune_params
*tune
)
13833 tune
->fusible_ops
= aarch64_parse_boolean_options (fuse_string
,
13834 aarch64_fusible_pairs
,
13839 /* Support for overriding other tuning flags. */
13842 aarch64_parse_tune_string (const char *tune_string
,
13843 struct tune_params
*tune
)
13845 tune
->extra_tuning_flags
13846 = aarch64_parse_boolean_options (tune_string
,
13847 aarch64_tuning_flags
,
13848 tune
->extra_tuning_flags
,
13852 /* Parse the sve_width tuning moverride string in TUNE_STRING.
13853 Accept the valid SVE vector widths allowed by
13854 aarch64_sve_vector_bits_enum and use it to override sve_width
13858 aarch64_parse_sve_width_string (const char *tune_string
,
13859 struct tune_params
*tune
)
13863 int n
= sscanf (tune_string
, "%d", &width
);
13866 error ("invalid format for sve_width");
13878 error ("invalid sve_width value: %d", width
);
13880 tune
->sve_width
= (enum aarch64_sve_vector_bits_enum
) width
;
13883 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
13884 we understand. If it is, extract the option string and handoff to
13885 the appropriate function. */
13888 aarch64_parse_one_override_token (const char* token
,
13890 struct tune_params
*tune
)
13892 const struct aarch64_tuning_override_function
*fn
13893 = aarch64_tuning_override_functions
;
13895 const char *option_part
= strchr (token
, '=');
13898 error ("tuning string missing in option (%s)", token
);
13902 /* Get the length of the option name. */
13903 length
= option_part
- token
;
13904 /* Skip the '=' to get to the option string. */
13907 for (; fn
->name
!= NULL
; fn
++)
13909 if (!strncmp (fn
->name
, token
, length
))
13911 fn
->parse_override (option_part
, tune
);
13916 error ("unknown tuning option (%s)",token
);
13920 /* A checking mechanism for the implementation of the tls size. */
13923 initialize_aarch64_tls_size (struct gcc_options
*opts
)
13925 if (aarch64_tls_size
== 0)
13926 aarch64_tls_size
= 24;
13928 switch (opts
->x_aarch64_cmodel_var
)
13930 case AARCH64_CMODEL_TINY
:
13931 /* Both the default and maximum TLS size allowed under tiny is 1M which
13932 needs two instructions to address, so we clamp the size to 24. */
13933 if (aarch64_tls_size
> 24)
13934 aarch64_tls_size
= 24;
13936 case AARCH64_CMODEL_SMALL
:
13937 /* The maximum TLS size allowed under small is 4G. */
13938 if (aarch64_tls_size
> 32)
13939 aarch64_tls_size
= 32;
13941 case AARCH64_CMODEL_LARGE
:
13942 /* The maximum TLS size allowed under large is 16E.
13943 FIXME: 16E should be 64bit, we only support 48bit offset now. */
13944 if (aarch64_tls_size
> 48)
13945 aarch64_tls_size
= 48;
13948 gcc_unreachable ();
/* Parse STRING looking for options in the format:
     string	:: option:string
     option	:: name=substring
     substring	:: defined by option.  */
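/* As an illustration (the particular values are arbitrary, but the option
   names match the override tables above), a valid override string would be

     -moverride=fuse=adrp+add.cmp+branch:sve_width=256

   which enables two fusion pairs and forces the assumed SVE vector
   width.  */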
13961 aarch64_parse_override_string (const char* input_string
,
13962 struct tune_params
* tune
)
13964 const char separator
= ':';
13965 size_t string_length
= strlen (input_string
) + 1;
13966 char *string_root
= (char *) xmalloc (sizeof (*string_root
) * string_length
);
13967 char *string
= string_root
;
13968 strncpy (string
, input_string
, string_length
);
13969 string
[string_length
- 1] = '\0';
13971 char* ntoken
= string
;
13973 while ((ntoken
= strchr (string
, separator
)))
13975 size_t token_length
= ntoken
- string
;
13976 /* Make this substring look like a string. */
13978 aarch64_parse_one_override_token (string
, token_length
, tune
);
13982 /* One last option to parse. */
13983 aarch64_parse_one_override_token (string
, strlen (string
), tune
);
13984 free (string_root
);
13989 aarch64_override_options_after_change_1 (struct gcc_options
*opts
)
13991 if (accepted_branch_protection_string
)
13993 opts
->x_aarch64_branch_protection_string
13994 = xstrdup (accepted_branch_protection_string
);
13997 /* PR 70044: We have to be careful about being called multiple times for the
13998 same function. This means all changes should be repeatable. */
14000 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
14001 Disable the frame pointer flag so the mid-end will not use a frame
14002 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
14003 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
14004 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
14005 aarch64_use_frame_pointer
= opts
->x_flag_omit_frame_pointer
!= 1;
14006 if (opts
->x_flag_omit_frame_pointer
== 0)
14007 opts
->x_flag_omit_frame_pointer
= 2;
14009 /* If not optimizing for size, set the default
14010 alignment to what the target wants. */
14011 if (!opts
->x_optimize_size
)
14013 if (opts
->x_flag_align_loops
&& !opts
->x_str_align_loops
)
14014 opts
->x_str_align_loops
= aarch64_tune_params
.loop_align
;
14015 if (opts
->x_flag_align_jumps
&& !opts
->x_str_align_jumps
)
14016 opts
->x_str_align_jumps
= aarch64_tune_params
.jump_align
;
14017 if (opts
->x_flag_align_functions
&& !opts
->x_str_align_functions
)
14018 opts
->x_str_align_functions
= aarch64_tune_params
.function_align
;
14021 /* We default to no pc-relative literal loads. */
14023 aarch64_pcrelative_literal_loads
= false;
14025 /* If -mpc-relative-literal-loads is set on the command line, this
14026 implies that the user asked for PC relative literal loads. */
14027 if (opts
->x_pcrelative_literal_loads
== 1)
14028 aarch64_pcrelative_literal_loads
= true;
14030 /* In the tiny memory model it makes no sense to disallow PC relative
14031 literal pool loads. */
14032 if (aarch64_cmodel
== AARCH64_CMODEL_TINY
14033 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
14034 aarch64_pcrelative_literal_loads
= true;
14036 /* When enabling the lower precision Newton series for the square root, also
14037 enable it for the reciprocal square root, since the latter is an
14038 intermediary step for the former. */
14039 if (flag_mlow_precision_sqrt
)
14040 flag_mrecip_low_precision_sqrt
= true;
14043 /* 'Unpack' up the internal tuning structs and update the options
14044 in OPTS. The caller must have set up selected_tune and selected_arch
14045 as all the other target-specific codegen decisions are
14046 derived from them. */
14049 aarch64_override_options_internal (struct gcc_options
*opts
)
14051 aarch64_tune_flags
= selected_tune
->flags
;
14052 aarch64_tune
= selected_tune
->sched_core
;
14053 /* Make a copy of the tuning parameters attached to the core, which
14054 we may later overwrite. */
14055 aarch64_tune_params
= *(selected_tune
->tune
);
14056 aarch64_architecture_version
= selected_arch
->architecture_version
;
14058 if (opts
->x_aarch64_override_tune_string
)
14059 aarch64_parse_override_string (opts
->x_aarch64_override_tune_string
,
14060 &aarch64_tune_params
);
14062 /* This target defaults to strict volatile bitfields. */
14063 if (opts
->x_flag_strict_volatile_bitfields
< 0 && abi_version_at_least (2))
14064 opts
->x_flag_strict_volatile_bitfields
= 1;
14066 if (aarch64_stack_protector_guard
== SSP_GLOBAL
14067 && opts
->x_aarch64_stack_protector_guard_offset_str
)
14069 error ("incompatible options %<-mstack-protector-guard=global%> and "
14070 "%<-mstack-protector-guard-offset=%s%>",
14071 aarch64_stack_protector_guard_offset_str
);
14074 if (aarch64_stack_protector_guard
== SSP_SYSREG
14075 && !(opts
->x_aarch64_stack_protector_guard_offset_str
14076 && opts
->x_aarch64_stack_protector_guard_reg_str
))
14078 error ("both %<-mstack-protector-guard-offset%> and "
14079 "%<-mstack-protector-guard-reg%> must be used "
14080 "with %<-mstack-protector-guard=sysreg%>");
14083 if (opts
->x_aarch64_stack_protector_guard_reg_str
)
14085 if (strlen (opts
->x_aarch64_stack_protector_guard_reg_str
) > 100)
14086 error ("specify a system register with a small string length.");
14089 if (opts
->x_aarch64_stack_protector_guard_offset_str
)
14092 const char *str
= aarch64_stack_protector_guard_offset_str
;
14094 long offs
= strtol (aarch64_stack_protector_guard_offset_str
, &end
, 0);
14095 if (!*str
|| *end
|| errno
)
14096 error ("%qs is not a valid offset in %qs", str
,
14097 "-mstack-protector-guard-offset=");
14098 aarch64_stack_protector_guard_offset
= offs
;
14101 initialize_aarch64_code_model (opts
);
14102 initialize_aarch64_tls_size (opts
);
14104 int queue_depth
= 0;
14105 switch (aarch64_tune_params
.autoprefetcher_model
)
14107 case tune_params::AUTOPREFETCHER_OFF
:
14110 case tune_params::AUTOPREFETCHER_WEAK
:
14113 case tune_params::AUTOPREFETCHER_STRONG
:
14114 queue_depth
= max_insn_queue_index
+ 1;
14117 gcc_unreachable ();
14120 /* We don't mind passing in global_options_set here as we don't use
14121 the *options_set structs anyway. */
14122 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
14123 param_sched_autopref_queue_depth
, queue_depth
);
  /* Set up parameters to be used in prefetching algorithm.  Do not
     override the defaults unless we are tuning for a core we have
     researched values for.  */
  if (aarch64_tune_params.prefetch->num_slots > 0)
    SET_OPTION_IF_UNSET (opts, &global_options_set,
			 param_simultaneous_prefetches,
			 aarch64_tune_params.prefetch->num_slots);
  if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
    SET_OPTION_IF_UNSET (opts, &global_options_set,
			 param_l1_cache_size,
			 aarch64_tune_params.prefetch->l1_cache_size);
  if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
    SET_OPTION_IF_UNSET (opts, &global_options_set,
			 param_l1_cache_line_size,
			 aarch64_tune_params.prefetch->l1_cache_line_size);
  if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
    SET_OPTION_IF_UNSET (opts, &global_options_set,
			 param_l2_cache_size,
			 aarch64_tune_params.prefetch->l2_cache_size);
  if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
    SET_OPTION_IF_UNSET (opts, &global_options_set,
			 param_prefetch_dynamic_strides, 0);
  if (aarch64_tune_params.prefetch->minimum_stride >= 0)
    SET_OPTION_IF_UNSET (opts, &global_options_set,
			 param_prefetch_minimum_stride,
			 aarch64_tune_params.prefetch->minimum_stride);

  /* Use the alternative scheduling-pressure algorithm by default.  */
  SET_OPTION_IF_UNSET (opts, &global_options_set,
		       param_sched_pressure_algorithm,
		       SCHED_PRESSURE_MODEL);
  /* Validate the guard size.  */
  int guard_size = param_stack_clash_protection_guard_size;

  if (guard_size != 12 && guard_size != 16)
    error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
	   "size.  Given value %d (%llu KB) is out of range",
	   guard_size, (1ULL << guard_size) / 1024ULL);

  /* Enforce that interval is the same size as size so the mid-end does the
     right thing.  */
  SET_OPTION_IF_UNSET (opts, &global_options_set,
		       param_stack_clash_protection_probe_interval,
		       guard_size);

  /* The maybe_set calls won't update the value if the user has explicitly set
     one.  Which means we need to validate that probing interval and guard size
     are equal.  */
  int probe_interval
    = param_stack_clash_protection_probe_interval;
  if (guard_size != probe_interval)
    error ("stack clash guard size %<%d%> must be equal to probing interval "
	   "%<%d%>", guard_size, probe_interval);
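  /* For reference, the guard size is a power of two: a value of N selects a
     2^N byte guard region, so 12 gives 4 KB and 16 gives 64 KB, matching the
     (1ULL << guard_size) / 1024 computation in the diagnostic above.  */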
  /* Enable sw prefetching at specified optimization level for
     CPUs that have prefetch.  Lower optimization level threshold by 1
     when profiling is enabled.  */
  if (opts->x_flag_prefetch_loop_arrays < 0
      && !opts->x_optimize_size
      && aarch64_tune_params.prefetch->default_opt_level >= 0
      && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
    opts->x_flag_prefetch_loop_arrays = 1;

  if (opts->x_aarch64_arch_string == NULL)
    opts->x_aarch64_arch_string = selected_arch->name;
  if (opts->x_aarch64_cpu_string == NULL)
    opts->x_aarch64_cpu_string = selected_cpu->name;
  if (opts->x_aarch64_tune_string == NULL)
    opts->x_aarch64_tune_string = selected_tune->name;

  aarch64_override_options_after_change_1 (opts);
}
/* Print a hint with a suggestion for a core or architecture name that
   most closely resembles what the user passed in STR.  ARCH is true if
   the user is asking for an architecture name.  ARCH is false if the user
   is asking for a core name.  */

static void
aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
{
  auto_vec<const char *> candidates;
  const struct processor *entry = arch ? all_architectures : all_cores;
  for (; entry->name != NULL; entry++)
    candidates.safe_push (entry->name);

#ifdef HAVE_LOCAL_CPU_DETECT
  /* Add also "native" as possible value.  */
  candidates.safe_push ("native");
#endif

  char *s;
  const char *hint = candidates_list_and_hint (str, s, candidates);
  if (hint)
    inform (input_location, "valid arguments are: %s;"
	    " did you mean %qs?", s, hint);
  else
    inform (input_location, "valid arguments are: %s", s);

  XDELETEVEC (s);
}

/* Print a hint with a suggestion for a core name that most closely resembles
   what the user passed in STR.  */

inline static void
aarch64_print_hint_for_core (const char *str)
{
  aarch64_print_hint_for_core_or_arch (str, false);
}

/* Print a hint with a suggestion for an architecture name that most closely
   resembles what the user passed in STR.  */

inline static void
aarch64_print_hint_for_arch (const char *str)
{
  aarch64_print_hint_for_core_or_arch (str, true);
}


/* Print a hint with a suggestion for an extension name
   that most closely resembles what the user passed in STR.  */

static void
aarch64_print_hint_for_extensions (const std::string &str)
{
  auto_vec<const char *> candidates;
  aarch64_get_all_extension_candidates (&candidates);
  char *s;
  const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
  if (hint)
    inform (input_location, "valid arguments are: %s;"
	    " did you mean %qs?", s, hint);
  else
    inform (input_location, "valid arguments are: %s;", s);

  XDELETEVEC (s);
}
/* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
   specified in STR and throw errors if appropriate.  Put the results if
   they are valid in RES and ISA_FLAGS.  Return whether the option is
   valid.  */

static bool
aarch64_validate_mcpu (const char *str, const struct processor **res,
		       uint64_t *isa_flags)
{
  std::string invalid_extension;
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);

  if (parse_res == AARCH64_PARSE_OK)
    return true;

  switch (parse_res)
    {
      case AARCH64_PARSE_MISSING_ARG:
	error ("missing cpu name in %<-mcpu=%s%>", str);
	break;
      case AARCH64_PARSE_INVALID_ARG:
	error ("unknown value %qs for %<-mcpu%>", str);
	aarch64_print_hint_for_core (str);
	break;
      case AARCH64_PARSE_INVALID_FEATURE:
	error ("invalid feature modifier %qs in %<-mcpu=%s%>",
	       invalid_extension.c_str (), str);
	aarch64_print_hint_for_extensions (invalid_extension);
	break;
      default:
	gcc_unreachable ();
    }

  return false;
}
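/* Illustrative example: an option such as "-mcpu=cortex-a57+nofp" is split
   by aarch64_parse_cpu into the core name "cortex-a57" and the feature
   modifier "+nofp".  A misspelt core name takes the
   AARCH64_PARSE_INVALID_ARG path above and triggers the core-name hint,
   while a bad modifier takes the AARCH64_PARSE_INVALID_FEATURE path and
   triggers the extension hint.  */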
14304 /* Parses CONST_STR for branch protection features specified in
14305 aarch64_branch_protect_types, and set any global variables required. Returns
14306 the parsing result and assigns LAST_STR to the last processed token from
14307 CONST_STR so that it can be used for error reporting. */
14310 aarch64_parse_opt_result
aarch64_parse_branch_protection (const char *const_str
,
14313 char *str_root
= xstrdup (const_str
);
14314 char* token_save
= NULL
;
14315 char *str
= strtok_r (str_root
, "+", &token_save
);
14316 enum aarch64_parse_opt_result res
= AARCH64_PARSE_OK
;
14318 res
= AARCH64_PARSE_MISSING_ARG
;
14321 char *next_str
= strtok_r (NULL
, "+", &token_save
);
14322 /* Reset the branch protection features to their defaults. */
14323 aarch64_handle_no_branch_protection (NULL
, NULL
);
14325 while (str
&& res
== AARCH64_PARSE_OK
)
14327 const aarch64_branch_protect_type
* type
= aarch64_branch_protect_types
;
14328 bool found
= false;
14329 /* Search for this type. */
14330 while (type
&& type
->name
&& !found
&& res
== AARCH64_PARSE_OK
)
14332 if (strcmp (str
, type
->name
) == 0)
14335 res
= type
->handler (str
, next_str
);
14337 next_str
= strtok_r (NULL
, "+", &token_save
);
14342 if (found
&& res
== AARCH64_PARSE_OK
)
14344 bool found_subtype
= true;
14345 /* Loop through each token until we find one that isn't a
14347 while (found_subtype
)
14349 found_subtype
= false;
14350 const aarch64_branch_protect_type
*subtype
= type
->subtypes
;
14351 /* Search for the subtype. */
14352 while (str
&& subtype
&& subtype
->name
&& !found_subtype
14353 && res
== AARCH64_PARSE_OK
)
14355 if (strcmp (str
, subtype
->name
) == 0)
14357 found_subtype
= true;
14358 res
= subtype
->handler (str
, next_str
);
14360 next_str
= strtok_r (NULL
, "+", &token_save
);
14368 res
= AARCH64_PARSE_INVALID_ARG
;
14371 /* Copy the last processed token into the argument to pass it back.
14372 Used by option and attribute validation to print the offending token. */
14375 if (str
) strcpy (*last_str
, str
);
14376 else *last_str
= NULL
;
14378 if (res
== AARCH64_PARSE_OK
)
14380 /* If needed, alloc the accepted string then copy in const_str.
14381 Used by override_option_after_change_1. */
14382 if (!accepted_branch_protection_string
)
14383 accepted_branch_protection_string
= (char *) xmalloc (
14384 BRANCH_PROTECT_STR_MAX
14386 strncpy (accepted_branch_protection_string
, const_str
,
14387 BRANCH_PROTECT_STR_MAX
+ 1);
14388 /* Forcibly null-terminate. */
14389 accepted_branch_protection_string
[BRANCH_PROTECT_STR_MAX
] = '\0';
static bool
aarch64_validate_mbranch_protection (const char *const_str)
{
  char *str = (char *) xmalloc (strlen (const_str));
  enum aarch64_parse_opt_result res =
    aarch64_parse_branch_protection (const_str, &str);
  if (res == AARCH64_PARSE_INVALID_ARG)
    error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
  else if (res == AARCH64_PARSE_MISSING_ARG)
    error ("missing argument for %<-mbranch-protection=%>");
  return res == AARCH64_PARSE_OK;
}
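/* Typical -mbranch-protection= arguments (aarch64_branch_protect_types is
   the authoritative list) include "none", "standard", "bti" and "pac-ret",
   and subtypes can be chained with '+' as in "pac-ret+leaf+bti".  */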
/* Validate a command-line -march option.  Parse the arch and extensions
   (if any) specified in STR and throw errors if appropriate.  Put the
   results, if they are valid, in RES and ISA_FLAGS.  Return whether the
   option is valid.  */

static bool
aarch64_validate_march (const char *str, const struct processor **res,
			uint64_t *isa_flags)
{
  std::string invalid_extension;
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);

  if (parse_res == AARCH64_PARSE_OK)
    return true;

  switch (parse_res)
    {
      case AARCH64_PARSE_MISSING_ARG:
	error ("missing arch name in %<-march=%s%>", str);
	break;
      case AARCH64_PARSE_INVALID_ARG:
	error ("unknown value %qs for %<-march%>", str);
	aarch64_print_hint_for_arch (str);
	break;
      case AARCH64_PARSE_INVALID_FEATURE:
	error ("invalid feature modifier %qs in %<-march=%s%>",
	       invalid_extension.c_str (), str);
	aarch64_print_hint_for_extensions (invalid_extension);
	break;
      default:
	gcc_unreachable ();
    }

  return false;
}
/* Validate a command-line -mtune option.  Parse the cpu
   specified in STR and throw errors if appropriate.  Put the
   result, if it is valid, in RES.  Return whether the option is
   valid.  */

static bool
aarch64_validate_mtune (const char *str, const struct processor **res)
{
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_tune (str, res);

  if (parse_res == AARCH64_PARSE_OK)
    return true;

  switch (parse_res)
    {
      case AARCH64_PARSE_MISSING_ARG:
	error ("missing cpu name in %<-mtune=%s%>", str);
	break;
      case AARCH64_PARSE_INVALID_ARG:
	error ("unknown value %qs for %<-mtune%>", str);
	aarch64_print_hint_for_core (str);
	break;
      default:
	gcc_unreachable ();
    }
  return false;
}
/* Return the CPU corresponding to the enum CPU.
   If it doesn't specify a cpu, return the default.  */

static const struct processor *
aarch64_get_tune_cpu (enum aarch64_processor cpu)
{
  if (cpu != aarch64_none)
    return &all_cores[cpu];

  /* The & 0x3f is to extract the bottom 6 bits that encode the
     default cpu as selected by the --with-cpu GCC configure option
     in config.gcc.
     ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
     flags mechanism should be reworked to make it more sane.  */
  return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
}

/* Return the architecture corresponding to the enum ARCH.
   If it doesn't specify a valid architecture, return the default.  */

static const struct processor *
aarch64_get_arch (enum aarch64_arch arch)
{
  if (arch != aarch64_no_arch)
    return &all_architectures[arch];

  const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];

  return &all_architectures[cpu->arch];
}
/* Return the VG value associated with -msve-vector-bits= value VALUE.  */

static poly_uint16
aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
{
  /* 128-bit SVE and Advanced SIMD modes use different register layouts
     on big-endian targets, so we would need to forbid subregs that convert
     from one to the other.  By default a reinterpret sequence would then
     involve a store to memory in one mode and a load back in the other.
     Even if we optimize that sequence using reverse instructions,
     it would still be a significant potential overhead.

     For now, it seems better to generate length-agnostic code for that
     case instead.  */
  if (value == SVE_SCALABLE
      || (value == SVE_128 && BYTES_BIG_ENDIAN))
    return poly_uint16 (2, 2);

  return (int) value / 64;
}
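/* Example: -msve-vector-bits=256 yields 256 / 64 = 4, i.e. four 64-bit
   granules (VG), whereas -msve-vector-bits=scalable (and 128-bit SVE on
   big-endian) produces the indeterminate poly_uint16 (2, 2) so that the
   generated code stays length-agnostic.  */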
/* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
   and is used to parse the -m{cpu,tune,arch} strings and setup the initial
   tuning structs.  In particular it must set selected_tune and
   aarch64_isa_flags that define the available ISA features and tuning
   decisions.  It must also set selected_arch as this will be used to
   output the .arch asm tags for each function.  */

static void
aarch64_override_options (void)
{
  uint64_t cpu_isa = 0;
  uint64_t arch_isa = 0;
  aarch64_isa_flags = 0;

  bool valid_cpu = true;
  bool valid_tune = true;
  bool valid_arch = true;

  selected_cpu = NULL;
  selected_arch = NULL;
  selected_tune = NULL;

  if (aarch64_branch_protection_string)
    aarch64_validate_mbranch_protection (aarch64_branch_protection_string);

  /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
     If either of -march or -mtune is given, they override their
     respective component of -mcpu.  */
  if (aarch64_cpu_string)
    valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
				       &cpu_isa);

  if (aarch64_arch_string)
    valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
					 &arch_isa);

  if (aarch64_tune_string)
    valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);

#ifdef SUBTARGET_OVERRIDE_OPTIONS
  SUBTARGET_OVERRIDE_OPTIONS;
#endif

  /* If the user did not specify a processor, choose the default
     one for them.  This will be the CPU set during configuration using
     --with-cpu, otherwise it is "generic".  */
  if (!selected_cpu)
    {
      if (selected_arch)
	{
	  selected_cpu = &all_cores[selected_arch->ident];
	  aarch64_isa_flags = arch_isa;
	  explicit_arch = selected_arch->arch;
	}
      else
	{
	  /* Get default configure-time CPU.  */
	  selected_cpu = aarch64_get_tune_cpu (aarch64_none);
	  aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
	}

      if (selected_tune)
	explicit_tune_core = selected_tune->ident;
    }
  /* If both -mcpu and -march are specified check that they are architecturally
     compatible, warn if they're not and prefer the -march ISA flags.  */
  else if (selected_arch)
    {
      if (selected_arch->arch != selected_cpu->arch)
	{
	  warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
		   aarch64_cpu_string,
		   aarch64_arch_string);
	}
      aarch64_isa_flags = arch_isa;
      explicit_arch = selected_arch->arch;
      explicit_tune_core = selected_tune ? selected_tune->ident
					 : selected_cpu->ident;
    }
  else
    {
      /* -mcpu but no -march.  */
      aarch64_isa_flags = cpu_isa;
      explicit_tune_core = selected_tune ? selected_tune->ident
					 : selected_cpu->ident;
      gcc_assert (selected_cpu);
      selected_arch = &all_architectures[selected_cpu->arch];
      explicit_arch = selected_arch->arch;
    }

  /* Set the arch as well as we will need it when outputting
     the .arch directive in assembly.  */
  if (!selected_arch)
    {
      gcc_assert (selected_cpu);
      selected_arch = &all_architectures[selected_cpu->arch];
    }

  if (!selected_tune)
    selected_tune = selected_cpu;

  if (aarch64_enable_bti == 2)
    {
#ifdef TARGET_ENABLE_BTI
      aarch64_enable_bti = 1;
#else
      aarch64_enable_bti = 0;
#endif
    }

  /* Return address signing is currently not supported for ILP32 targets.  For
     LP64 targets use the configured option in the absence of a command-line
     option for -mbranch-protection.  */
  if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
    {
#ifdef TARGET_ENABLE_PAC_RET
      aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
#else
      aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
#endif
    }

#ifndef HAVE_AS_MABI_OPTION
  /* The compiler may have been configured with 2.23.* binutils, which does
     not have support for ILP32.  */
  if (TARGET_ILP32)
    error ("assembler does not support %<-mabi=ilp32%>");
#endif

  /* Convert -msve-vector-bits to a VG count.  */
  aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);

  if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
    sorry ("return address signing is only supported for %<-mabi=lp64%>");

  /* Make sure we properly set up the explicit options.  */
  if ((aarch64_cpu_string && valid_cpu)
       || (aarch64_tune_string && valid_tune))
    gcc_assert (explicit_tune_core != aarch64_none);

  if ((aarch64_cpu_string && valid_cpu)
       || (aarch64_arch_string && valid_arch))
    gcc_assert (explicit_arch != aarch64_no_arch);

  /* The pass to insert speculation tracking runs before
     shrink-wrapping and the latter does not know how to update the
     tracking status.  So disable it in this case.  */
  if (aarch64_track_speculation)
    flag_shrink_wrap = 0;

  aarch64_override_options_internal (&global_options);

  /* Save these options as the default ones in case we push and pop them later
     while processing functions with potential target attributes.  */
  target_option_default_node = target_option_current_node
    = build_target_option_node (&global_options);
}
/* Implement targetm.override_options_after_change.  */

static void
aarch64_override_options_after_change (void)
{
  aarch64_override_options_after_change_1 (&global_options);
}

static struct machine_function *
aarch64_init_machine_status (void)
{
  struct machine_function *machine;
  machine = ggc_cleared_alloc<machine_function> ();
  return machine;
}

void
aarch64_init_expanders (void)
{
  init_machine_status = aarch64_init_machine_status;
}
/* A checking mechanism for the implementation of the various code models.  */

static void
initialize_aarch64_code_model (struct gcc_options *opts)
{
  if (opts->x_flag_pic)
    {
      switch (opts->x_aarch64_cmodel_var)
	{
	case AARCH64_CMODEL_TINY:
	  aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
	  break;
	case AARCH64_CMODEL_SMALL:
#ifdef HAVE_AS_SMALL_PIC_RELOCS
	  aarch64_cmodel = (flag_pic == 2
			    ? AARCH64_CMODEL_SMALL_PIC
			    : AARCH64_CMODEL_SMALL_SPIC);
#else
	  aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
#endif
	  break;
	case AARCH64_CMODEL_LARGE:
	  sorry ("code model %qs with %<-f%s%>", "large",
		 opts->x_flag_pic > 1 ? "PIC" : "pic");
	  break;
	default:
	  gcc_unreachable ();
	}
    }
  else
    aarch64_cmodel = opts->x_aarch64_cmodel_var;
}
/* Implement TARGET_OPTION_SAVE.  */

static void
aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
{
  ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
  ptr->x_aarch64_branch_protection_string
    = opts->x_aarch64_branch_protection_string;
}
/* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
   using the information saved in PTR.  */

static void
aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
{
  opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
  selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
  opts->x_explicit_arch = ptr->x_explicit_arch;
  selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
  opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
  opts->x_aarch64_branch_protection_string
    = ptr->x_aarch64_branch_protection_string;
  if (opts->x_aarch64_branch_protection_string)
    {
      aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
				       NULL);
    }

  aarch64_override_options_internal (opts);
}
/* Implement TARGET_OPTION_PRINT.  */

static void
aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
{
  const struct processor *cpu
    = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
  uint64_t isa_flags = ptr->x_aarch64_isa_flags;
  const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
  std::string extension
    = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);

  fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
  fprintf (file, "%*sselected arch = %s%s\n", indent, "",
	   arch->name, extension.c_str ());
}
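/* When this hook runs, the dump output looks roughly like the following
   (names purely illustrative):

     selected tune = generic
     selected arch = armv8-a+crc

   with the extension suffix reconstructed from ISA_FLAGS by
   aarch64_get_extension_string_for_isa_flags.  */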
static GTY(()) tree aarch64_previous_fndecl;

void
aarch64_reset_previous_fndecl (void)
{
  aarch64_previous_fndecl = NULL;
}

/* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
   Used by aarch64_set_current_function and aarch64_pragma_target_parse to
   make sure optab availability predicates are recomputed when necessary.  */

void
aarch64_save_restore_target_globals (tree new_tree)
{
  if (TREE_TARGET_GLOBALS (new_tree))
    restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
  else if (new_tree == target_option_default_node)
    restore_target_globals (&default_target_globals);
  else
    TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
}
14810 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
14811 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
14812 of the function, if such exists. This function may be called multiple
14813 times on a single function so use aarch64_previous_fndecl to avoid
14814 setting up identical state. */
14817 aarch64_set_current_function (tree fndecl
)
14819 if (!fndecl
|| fndecl
== aarch64_previous_fndecl
)
14822 tree old_tree
= (aarch64_previous_fndecl
14823 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl
)
14826 tree new_tree
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
14828 /* If current function has no attributes but the previous one did,
14829 use the default node. */
14830 if (!new_tree
&& old_tree
)
14831 new_tree
= target_option_default_node
;
14833 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
14834 the default have been handled by aarch64_save_restore_target_globals from
14835 aarch64_pragma_target_parse. */
14836 if (old_tree
== new_tree
)
14839 aarch64_previous_fndecl
= fndecl
;
14841 /* First set the target options. */
14842 cl_target_option_restore (&global_options
, TREE_TARGET_OPTION (new_tree
));
14844 aarch64_save_restore_target_globals (new_tree
);
/* Enum describing the various ways we can handle attributes.
   In many cases we can reuse the generic option handling machinery.  */

enum aarch64_attr_opt_type
{
  aarch64_attr_mask,	/* Attribute should set a bit in target_flags.  */
  aarch64_attr_bool,	/* Attribute sets or unsets a boolean variable.  */
  aarch64_attr_enum,	/* Attribute sets an enum variable.  */
  aarch64_attr_custom	/* Attribute requires a custom handling function.  */
};

/* All the information needed to handle a target attribute.
   NAME is the name of the attribute.
   ATTR_TYPE specifies the type of behavior of the attribute as described
   in the definition of enum aarch64_attr_opt_type.
   ALLOW_NEG is true if the attribute supports a "no-" form.
   HANDLER is the function that takes the attribute string as an argument.
   It is needed only when the ATTR_TYPE is aarch64_attr_custom.
   OPT_NUM is the enum specifying the option that the attribute modifies.
   This is needed for attributes that mirror the behavior of a command-line
   option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
   aarch64_attr_enum.  */

struct aarch64_attribute_info
{
  const char *name;
  enum aarch64_attr_opt_type attr_type;
  bool allow_neg;
  bool (*handler) (const char *);
  enum opt_code opt_num;
};
14879 /* Handle the ARCH_STR argument to the arch= target attribute. */
14882 aarch64_handle_attr_arch (const char *str
)
14884 const struct processor
*tmp_arch
= NULL
;
14885 std::string invalid_extension
;
14886 enum aarch64_parse_opt_result parse_res
14887 = aarch64_parse_arch (str
, &tmp_arch
, &aarch64_isa_flags
, &invalid_extension
);
14889 if (parse_res
== AARCH64_PARSE_OK
)
14891 gcc_assert (tmp_arch
);
14892 selected_arch
= tmp_arch
;
14893 explicit_arch
= selected_arch
->arch
;
14899 case AARCH64_PARSE_MISSING_ARG
:
14900 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
14902 case AARCH64_PARSE_INVALID_ARG
:
14903 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str
);
14904 aarch64_print_hint_for_arch (str
);
14906 case AARCH64_PARSE_INVALID_FEATURE
:
14907 error ("invalid feature modifier %s of value (\"%s\") in "
14908 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
14909 aarch64_print_hint_for_extensions (invalid_extension
);
14912 gcc_unreachable ();
14918 /* Handle the argument CPU_STR to the cpu= target attribute. */
14921 aarch64_handle_attr_cpu (const char *str
)
14923 const struct processor
*tmp_cpu
= NULL
;
14924 std::string invalid_extension
;
14925 enum aarch64_parse_opt_result parse_res
14926 = aarch64_parse_cpu (str
, &tmp_cpu
, &aarch64_isa_flags
, &invalid_extension
);
14928 if (parse_res
== AARCH64_PARSE_OK
)
14930 gcc_assert (tmp_cpu
);
14931 selected_tune
= tmp_cpu
;
14932 explicit_tune_core
= selected_tune
->ident
;
14934 selected_arch
= &all_architectures
[tmp_cpu
->arch
];
14935 explicit_arch
= selected_arch
->arch
;
14941 case AARCH64_PARSE_MISSING_ARG
:
14942 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
14944 case AARCH64_PARSE_INVALID_ARG
:
14945 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str
);
14946 aarch64_print_hint_for_core (str
);
14948 case AARCH64_PARSE_INVALID_FEATURE
:
14949 error ("invalid feature modifier %s of value (\"%s\") in "
14950 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
14951 aarch64_print_hint_for_extensions (invalid_extension
);
14954 gcc_unreachable ();
14960 /* Handle the argument STR to the branch-protection= attribute. */
14963 aarch64_handle_attr_branch_protection (const char* str
)
14965 char *err_str
= (char *) xmalloc (strlen (str
) + 1);
14966 enum aarch64_parse_opt_result res
= aarch64_parse_branch_protection (str
,
14968 bool success
= false;
14971 case AARCH64_PARSE_MISSING_ARG
:
14972 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
14975 case AARCH64_PARSE_INVALID_ARG
:
14976 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
14977 "=\")%> pragma or attribute", err_str
);
14979 case AARCH64_PARSE_OK
:
14981 /* Fall through. */
14982 case AARCH64_PARSE_INVALID_FEATURE
:
14985 gcc_unreachable ();
14991 /* Handle the argument STR to the tune= target attribute. */
14994 aarch64_handle_attr_tune (const char *str
)
14996 const struct processor
*tmp_tune
= NULL
;
14997 enum aarch64_parse_opt_result parse_res
14998 = aarch64_parse_tune (str
, &tmp_tune
);
15000 if (parse_res
== AARCH64_PARSE_OK
)
15002 gcc_assert (tmp_tune
);
15003 selected_tune
= tmp_tune
;
15004 explicit_tune_core
= selected_tune
->ident
;
15010 case AARCH64_PARSE_INVALID_ARG
:
15011 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str
);
15012 aarch64_print_hint_for_core (str
);
15015 gcc_unreachable ();
/* Parse an architecture extensions target attribute string specified in STR.
   For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
   if successful.  Update aarch64_isa_flags to reflect the ISA features
   used.  */

static bool
aarch64_handle_attr_isa_flags (char *str)
{
  enum aarch64_parse_opt_result parse_res;
  uint64_t isa_flags = aarch64_isa_flags;

  /* We allow "+nothing" in the beginning to clear out all architectural
     features if the user wants to handpick specific features.  */
  if (strncmp ("+nothing", str, 8) == 0)
    {
      isa_flags = 0;
      str += 8;
    }

  std::string invalid_extension;
  parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);

  if (parse_res == AARCH64_PARSE_OK)
    {
      aarch64_isa_flags = isa_flags;
      return true;
    }

  switch (parse_res)
    {
      case AARCH64_PARSE_MISSING_ARG:
	error ("missing value in %<target()%> pragma or attribute");
	break;

      case AARCH64_PARSE_INVALID_FEATURE:
	error ("invalid feature modifier %s of value (\"%s\") in "
	       "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
	break;

      default:
	gcc_unreachable ();
    }

  return false;
}
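/* Illustrative uses of this ISA-flags form of the attribute:

     __attribute__ ((target ("+crc")))           (add CRC to the current set)
     __attribute__ ((target ("+nothing+simd")))  (clear everything, then
						  enable Advanced SIMD)

   The "+nothing" prefix is what the strncmp check above recognises.  */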
/* The target attributes that we support.  On top of these we also support just
   ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
   handled explicitly in aarch64_process_one_target_attr.  */

static const struct aarch64_attribute_info aarch64_attributes[] =
{
  { "general-regs-only", aarch64_attr_mask, false, NULL,
     OPT_mgeneral_regs_only },
  { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
     OPT_mfix_cortex_a53_835769 },
  { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
     OPT_mfix_cortex_a53_843419 },
  { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
  { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
  { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
     OPT_momit_leaf_frame_pointer },
  { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
  { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
     OPT_march_ },
  { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
  { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
     OPT_mtune_ },
  { "branch-protection", aarch64_attr_custom, false,
     aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
  { "sign-return-address", aarch64_attr_enum, false, NULL,
     OPT_msign_return_address_ },
  { NULL, aarch64_attr_custom, false, NULL, OPT____ }
};
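/* As purely illustrative examples, the table above makes forms such as

     __attribute__ ((target ("arch=armv8.2-a+crypto")))      (custom handler)
     __attribute__ ((target ("no-fix-cortex-a53-835769")))   (negated bool)
     __attribute__ ((target ("cmodel=small")))               (enum argument)

   work, while a bare extension string like "+crc" bypasses the table and is
   handled by aarch64_handle_attr_isa_flags instead.  */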
15096 /* Parse ARG_STR which contains the definition of one target attribute.
15097 Show appropriate errors if any or return true if the attribute is valid. */
15100 aarch64_process_one_target_attr (char *arg_str
)
15102 bool invert
= false;
15104 size_t len
= strlen (arg_str
);
15108 error ("malformed %<target()%> pragma or attribute");
15112 char *str_to_check
= (char *) alloca (len
+ 1);
15113 strcpy (str_to_check
, arg_str
);
15115 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
15116 It is easier to detect and handle it explicitly here rather than going
15117 through the machinery for the rest of the target attributes in this
15119 if (*str_to_check
== '+')
15120 return aarch64_handle_attr_isa_flags (str_to_check
);
15122 if (len
> 3 && strncmp (str_to_check
, "no-", 3) == 0)
15127 char *arg
= strchr (str_to_check
, '=');
15129 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
15130 and point ARG to "foo". */
15136 const struct aarch64_attribute_info
*p_attr
;
15137 bool found
= false;
15138 for (p_attr
= aarch64_attributes
; p_attr
->name
; p_attr
++)
15140 /* If the names don't match up, or the user has given an argument
15141 to an attribute that doesn't accept one, or didn't give an argument
15142 to an attribute that expects one, fail to match. */
15143 if (strcmp (str_to_check
, p_attr
->name
) != 0)
15147 bool attr_need_arg_p
= p_attr
->attr_type
== aarch64_attr_custom
15148 || p_attr
->attr_type
== aarch64_attr_enum
;
15150 if (attr_need_arg_p
^ (arg
!= NULL
))
15152 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check
);
15156 /* If the name matches but the attribute does not allow "no-" versions
15157 then we can't match. */
15158 if (invert
&& !p_attr
->allow_neg
)
15160 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check
);
15164 switch (p_attr
->attr_type
)
15166 /* Has a custom handler registered.
15167 For example, cpu=, arch=, tune=. */
15168 case aarch64_attr_custom
:
15169 gcc_assert (p_attr
->handler
);
15170 if (!p_attr
->handler (arg
))
15174 /* Either set or unset a boolean option. */
15175 case aarch64_attr_bool
:
15177 struct cl_decoded_option decoded
;
15179 generate_option (p_attr
->opt_num
, NULL
, !invert
,
15180 CL_TARGET
, &decoded
);
15181 aarch64_handle_option (&global_options
, &global_options_set
,
15182 &decoded
, input_location
);
15185 /* Set or unset a bit in the target_flags. aarch64_handle_option
15186 should know what mask to apply given the option number. */
15187 case aarch64_attr_mask
:
15189 struct cl_decoded_option decoded
;
15190 /* We only need to specify the option number.
15191 aarch64_handle_option will know which mask to apply. */
15192 decoded
.opt_index
= p_attr
->opt_num
;
15193 decoded
.value
= !invert
;
15194 aarch64_handle_option (&global_options
, &global_options_set
,
15195 &decoded
, input_location
);
15198 /* Use the option setting machinery to set an option to an enum. */
15199 case aarch64_attr_enum
:
15204 valid
= opt_enum_arg_to_value (p_attr
->opt_num
, arg
,
15205 &value
, CL_TARGET
);
15208 set_option (&global_options
, NULL
, p_attr
->opt_num
, value
,
15209 NULL
, DK_UNSPECIFIED
, input_location
,
15214 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check
, arg
);
15219 gcc_unreachable ();
15223 /* If we reached here we either have found an attribute and validated
15224 it or didn't match any. If we matched an attribute but its arguments
15225 were malformed we will have returned false already. */
/* Count how many times the character C appears in
   NULL-terminated string STR.  */

static unsigned int
num_occurences_in_str (char c, char *str)
{
  unsigned int res = 0;
  while (*str != '\0')
    {
      if (*str == c)
	res++;

      str++;
    }

  return res;
}
15247 /* Parse the tree in ARGS that contains the target attribute information
15248 and update the global target options space. */
15251 aarch64_process_target_attr (tree args
)
15253 if (TREE_CODE (args
) == TREE_LIST
)
15257 tree head
= TREE_VALUE (args
);
15260 if (!aarch64_process_target_attr (head
))
15263 args
= TREE_CHAIN (args
);
15269 if (TREE_CODE (args
) != STRING_CST
)
15271 error ("attribute %<target%> argument not a string");
15275 size_t len
= strlen (TREE_STRING_POINTER (args
));
15276 char *str_to_check
= (char *) alloca (len
+ 1);
15277 strcpy (str_to_check
, TREE_STRING_POINTER (args
));
15281 error ("malformed %<target()%> pragma or attribute");
15285 /* Used to catch empty spaces between commas i.e.
15286 attribute ((target ("attr1,,attr2"))). */
15287 unsigned int num_commas
= num_occurences_in_str (',', str_to_check
);
15289 /* Handle multiple target attributes separated by ','. */
15290 char *token
= strtok_r (str_to_check
, ",", &str_to_check
);
15292 unsigned int num_attrs
= 0;
15296 if (!aarch64_process_one_target_attr (token
))
15298 error ("pragma or attribute %<target(\"%s\")%> is not valid", token
);
15302 token
= strtok_r (NULL
, ",", &str_to_check
);
15305 if (num_attrs
!= num_commas
+ 1)
15307 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args
));
15314 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
15315 process attribute ((target ("..."))). */
15318 aarch64_option_valid_attribute_p (tree fndecl
, tree
, tree args
, int)
15320 struct cl_target_option cur_target
;
15323 tree new_target
, new_optimize
;
15324 tree existing_target
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
15326 /* If what we're processing is the current pragma string then the
15327 target option node is already stored in target_option_current_node
15328 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
15329 having to re-parse the string. This is especially useful to keep
15330 arm_neon.h compile times down since that header contains a lot
15331 of intrinsics enclosed in pragmas. */
15332 if (!existing_target
&& args
== current_target_pragma
)
15334 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = target_option_current_node
;
15337 tree func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
15339 old_optimize
= build_optimization_node (&global_options
);
15340 func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
15342 /* If the function changed the optimization levels as well as setting
15343 target options, start with the optimizations specified. */
15344 if (func_optimize
&& func_optimize
!= old_optimize
)
15345 cl_optimization_restore (&global_options
,
15346 TREE_OPTIMIZATION (func_optimize
));
15348 /* Save the current target options to restore at the end. */
15349 cl_target_option_save (&cur_target
, &global_options
);
15351 /* If fndecl already has some target attributes applied to it, unpack
15352 them so that we add this attribute on top of them, rather than
15353 overwriting them. */
15354 if (existing_target
)
15356 struct cl_target_option
*existing_options
15357 = TREE_TARGET_OPTION (existing_target
);
15359 if (existing_options
)
15360 cl_target_option_restore (&global_options
, existing_options
);
15363 cl_target_option_restore (&global_options
,
15364 TREE_TARGET_OPTION (target_option_current_node
));
15366 ret
= aarch64_process_target_attr (args
);
15368 /* Set up any additional state. */
15371 aarch64_override_options_internal (&global_options
);
15372 /* Initialize SIMD builtins if we haven't already.
15373 Set current_target_pragma to NULL for the duration so that
15374 the builtin initialization code doesn't try to tag the functions
15375 being built with the attributes specified by any current pragma, thus
15376 going into an infinite recursion. */
15379 tree saved_current_target_pragma
= current_target_pragma
;
15380 current_target_pragma
= NULL
;
15381 aarch64_init_simd_builtins ();
15382 current_target_pragma
= saved_current_target_pragma
;
15384 new_target
= build_target_option_node (&global_options
);
15389 new_optimize
= build_optimization_node (&global_options
);
15393 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = new_target
;
15395 if (old_optimize
!= new_optimize
)
15396 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
) = new_optimize
;
15399 cl_target_option_restore (&global_options
, &cur_target
);
15401 if (old_optimize
!= new_optimize
)
15402 cl_optimization_restore (&global_options
,
15403 TREE_OPTIMIZATION (old_optimize
));
/* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
   tri-bool options (yes, no, don't care) and the default value is
   DEF, determine whether to reject inlining.  */

static bool
aarch64_tribools_ok_for_inlining_p (int caller, int callee,
				    int dont_care, int def)
{
  /* If the callee doesn't care, always allow inlining.  */
  if (callee == dont_care)
    return true;

  /* If the caller doesn't care, always allow inlining.  */
  if (caller == dont_care)
    return true;

  /* Otherwise, allow inlining if either the callee and caller values
     agree, or if the callee is using the default value.  */
  return (callee == caller || callee == def);
}
15428 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
15429 to inline CALLEE into CALLER based on target-specific info.
15430 Make sure that the caller and callee have compatible architectural
15431 features. Then go through the other possible target attributes
15432 and see if they can block inlining. Try not to reject always_inline
15433 callees unless they are incompatible architecturally. */
15436 aarch64_can_inline_p (tree caller
, tree callee
)
15438 tree caller_tree
= DECL_FUNCTION_SPECIFIC_TARGET (caller
);
15439 tree callee_tree
= DECL_FUNCTION_SPECIFIC_TARGET (callee
);
15441 struct cl_target_option
*caller_opts
15442 = TREE_TARGET_OPTION (caller_tree
? caller_tree
15443 : target_option_default_node
);
15445 struct cl_target_option
*callee_opts
15446 = TREE_TARGET_OPTION (callee_tree
? callee_tree
15447 : target_option_default_node
);
15449 /* Callee's ISA flags should be a subset of the caller's. */
15450 if ((caller_opts
->x_aarch64_isa_flags
& callee_opts
->x_aarch64_isa_flags
)
15451 != callee_opts
->x_aarch64_isa_flags
)
15454 /* Allow non-strict aligned functions inlining into strict
15456 if ((TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)
15457 != TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
))
15458 && !(!TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
)
15459 && TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)))
15462 bool always_inline
= lookup_attribute ("always_inline",
15463 DECL_ATTRIBUTES (callee
));
15465 /* If the architectural features match up and the callee is always_inline
15466 then the other attributes don't matter. */
15470 if (caller_opts
->x_aarch64_cmodel_var
15471 != callee_opts
->x_aarch64_cmodel_var
)
15474 if (caller_opts
->x_aarch64_tls_dialect
15475 != callee_opts
->x_aarch64_tls_dialect
)
15478 /* Honour explicit requests to workaround errata. */
15479 if (!aarch64_tribools_ok_for_inlining_p (
15480 caller_opts
->x_aarch64_fix_a53_err835769
,
15481 callee_opts
->x_aarch64_fix_a53_err835769
,
15482 2, TARGET_FIX_ERR_A53_835769_DEFAULT
))
15485 if (!aarch64_tribools_ok_for_inlining_p (
15486 caller_opts
->x_aarch64_fix_a53_err843419
,
15487 callee_opts
->x_aarch64_fix_a53_err843419
,
15488 2, TARGET_FIX_ERR_A53_843419
))
15491 /* If the user explicitly specified -momit-leaf-frame-pointer for the
15492 caller and calle and they don't match up, reject inlining. */
15493 if (!aarch64_tribools_ok_for_inlining_p (
15494 caller_opts
->x_flag_omit_leaf_frame_pointer
,
15495 callee_opts
->x_flag_omit_leaf_frame_pointer
,
15499 /* If the callee has specific tuning overrides, respect them. */
15500 if (callee_opts
->x_aarch64_override_tune_string
!= NULL
15501 && caller_opts
->x_aarch64_override_tune_string
== NULL
)
15504 /* If the user specified tuning override strings for the
15505 caller and callee and they don't match up, reject inlining.
15506 We just do a string compare here, we don't analyze the meaning
15507 of the string, as it would be too costly for little gain. */
15508 if (callee_opts
->x_aarch64_override_tune_string
15509 && caller_opts
->x_aarch64_override_tune_string
15510 && (strcmp (callee_opts
->x_aarch64_override_tune_string
,
15511 caller_opts
->x_aarch64_override_tune_string
) != 0))
/* Return the ID of the TLSDESC ABI, initializing the descriptor if it
   hasn't been initialized yet.  */

arm_pcs
aarch64_tlsdesc_abi_id ()
{
  predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
  if (!tlsdesc_abi.initialized_p ())
    {
      HARD_REG_SET full_reg_clobbers;
      CLEAR_HARD_REG_SET (full_reg_clobbers);
      SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
      SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
      for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
	SET_HARD_REG_BIT (full_reg_clobbers, regno);
      tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
    }
  return tlsdesc_abi.id ();
}
/* Return true if SYMBOL_REF X binds locally.  */

static bool
aarch64_symbol_binds_local_p (const_rtx x)
{
  return (SYMBOL_REF_DECL (x)
	  ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
	  : SYMBOL_REF_LOCAL_P (x));
}

/* Return true if SYMBOL_REF X is thread local.  */
static bool
aarch64_tls_symbol_p (rtx x)
{
  if (! TARGET_HAVE_TLS)
    return false;

  if (GET_CODE (x) != SYMBOL_REF)
    return false;

  return SYMBOL_REF_TLS_MODEL (x) != 0;
}
/* Classify a TLS symbol into one of the TLS kinds.  */
enum aarch64_symbol_type
aarch64_classify_tls_symbol (rtx x)
{
  enum tls_model tls_kind = tls_symbolic_operand_type (x);

  switch (tls_kind)
    {
    case TLS_MODEL_GLOBAL_DYNAMIC:
    case TLS_MODEL_LOCAL_DYNAMIC:
      return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;

    case TLS_MODEL_INITIAL_EXEC:
      switch (aarch64_cmodel)
	{
	case AARCH64_CMODEL_TINY:
	case AARCH64_CMODEL_TINY_PIC:
	  return SYMBOL_TINY_TLSIE;
	default:
	  return SYMBOL_SMALL_TLSIE;
	}

    case TLS_MODEL_LOCAL_EXEC:
      if (aarch64_tls_size == 12)
	return SYMBOL_TLSLE12;
      else if (aarch64_tls_size == 24)
	return SYMBOL_TLSLE24;
      else if (aarch64_tls_size == 32)
	return SYMBOL_TLSLE32;
      else if (aarch64_tls_size == 48)
	return SYMBOL_TLSLE48;
      else
	gcc_unreachable ();

    case TLS_MODEL_EMULATED:
    case TLS_MODEL_NONE:
      return SYMBOL_FORCE_TO_MEM;

    default:
      gcc_unreachable ();
    }
}
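/* The -mtls-size values accepted above bound the offset of the TLS area:
   12, 24, 32 and 48 bits allow roughly 4 KB, 16 MB, 4 GB and 256 TB of TLS
   data respectively, and each selects a correspondingly shorter or longer
   local-exec relocation sequence.  */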
15603 /* Return the correct method for accessing X + OFFSET, where X is either
15604 a SYMBOL_REF or LABEL_REF. */
15606 enum aarch64_symbol_type
15607 aarch64_classify_symbol (rtx x
, HOST_WIDE_INT offset
)
15609 if (GET_CODE (x
) == LABEL_REF
)
15611 switch (aarch64_cmodel
)
15613 case AARCH64_CMODEL_LARGE
:
15614 return SYMBOL_FORCE_TO_MEM
;
15616 case AARCH64_CMODEL_TINY_PIC
:
15617 case AARCH64_CMODEL_TINY
:
15618 return SYMBOL_TINY_ABSOLUTE
;
15620 case AARCH64_CMODEL_SMALL_SPIC
:
15621 case AARCH64_CMODEL_SMALL_PIC
:
15622 case AARCH64_CMODEL_SMALL
:
15623 return SYMBOL_SMALL_ABSOLUTE
;
15626 gcc_unreachable ();
15630 if (GET_CODE (x
) == SYMBOL_REF
)
15632 if (aarch64_tls_symbol_p (x
))
15633 return aarch64_classify_tls_symbol (x
);
15635 switch (aarch64_cmodel
)
15637 case AARCH64_CMODEL_TINY
:
15638 /* When we retrieve symbol + offset address, we have to make sure
15639 the offset does not cause overflow of the final address. But
15640 we have no way of knowing the address of symbol at compile time
15641 so we can't accurately say if the distance between the PC and
15642 symbol + offset is outside the addressible range of +/-1MB in the
15643 TINY code model. So we limit the maximum offset to +/-64KB and
15644 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
15645 If offset_within_block_p is true we allow larger offsets.
15646 Furthermore force to memory if the symbol is a weak reference to
15647 something that doesn't resolve to a symbol in this module. */
15649 if (SYMBOL_REF_WEAK (x
) && !aarch64_symbol_binds_local_p (x
))
15650 return SYMBOL_FORCE_TO_MEM
;
15651 if (!(IN_RANGE (offset
, -0x10000, 0x10000)
15652 || offset_within_block_p (x
, offset
)))
15653 return SYMBOL_FORCE_TO_MEM
;
15655 return SYMBOL_TINY_ABSOLUTE
;
15657 case AARCH64_CMODEL_SMALL
:
15658 /* Same reasoning as the tiny code model, but the offset cap here is
15659 1MB, allowing +/-3.9GB for the offset to the symbol. */
15661 if (SYMBOL_REF_WEAK (x
) && !aarch64_symbol_binds_local_p (x
))
15662 return SYMBOL_FORCE_TO_MEM
;
15663 if (!(IN_RANGE (offset
, -0x100000, 0x100000)
15664 || offset_within_block_p (x
, offset
)))
15665 return SYMBOL_FORCE_TO_MEM
;
15667 return SYMBOL_SMALL_ABSOLUTE
;
15669 case AARCH64_CMODEL_TINY_PIC
:
15670 if (!aarch64_symbol_binds_local_p (x
))
15671 return SYMBOL_TINY_GOT
;
15672 return SYMBOL_TINY_ABSOLUTE
;
15674 case AARCH64_CMODEL_SMALL_SPIC
:
15675 case AARCH64_CMODEL_SMALL_PIC
:
15676 if (!aarch64_symbol_binds_local_p (x
))
15677 return (aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
15678 ? SYMBOL_SMALL_GOT_28K
: SYMBOL_SMALL_GOT_4G
);
15679 return SYMBOL_SMALL_ABSOLUTE
;
15681 case AARCH64_CMODEL_LARGE
:
15682 /* This is alright even in PIC code as the constant
15683 pool reference is always PC relative and within
15684 the same translation unit. */
15685 if (!aarch64_pcrelative_literal_loads
&& CONSTANT_POOL_ADDRESS_P (x
))
15686 return SYMBOL_SMALL_ABSOLUTE
;
15688 return SYMBOL_FORCE_TO_MEM
;
15691 gcc_unreachable ();
15695 /* By default push everything into the constant pool. */
15696 return SYMBOL_FORCE_TO_MEM
;
bool
aarch64_constant_address_p (rtx x)
{
  return (CONSTANT_P (x) && memory_address_p (DImode, x));
}

bool
aarch64_legitimate_pic_operand_p (rtx x)
{
  if (GET_CODE (x) == SYMBOL_REF
      || (GET_CODE (x) == CONST
	  && GET_CODE (XEXP (x, 0)) == PLUS
	  && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
    return false;

  return true;
}
15717 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
15718 that should be rematerialized rather than spilled. */
15721 aarch64_legitimate_constant_p (machine_mode mode
, rtx x
)
15723 /* Support CSE and rematerialization of common constants. */
15724 if (CONST_INT_P (x
)
15725 || (CONST_DOUBLE_P (x
) && GET_MODE_CLASS (mode
) == MODE_FLOAT
)
15726 || GET_CODE (x
) == CONST_VECTOR
)
15729 /* Do not allow vector struct mode constants for Advanced SIMD.
15730 We could support 0 and -1 easily, but they need support in
15731 aarch64-simd.md. */
15732 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
15733 if (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
15736 /* Only accept variable-length vector constants if they can be
15739 ??? It would be possible to handle rematerialization of other
15740 constants via secondary reloads. */
15741 if (vec_flags
& VEC_ANY_SVE
)
15742 return aarch64_simd_valid_immediate (x
, NULL
);
15744 if (GET_CODE (x
) == HIGH
)
15747 /* Accept polynomial constants that can be calculated by using the
15748 destination of a move as the sole temporary. Constants that
15749 require a second temporary cannot be rematerialized (they can't be
15750 forced to memory and also aren't legitimate constants). */
15752 if (poly_int_rtx_p (x
, &offset
))
15753 return aarch64_offset_temporaries (false, offset
) <= 1;
15755 /* If an offset is being added to something else, we need to allow the
15756 base to be moved into the destination register, meaning that there
15757 are no free temporaries for the offset. */
15758 x
= strip_offset (x
, &offset
);
15759 if (!offset
.is_constant () && aarch64_offset_temporaries (true, offset
) > 0)
15762 /* Do not allow const (plus (anchor_symbol, const_int)). */
15763 if (maybe_ne (offset
, 0) && SYMBOL_REF_P (x
) && SYMBOL_REF_ANCHOR_P (x
))
15766 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
15767 so spilling them is better than rematerialization. */
15768 if (SYMBOL_REF_P (x
) && !SYMBOL_REF_TLS_MODEL (x
))
15771 /* Label references are always constant. */
15772 if (GET_CODE (x
) == LABEL_REF
)
static rtx
aarch64_load_tp (rtx target)
{
  if (!target
      || GET_MODE (target) != Pmode
      || !register_operand (target, Pmode))
    target = gen_reg_rtx (Pmode);

  /* Can return in any reg.  */
  emit_insn (gen_aarch64_load_tp_hard (target));
  return target;
}
15791 /* On AAPCS systems, this is the "struct __va_list". */
15792 static GTY(()) tree va_list_type
;
15794 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
15795 Return the type to use as __builtin_va_list.
15797 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
15809 aarch64_build_builtin_va_list (void)
15812 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
15814 /* Create the type. */
15815 va_list_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
15816 /* Give it the required name. */
15817 va_list_name
= build_decl (BUILTINS_LOCATION
,
15819 get_identifier ("__va_list"),
15821 DECL_ARTIFICIAL (va_list_name
) = 1;
15822 TYPE_NAME (va_list_type
) = va_list_name
;
15823 TYPE_STUB_DECL (va_list_type
) = va_list_name
;
15825 /* Create the fields. */
15826 f_stack
= build_decl (BUILTINS_LOCATION
,
15827 FIELD_DECL
, get_identifier ("__stack"),
15829 f_grtop
= build_decl (BUILTINS_LOCATION
,
15830 FIELD_DECL
, get_identifier ("__gr_top"),
15832 f_vrtop
= build_decl (BUILTINS_LOCATION
,
15833 FIELD_DECL
, get_identifier ("__vr_top"),
15835 f_groff
= build_decl (BUILTINS_LOCATION
,
15836 FIELD_DECL
, get_identifier ("__gr_offs"),
15837 integer_type_node
);
15838 f_vroff
= build_decl (BUILTINS_LOCATION
,
15839 FIELD_DECL
, get_identifier ("__vr_offs"),
15840 integer_type_node
);
15842 /* Tell tree-stdarg pass about our internal offset fields.
15843 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
15844 purpose to identify whether the code is updating va_list internal
15845 offset fields through irregular way. */
15846 va_list_gpr_counter_field
= f_groff
;
15847 va_list_fpr_counter_field
= f_vroff
;
15849 DECL_ARTIFICIAL (f_stack
) = 1;
15850 DECL_ARTIFICIAL (f_grtop
) = 1;
15851 DECL_ARTIFICIAL (f_vrtop
) = 1;
15852 DECL_ARTIFICIAL (f_groff
) = 1;
15853 DECL_ARTIFICIAL (f_vroff
) = 1;
15855 DECL_FIELD_CONTEXT (f_stack
) = va_list_type
;
15856 DECL_FIELD_CONTEXT (f_grtop
) = va_list_type
;
15857 DECL_FIELD_CONTEXT (f_vrtop
) = va_list_type
;
15858 DECL_FIELD_CONTEXT (f_groff
) = va_list_type
;
15859 DECL_FIELD_CONTEXT (f_vroff
) = va_list_type
;
15861 TYPE_FIELDS (va_list_type
) = f_stack
;
15862 DECL_CHAIN (f_stack
) = f_grtop
;
15863 DECL_CHAIN (f_grtop
) = f_vrtop
;
15864 DECL_CHAIN (f_vrtop
) = f_groff
;
15865 DECL_CHAIN (f_groff
) = f_vroff
;
15867 /* Compute its layout. */
15868 layout_type (va_list_type
);
15870 return va_list_type
;
15873 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
15875 aarch64_expand_builtin_va_start (tree valist
, rtx nextarg ATTRIBUTE_UNUSED
)
15877 const CUMULATIVE_ARGS
*cum
;
15878 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
15879 tree stack
, grtop
, vrtop
, groff
, vroff
;
15881 int gr_save_area_size
= cfun
->va_list_gpr_size
;
15882 int vr_save_area_size
= cfun
->va_list_fpr_size
;
15885 cum
= &crtl
->args
.info
;
15886 if (cfun
->va_list_gpr_size
)
15887 gr_save_area_size
= MIN ((NUM_ARG_REGS
- cum
->aapcs_ncrn
) * UNITS_PER_WORD
,
15888 cfun
->va_list_gpr_size
);
15889 if (cfun
->va_list_fpr_size
)
15890 vr_save_area_size
= MIN ((NUM_FP_ARG_REGS
- cum
->aapcs_nvrn
)
15891 * UNITS_PER_VREG
, cfun
->va_list_fpr_size
);
15895 gcc_assert (cum
->aapcs_nvrn
== 0);
15896 vr_save_area_size
= 0;
15899 f_stack
= TYPE_FIELDS (va_list_type_node
);
15900 f_grtop
= DECL_CHAIN (f_stack
);
15901 f_vrtop
= DECL_CHAIN (f_grtop
);
15902 f_groff
= DECL_CHAIN (f_vrtop
);
15903 f_vroff
= DECL_CHAIN (f_groff
);
15905 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), valist
, f_stack
,
15907 grtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
), valist
, f_grtop
,
15909 vrtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
), valist
, f_vrtop
,
15911 groff
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
), valist
, f_groff
,
15913 vroff
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
), valist
, f_vroff
,
15916 /* Emit code to initialize STACK, which points to the next varargs stack
15917 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
15918 by named arguments. STACK is 8-byte aligned. */
15919 t
= make_tree (TREE_TYPE (stack
), virtual_incoming_args_rtx
);
15920 if (cum
->aapcs_stack_size
> 0)
15921 t
= fold_build_pointer_plus_hwi (t
, cum
->aapcs_stack_size
* UNITS_PER_WORD
);
15922 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), stack
, t
);
15923 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
15925 /* Emit code to initialize GRTOP, the top of the GR save area.
15926 virtual_incoming_args_rtx should have been 16 byte aligned. */
15927 t
= make_tree (TREE_TYPE (grtop
), virtual_incoming_args_rtx
);
15928 t
= build2 (MODIFY_EXPR
, TREE_TYPE (grtop
), grtop
, t
);
15929 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
15931 /* Emit code to initialize VRTOP, the top of the VR save area.
15932 This address is gr_save_area_bytes below GRTOP, rounded
15933 down to the next 16-byte boundary. */
15934 t
= make_tree (TREE_TYPE (vrtop
), virtual_incoming_args_rtx
);
15935 vr_offset
= ROUND_UP (gr_save_area_size
,
15936 STACK_BOUNDARY
/ BITS_PER_UNIT
);
15939 t
= fold_build_pointer_plus_hwi (t
, -vr_offset
);
15940 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vrtop
), vrtop
, t
);
15941 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
15943 /* Emit code to initialize GROFF, the offset from GRTOP of the
15944 next GPR argument. */
15945 t
= build2 (MODIFY_EXPR
, TREE_TYPE (groff
), groff
,
15946 build_int_cst (TREE_TYPE (groff
), -gr_save_area_size
));
15947 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
15949 /* Likewise emit code to initialize VROFF, the offset from FTOP
15950 of the next VR argument. */
15951 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vroff
), vroff
,
15952 build_int_cst (TREE_TYPE (vroff
), -vr_save_area_size
));
15953 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
15956 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
15959 aarch64_gimplify_va_arg_expr (tree valist
, tree type
, gimple_seq
*pre_p
,
15960 gimple_seq
*post_p ATTRIBUTE_UNUSED
)
15964 bool is_ha
; /* is HFA or HVA. */
15965 bool dw_align
; /* double-word align. */
15966 machine_mode ag_mode
= VOIDmode
;
15970 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
15971 tree stack
, f_top
, f_off
, off
, arg
, roundup
, on_stack
;
15972 HOST_WIDE_INT size
, rsize
, adjust
, align
;
15973 tree t
, u
, cond1
, cond2
;
15975 indirect_p
= pass_va_arg_by_reference (type
);
15977 type
= build_pointer_type (type
);
15979 mode
= TYPE_MODE (type
);
15981 f_stack
= TYPE_FIELDS (va_list_type_node
);
15982 f_grtop
= DECL_CHAIN (f_stack
);
15983 f_vrtop
= DECL_CHAIN (f_grtop
);
15984 f_groff
= DECL_CHAIN (f_vrtop
);
15985 f_vroff
= DECL_CHAIN (f_groff
);
15987 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), unshare_expr (valist
),
15988 f_stack
, NULL_TREE
);
15989 size
= int_size_in_bytes (type
);
15993 = aarch64_function_arg_alignment (mode
, type
, &abi_break
) / BITS_PER_UNIT
;
15997 if (aarch64_vfp_is_call_or_return_candidate (mode
,
16003 /* No frontends can create types with variable-sized modes, so we
16004 shouldn't be asked to pass or return them. */
16005 unsigned int ag_size
= GET_MODE_SIZE (ag_mode
).to_constant ();
16007 /* TYPE passed in fp/simd registers. */
16009 aarch64_err_no_fpadvsimd (mode
);
16011 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
),
16012 unshare_expr (valist
), f_vrtop
, NULL_TREE
);
16013 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
),
16014 unshare_expr (valist
), f_vroff
, NULL_TREE
);
16016 rsize
= nregs
* UNITS_PER_VREG
;
16020 if (BYTES_BIG_ENDIAN
&& ag_size
< UNITS_PER_VREG
)
16021 adjust
= UNITS_PER_VREG
- ag_size
;
16023 else if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
16024 && size
< UNITS_PER_VREG
)
16026 adjust
= UNITS_PER_VREG
- size
;
16031 /* TYPE passed in general registers. */
16032 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
),
16033 unshare_expr (valist
), f_grtop
, NULL_TREE
);
16034 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
),
16035 unshare_expr (valist
), f_groff
, NULL_TREE
);
16036 rsize
= ROUND_UP (size
, UNITS_PER_WORD
);
16037 nregs
= rsize
/ UNITS_PER_WORD
;
16041 if (abi_break
&& warn_psabi
)
16042 inform (input_location
, "parameter passing for argument of type "
16043 "%qT changed in GCC 9.1", type
);
16047 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
16048 && size
< UNITS_PER_WORD
)
16050 adjust
= UNITS_PER_WORD
- size
;
16054 /* Get a local temporary for the field value. */
16055 off
= get_initialized_tmp_var (f_off
, pre_p
, NULL
);
16057 /* Emit code to branch if off >= 0. */
16058 t
= build2 (GE_EXPR
, boolean_type_node
, off
,
16059 build_int_cst (TREE_TYPE (off
), 0));
16060 cond1
= build3 (COND_EXPR
, ptr_type_node
, t
, NULL_TREE
, NULL_TREE
);
16064 /* Emit: offs = (offs + 15) & -16. */
16065 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
16066 build_int_cst (TREE_TYPE (off
), 15));
16067 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (off
), t
,
16068 build_int_cst (TREE_TYPE (off
), -16));
16069 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (off
), off
, t
);
16074 /* Update ap.__[g|v]r_offs */
16075 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
16076 build_int_cst (TREE_TYPE (off
), rsize
));
16077 t
= build2 (MODIFY_EXPR
, TREE_TYPE (f_off
), unshare_expr (f_off
), t
);
16081 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
16083 /* [cond2] if (ap.__[g|v]r_offs > 0) */
16084 u
= build2 (GT_EXPR
, boolean_type_node
, unshare_expr (f_off
),
16085 build_int_cst (TREE_TYPE (f_off
), 0));
16086 cond2
= build3 (COND_EXPR
, ptr_type_node
, u
, NULL_TREE
, NULL_TREE
);
16088 /* String up: make sure the assignment happens before the use. */
16089 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (cond2
), t
, cond2
);
16090 COND_EXPR_ELSE (cond1
) = t
;
16092 /* Prepare the trees handling the argument that is passed on the stack;
16093 the top level node will store in ON_STACK. */
16094 arg
= get_initialized_tmp_var (stack
, pre_p
, NULL
);
16097 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
16098 t
= fold_build_pointer_plus_hwi (arg
, 15);
16099 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
16100 build_int_cst (TREE_TYPE (t
), -16));
16101 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (arg
), arg
, t
);
16105 /* Advance ap.__stack */
16106 t
= fold_build_pointer_plus_hwi (arg
, size
+ 7);
16107 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
16108 build_int_cst (TREE_TYPE (t
), -8));
16109 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), unshare_expr (stack
), t
);
16110 /* String up roundup and advance. */
16112 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
16113 /* String up with arg */
16114 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), t
, arg
);
16115 /* Big-endianness related address adjustment. */
16116 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
16117 && size
< UNITS_PER_WORD
)
16119 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (arg
), arg
,
16120 size_int (UNITS_PER_WORD
- size
));
16121 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), on_stack
, t
);
16124 COND_EXPR_THEN (cond1
) = unshare_expr (on_stack
);
16125 COND_EXPR_THEN (cond2
) = unshare_expr (on_stack
);
16127 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
16130 t
= build2 (PREINCREMENT_EXPR
, TREE_TYPE (off
), off
,
16131 build_int_cst (TREE_TYPE (off
), adjust
));
16133 t
= fold_convert (sizetype
, t
);
16134 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (f_top
), f_top
, t
);
16138 /* type ha; // treat as "struct {ftype field[n];}"
16139 ... [computing offs]
16140 for (i = 0; i <nregs; ++i, offs += 16)
16141 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
16144 tree tmp_ha
, field_t
, field_ptr_t
;
16146 /* Declare a local variable. */
16147 tmp_ha
= create_tmp_var_raw (type
, "ha");
16148 gimple_add_tmp_var (tmp_ha
);
16150 /* Establish the base type. */
16154 field_t
= float_type_node
;
16155 field_ptr_t
= float_ptr_type_node
;
16158 field_t
= double_type_node
;
16159 field_ptr_t
= double_ptr_type_node
;
16162 field_t
= long_double_type_node
;
16163 field_ptr_t
= long_double_ptr_type_node
;
16166 field_t
= aarch64_fp16_type_node
;
16167 field_ptr_t
= aarch64_fp16_ptr_type_node
;
16170 field_t
= aarch64_bf16_type_node
;
16171 field_ptr_t
= aarch64_bf16_ptr_type_node
;
16176 tree innertype
= make_signed_type (GET_MODE_PRECISION (SImode
));
16177 field_t
= build_vector_type_for_mode (innertype
, ag_mode
);
16178 field_ptr_t
= build_pointer_type (field_t
);
16185 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
16186 tmp_ha
= build1 (ADDR_EXPR
, field_ptr_t
, tmp_ha
);
16188 t
= fold_convert (field_ptr_t
, addr
);
16189 t
= build2 (MODIFY_EXPR
, field_t
,
16190 build1 (INDIRECT_REF
, field_t
, tmp_ha
),
16191 build1 (INDIRECT_REF
, field_t
, t
));
16193 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
16194 for (i
= 1; i
< nregs
; ++i
)
16196 addr
= fold_build_pointer_plus_hwi (addr
, UNITS_PER_VREG
);
16197 u
= fold_convert (field_ptr_t
, addr
);
16198 u
= build2 (MODIFY_EXPR
, field_t
,
16199 build2 (MEM_REF
, field_t
, tmp_ha
,
16200 build_int_cst (field_ptr_t
,
16202 int_size_in_bytes (field_t
)))),
16203 build1 (INDIRECT_REF
, field_t
, u
));
16204 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), t
, u
);
16207 u
= fold_convert (TREE_TYPE (f_top
), tmp_ha
);
16208 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (f_top
), t
, u
);
16211 COND_EXPR_ELSE (cond2
) = t
;
16212 addr
= fold_convert (build_pointer_type (type
), cond1
);
16213 addr
= build_va_arg_indirect_ref (addr
);
16216 addr
= build_va_arg_indirect_ref (addr
);
16221 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
16224 aarch64_setup_incoming_varargs (cumulative_args_t cum_v
,
16225 const function_arg_info
&arg
,
16226 int *pretend_size ATTRIBUTE_UNUSED
, int no_rtl
)
16228 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
16229 CUMULATIVE_ARGS local_cum
;
16230 int gr_saved
= cfun
->va_list_gpr_size
;
16231 int vr_saved
= cfun
->va_list_fpr_size
;
16233 /* The caller has advanced CUM up to, but not beyond, the last named
16234 argument. Advance a local copy of CUM past the last "real" named
16235 argument, to find out how many registers are left over. */
16237 aarch64_function_arg_advance (pack_cumulative_args(&local_cum
), arg
);
16239 /* Found out how many registers we need to save.
16240 Honor tree-stdvar analysis results. */
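  /* For example: if two of the eight GP argument registers were consumed by
     named arguments and va_list_gpr_size does not limit the save area,
     gr_saved becomes 6; the analogous computation below caps vr_saved by the
     number of unused FP/SIMD argument registers.  */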
16241 if (cfun
->va_list_gpr_size
)
16242 gr_saved
= MIN (NUM_ARG_REGS
- local_cum
.aapcs_ncrn
,
16243 cfun
->va_list_gpr_size
/ UNITS_PER_WORD
);
16244 if (cfun
->va_list_fpr_size
)
16245 vr_saved
= MIN (NUM_FP_ARG_REGS
- local_cum
.aapcs_nvrn
,
16246 cfun
->va_list_fpr_size
/ UNITS_PER_VREG
);
16250 gcc_assert (local_cum
.aapcs_nvrn
== 0);
16260 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
16261 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
,
16262 - gr_saved
* UNITS_PER_WORD
);
16263 mem
= gen_frame_mem (BLKmode
, ptr
);
16264 set_mem_alias_set (mem
, get_varargs_alias_set ());
16266 move_block_from_reg (local_cum
.aapcs_ncrn
+ R0_REGNUM
,
16271 /* We can't use move_block_from_reg, because it will use
16272 the wrong mode, storing D regs only. */
16273 machine_mode mode
= TImode
;
16274 int off
, i
, vr_start
;
16276 /* Set OFF to the offset from virtual_incoming_args_rtx of
16277 the first vector register. The VR save area lies below
16278 the GR one, and is aligned to 16 bytes. */
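      /* Illustration: with gr_saved == 3 and vr_saved == 2 the GR save area
	 occupies ROUND_UP (3 * 8, 16) == 32 bytes, so OFF starts at -32 and
	 the loop below stores V registers at offsets -64 and -48.  */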
16279 off
= -ROUND_UP (gr_saved
* UNITS_PER_WORD
,
16280 STACK_BOUNDARY
/ BITS_PER_UNIT
);
16281 off
-= vr_saved
* UNITS_PER_VREG
;
16283 vr_start
= V0_REGNUM
+ local_cum
.aapcs_nvrn
;
16284 for (i
= 0; i
< vr_saved
; ++i
)
16288 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
, off
);
16289 mem
= gen_frame_mem (mode
, ptr
);
16290 set_mem_alias_set (mem
, get_varargs_alias_set ());
16291 aarch64_emit_move (mem
, gen_rtx_REG (mode
, vr_start
+ i
));
16292 off
+= UNITS_PER_VREG
;
16297 /* We don't save the size into *PRETEND_SIZE because we want to avoid
16298 any complication of having crtl->args.pretend_args_size changed. */
16299 cfun
->machine
->frame
.saved_varargs_size
16300 = (ROUND_UP (gr_saved
* UNITS_PER_WORD
,
16301 STACK_BOUNDARY
/ BITS_PER_UNIT
)
16302 + vr_saved
* UNITS_PER_VREG
);
static void
aarch64_conditional_register_usage (void)
{
  int i;
  if (!TARGET_FLOAT)
    {
      for (i = V0_REGNUM; i <= V31_REGNUM; i++)
	{
	  fixed_regs[i] = 1;
	  call_used_regs[i] = 1;
	}
    }
  if (!TARGET_SVE)
    for (i = P0_REGNUM; i <= P15_REGNUM; i++)
      {
	fixed_regs[i] = 1;
	call_used_regs[i] = 1;
      }

  /* Only allow the FFR and FFRT to be accessed via special patterns.  */
  CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
  CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);

  /* When tracking speculation, we need a couple of call-clobbered registers
     to track the speculation state.  It would be nice to just use
     IP0 and IP1, but currently there are numerous places that just
     assume these registers are free for other uses (eg pointer
     authentication).  */
  if (aarch64_track_speculation)
    {
      fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
      call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
      fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
      call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
    }
}
/* Implement TARGET_MEMBER_TYPE_FORCES_BLK.  */

static bool
aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
{
  /* For records we're passed a FIELD_DECL, for arrays we're passed
     an ARRAY_TYPE.  In both cases we're interested in the TREE_TYPE.  */
  const_tree type = TREE_TYPE (field_or_array);

  /* Assign BLKmode to anything that contains multiple SVE predicates.
     For structures, the "multiple" case is indicated by MODE being
     VOIDmode.  */
  unsigned int num_zr, num_pr;
  if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr != 0)
    {
      if (TREE_CODE (field_or_array) == ARRAY_TYPE)
	return !simple_cst_equal (TYPE_SIZE (field_or_array),
				  TYPE_SIZE (type));
      return mode == VOIDmode;
    }

  return default_member_type_forces_blk (field_or_array, mode);
}
16366 /* Walk down the type tree of TYPE counting consecutive base elements.
16367 If *MODEP is VOIDmode, then set it to the first valid floating point
16368 type. If a non-floating point type is found, or if a floating point
16369 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
16370 otherwise return the count in the sub-tree. */
16372 aapcs_vfp_sub_candidate (const_tree type
, machine_mode
*modep
)
16375 HOST_WIDE_INT size
;
16377 if (aarch64_sve::builtin_type_p (type
))
16380 switch (TREE_CODE (type
))
16383 mode
= TYPE_MODE (type
);
16384 if (mode
!= DFmode
&& mode
!= SFmode
16385 && mode
!= TFmode
&& mode
!= HFmode
)
16388 if (*modep
== VOIDmode
)
16391 if (*modep
== mode
)
16397 mode
= TYPE_MODE (TREE_TYPE (type
));
16398 if (mode
!= DFmode
&& mode
!= SFmode
16399 && mode
!= TFmode
&& mode
!= HFmode
)
16402 if (*modep
== VOIDmode
)
16405 if (*modep
== mode
)
16411 /* Use V2SImode and V4SImode as representatives of all 64-bit
16412 and 128-bit vector types. */
16413 size
= int_size_in_bytes (type
);
16426 if (*modep
== VOIDmode
)
16429 /* Vector modes are considered to be opaque: two vectors are
16430 equivalent for the purposes of being homogeneous aggregates
16431 if they are the same size. */
16432 if (*modep
== mode
)
16440 tree index
= TYPE_DOMAIN (type
);
16442 /* Can't handle incomplete types nor sizes that are not
16444 if (!COMPLETE_TYPE_P (type
)
16445 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
16448 count
= aapcs_vfp_sub_candidate (TREE_TYPE (type
), modep
);
16451 || !TYPE_MAX_VALUE (index
)
16452 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index
))
16453 || !TYPE_MIN_VALUE (index
)
16454 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index
))
16458 count
*= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index
))
16459 - tree_to_uhwi (TYPE_MIN_VALUE (index
)));
16461 /* There must be no padding. */
16462 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
16463 count
* GET_MODE_BITSIZE (*modep
)))
16475 /* Can't handle incomplete types nor sizes that are not
16477 if (!COMPLETE_TYPE_P (type
)
16478 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
16481 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
16483 if (TREE_CODE (field
) != FIELD_DECL
)
16486 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
16489 count
+= sub_count
;
16492 /* There must be no padding. */
16493 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
16494 count
* GET_MODE_BITSIZE (*modep
)))
16501 case QUAL_UNION_TYPE
:
16503 /* These aren't very interesting except in a degenerate case. */
16508 /* Can't handle incomplete types nor sizes that are not
16510 if (!COMPLETE_TYPE_P (type
)
16511 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
16514 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
16516 if (TREE_CODE (field
) != FIELD_DECL
)
16519 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
16522 count
= count
> sub_count
? count
: sub_count
;
16525 /* There must be no padding. */
16526 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
16527 count
* GET_MODE_BITSIZE (*modep
)))
16540 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
16541 type as described in AAPCS64 \S 4.1.2.
16543 See the comment above aarch64_composite_type_p for the notes on MODE. */
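   For instance, the 64-bit and 128-bit Advanced SIMD types int32x2_t
   (V2SImode) and int32x4_t (V4SImode) are short vectors in this sense,
   whereas SVE data types are not, regardless of their size.  */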
16546 aarch64_short_vector_p (const_tree type
,
16549 poly_int64 size
= -1;
16551 if (type
&& TREE_CODE (type
) == VECTOR_TYPE
)
16553 if (aarch64_sve::builtin_type_p (type
))
16555 size
= int_size_in_bytes (type
);
16557 else if (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
16558 || GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
)
16560 /* Rely only on the type, not the mode, when processing SVE types. */
16561 if (type
&& aarch64_some_values_include_pst_objects_p (type
))
16562 gcc_assert (aarch64_sve_mode_p (mode
));
16564 size
= GET_MODE_SIZE (mode
);
16566 if (known_eq (size
, 8) || known_eq (size
, 16))
16568 /* 64-bit and 128-bit vectors should only acquire an SVE mode if
16569 they are being treated as scalable AAPCS64 types. */
16570 gcc_assert (!aarch64_sve_mode_p (mode
));
16576 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
16577 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
16578 array types. The C99 floating-point complex types are also considered
16579 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
16580 types, which are GCC extensions and out of the scope of AAPCS64, are
16581 treated as composite types here as well.
16583 Note that MODE itself is not sufficient in determining whether a type
16584 is such a composite type or not. This is because
16585 stor-layout.c:compute_record_mode may have already changed the MODE
16586 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
16587 structure with only one field may have its MODE set to the mode of the
16588 field. Also an integer mode whose size matches the size of the
16589 RECORD_TYPE type may be used to substitute the original mode
16590 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
16591 solely relied on. */
16594 aarch64_composite_type_p (const_tree type
,
16597 if (aarch64_short_vector_p (type
, mode
))
16600 if (type
&& (AGGREGATE_TYPE_P (type
) || TREE_CODE (type
) == COMPLEX_TYPE
))
16603 if (mode
== BLKmode
16604 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
16605 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_INT
)
16611 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
16612 shall be passed or returned in simd/fp register(s) (providing these
16613 parameter passing registers are available).
16615 Upon successful return, *COUNT returns the number of needed registers,
16616 *BASE_MODE returns the mode of the individual register and when IS_HAF
16617 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
16618 floating-point aggregate or a homogeneous short-vector aggregate. */
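   For example, struct { float x, y, z; } is a homogeneous floating-point
   aggregate: *BASE_MODE is set to SFmode, *COUNT to 3 and, when requested,
   *IS_HA to true, so the argument can use three consecutive FP/SIMD
   registers.  */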
16621 aarch64_vfp_is_call_or_return_candidate (machine_mode mode
,
16623 machine_mode
*base_mode
,
16627 if (is_ha
!= NULL
) *is_ha
= false;
16629 machine_mode new_mode
= VOIDmode
;
16630 bool composite_p
= aarch64_composite_type_p (type
, mode
);
16632 if ((!composite_p
&& GET_MODE_CLASS (mode
) == MODE_FLOAT
)
16633 || aarch64_short_vector_p (type
, mode
))
16638 else if (GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
)
16640 if (is_ha
!= NULL
) *is_ha
= true;
16642 new_mode
= GET_MODE_INNER (mode
);
16644 else if (type
&& composite_p
)
16646 int ag_count
= aapcs_vfp_sub_candidate (type
, &new_mode
);
16648 if (ag_count
> 0 && ag_count
<= HA_MAX_NUM_FLDS
)
16650 if (is_ha
!= NULL
) *is_ha
= true;
16659 gcc_assert (!aarch64_sve_mode_p (new_mode
));
16660 *base_mode
= new_mode
;
/* Implement TARGET_STRUCT_VALUE_RTX.  */

static rtx
aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
			  int incoming ATTRIBUTE_UNUSED)
{
  return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
}

/* Implements target hook vector_mode_supported_p.  */
static bool
aarch64_vector_mode_supported_p (machine_mode mode)
{
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
}
16681 /* Return the full-width SVE vector mode for element mode MODE, if one
16684 aarch64_full_sve_mode (scalar_mode mode
)
16703 return VNx16QImode
;
16705 return opt_machine_mode ();
16709 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
16712 aarch64_vq_mode (scalar_mode mode
)
16733 return opt_machine_mode ();
16737 /* Return appropriate SIMD container
16738 for MODE within a vector of WIDTH bits. */
16739 static machine_mode
16740 aarch64_simd_container_mode (scalar_mode mode
, poly_int64 width
)
16743 && maybe_ne (width
, 128)
16744 && known_eq (width
, BITS_PER_SVE_VECTOR
))
16745 return aarch64_full_sve_mode (mode
).else_mode (word_mode
);
16747 gcc_assert (known_eq (width
, 64) || known_eq (width
, 128));
16750 if (known_eq (width
, 128))
16751 return aarch64_vq_mode (mode
).else_mode (word_mode
);
/* Return 128-bit container as the preferred SIMD mode for MODE.  */
static machine_mode
aarch64_preferred_simd_mode (scalar_mode mode)
{
  poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
  return aarch64_simd_container_mode (mode, bits);
}
16782 /* Return a list of possible vector sizes for the vectorizer
16783 to iterate over. */
16784 static unsigned int
16785 aarch64_autovectorize_vector_modes (vector_modes
*modes
, bool)
16787 static const machine_mode sve_modes
[] = {
16788 /* Try using full vectors for all element types. */
16791 /* Try using 16-bit containers for 8-bit elements and full vectors
16792 for wider elements. */
16795 /* Try using 32-bit containers for 8-bit and 16-bit elements and
16796 full vectors for wider elements. */
16799 /* Try using 64-bit containers for all element types. */
16803 static const machine_mode advsimd_modes
[] = {
16804 /* Try using 128-bit vectors for all element types. */
16807 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
16808 for wider elements. */
16811 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
16812 for wider elements.
16814 TODO: We could support a limited form of V4QImode too, so that
16815 we use 32-bit vectors for 8-bit elements. */
16818 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
16819 for 64-bit elements.
16821 TODO: We could similarly support limited forms of V2QImode and V2HImode
16826 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
16829 - If we can't use N-byte Advanced SIMD vectors then the placement
16830 doesn't matter; we'll just continue as though the Advanced SIMD
16831 entry didn't exist.
16833 - If an SVE main loop with N bytes ends up being cheaper than an
16834 Advanced SIMD main loop with N bytes then by default we'll replace
16835 the Advanced SIMD version with the SVE one.
16837 - If an Advanced SIMD main loop with N bytes ends up being cheaper
16838 than an SVE main loop with N bytes then by default we'll try to
16839 use the SVE loop to vectorize the epilogue instead. */
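     Concretely, when SVE is enabled each SVE entry is compared with the next
     Advanced SIMD entry by number of elements: e.g. VNx16QImode (16 + 16x
     units) is pushed ahead of V16QImode (16 units), so the vectorizer sees
     the SVE candidate first but still gets the fixed-width fallback.  */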
16840 unsigned int sve_i
= TARGET_SVE
? 0 : ARRAY_SIZE (sve_modes
);
16841 unsigned int advsimd_i
= 0;
16842 while (advsimd_i
< ARRAY_SIZE (advsimd_modes
))
16844 if (sve_i
< ARRAY_SIZE (sve_modes
)
16845 && maybe_gt (GET_MODE_NUNITS (sve_modes
[sve_i
]),
16846 GET_MODE_NUNITS (advsimd_modes
[advsimd_i
])))
16847 modes
->safe_push (sve_modes
[sve_i
++]);
16849 modes
->safe_push (advsimd_modes
[advsimd_i
++]);
16851 while (sve_i
< ARRAY_SIZE (sve_modes
))
16852 modes
->safe_push (sve_modes
[sve_i
++]);
16854 unsigned int flags
= 0;
16855 /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
16856 can compare SVE against Advanced SIMD and so that we can compare
16857 multiple SVE vectorization approaches against each other. There's
16858 not really any point doing this for Advanced SIMD only, since the
16859 first mode that works should always be the best. */
16860 if (TARGET_SVE
&& aarch64_sve_compare_costs
)
16861 flags
|= VECT_COMPARE_COSTS
;
16865 /* Implement TARGET_MANGLE_TYPE. */
16867 static const char *
16868 aarch64_mangle_type (const_tree type
)
16870 /* The AArch64 ABI documents say that "__va_list" has to be
16871 mangled as if it is in the "std" namespace. */
16872 if (lang_hooks
.types_compatible_p (CONST_CAST_TREE (type
), va_list_type
))
16873 return "St9__va_list";
16875 /* Half-precision floating point types. */
16876 if (TREE_CODE (type
) == REAL_TYPE
&& TYPE_PRECISION (type
) == 16)
16878 if (TYPE_MODE (type
) == BFmode
)
16884 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
16886 if (TYPE_NAME (type
) != NULL
)
16889 if ((res
= aarch64_general_mangle_builtin_type (type
))
16890 || (res
= aarch64_sve::mangle_builtin_type (type
)))
16894 /* Use the default mangling. */
/* Implement TARGET_VERIFY_TYPE_CONTEXT.  */

static bool
aarch64_verify_type_context (location_t loc, type_context_kind context,
			     const_tree type, bool silent_p)
{
  return aarch64_sve::verify_type_context (loc, context, type, silent_p);
}

/* Find the first rtx_insn before insn that will generate an assembly
   instruction.  */

static rtx_insn *
aarch64_prev_real_insn (rtx_insn *insn)
{
  if (!insn)
    return NULL;

  do
    {
      insn = prev_real_insn (insn);
    }
  while (insn && recog_memoized (insn) < 0);

  return insn;
}
static bool
is_madd_op (enum attr_type t1)
{
  unsigned int i;
  /* A number of these may be AArch32 only.  */
  enum attr_type mlatypes[] = {
    TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
    TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
    TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
  };

  for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
    {
      if (t1 == mlatypes[i])
	return true;
    }

  return false;
}
/* Check if there is a register dependency between a load and the insn
   for which we hold recog_data.  */

static bool
dep_between_memop_and_curr (rtx memop)
{
  rtx load_reg;
  int opno;

  gcc_assert (GET_CODE (memop) == SET);

  if (!REG_P (SET_DEST (memop)))
    return false;

  load_reg = SET_DEST (memop);
  for (opno = 1; opno < recog_data.n_operands; opno++)
    {
      rtx operand = recog_data.operand[opno];
      if (REG_P (operand)
	  && reg_overlap_mentioned_p (load_reg, operand))
	return true;
    }

  return false;
}
/* When working around the Cortex-A53 erratum 835769,
   given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
   instruction and has a preceding memory instruction such that a NOP
   should be inserted between them.  */

bool
aarch64_madd_needs_nop (rtx_insn *insn)
{
  enum attr_type attr_type;
  rtx_insn *prev;
  rtx body;

  if (!TARGET_FIX_ERR_A53_835769)
    return false;

  if (!INSN_P (insn) || recog_memoized (insn) < 0)
    return false;

  attr_type = get_attr_type (insn);
  if (!is_madd_op (attr_type))
    return false;

  prev = aarch64_prev_real_insn (insn);
  /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
     Restore recog state to INSN to avoid state corruption.  */
  extract_constrain_insn_cached (insn);

  if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
    return false;

  body = single_set (prev);

  /* If the previous insn is a memory op and there is no dependency between
     it and the DImode madd, emit a NOP between them.  If body is NULL then we
     have a complex memory operation, probably a load/store pair.
     Be conservative for now and emit a NOP.  */
  if (GET_MODE (recog_data.operand[0]) == DImode
      && (!body || !dep_between_memop_and_curr (body)))
    return true;

  return false;
}

/* Implement FINAL_PRESCAN_INSN.  */

void
aarch64_final_prescan_insn (rtx_insn *insn)
{
  if (aarch64_madd_needs_nop (insn))
    fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
}
/* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
   instruction.  */

bool
aarch64_sve_index_immediate_p (rtx base_or_step)
{
  return (CONST_INT_P (base_or_step)
	  && IN_RANGE (INTVAL (base_or_step), -16, 15));
}

/* Return true if X is a valid immediate for the SVE ADD and SUB instructions
   when applied to mode MODE.  Negate X first if NEGATE_P is true.  */

bool
aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
{
  rtx elt = unwrap_const_vec_duplicate (x);
  if (!CONST_INT_P (elt))
    return false;

  HOST_WIDE_INT val = INTVAL (elt);
  if (negate_p)
    val = -val;
  val &= GET_MODE_MASK (GET_MODE_INNER (mode));

  if (val & 0xff)
    return IN_RANGE (val, 0, 0xff);
  return IN_RANGE (val, 0, 0xff00);
}
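/* For example, a duplicated constant of 0x2300 is accepted above because its
   low byte is clear and it lies in [0, 0xff00], so it can be encoded as an
   8-bit immediate shifted left by 8, whereas 257 (0x101) is rejected: it is
   neither a plain byte value nor a multiple of 256.  */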
/* Return true if X is a valid immediate for the SVE SQADD and SQSUB
   instructions when applied to mode MODE.  Negate X first if NEGATE_P
   is true.  */

bool
aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
{
  if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
    return false;

  /* After the optional negation, the immediate must be nonnegative.
     E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
     instead of SQADD Zn.B, Zn.B, #129.  */
  rtx elt = unwrap_const_vec_duplicate (x);
  return negate_p == (INTVAL (elt) < 0);
}

/* Return true if X is a valid immediate operand for an SVE logical
   instruction such as AND.  */

bool
aarch64_sve_bitmask_immediate_p (rtx x)
{
  rtx elt;

  return (const_vec_duplicate_p (x, &elt)
	  && CONST_INT_P (elt)
	  && aarch64_bitmask_imm (INTVAL (elt),
				  GET_MODE_INNER (GET_MODE (x))));
}
/* Return true if X is a valid immediate for the SVE DUP and CPY
   instructions.  */

bool
aarch64_sve_dup_immediate_p (rtx x)
{
  x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
  if (!CONST_INT_P (x))
    return false;

  HOST_WIDE_INT val = INTVAL (x);
  if (val & 0xff)
    return IN_RANGE (val, -0x80, 0x7f);
  return IN_RANGE (val, -0x8000, 0x7f00);
}

/* Return true if X is a valid immediate operand for an SVE CMP instruction.
   SIGNED_P says whether the operand is signed rather than unsigned.  */

bool
aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
{
  x = unwrap_const_vec_duplicate (x);
  return (CONST_INT_P (x)
	  && (signed_p
	      ? IN_RANGE (INTVAL (x), -16, 15)
	      : IN_RANGE (INTVAL (x), 0, 127)));
}
/* Return true if X is a valid immediate operand for an SVE FADD or FSUB
   instruction.  Negate X first if NEGATE_P is true.  */

bool
aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
{
  rtx elt;
  REAL_VALUE_TYPE r;

  if (!const_vec_duplicate_p (x, &elt)
      || GET_CODE (elt) != CONST_DOUBLE)
    return false;

  r = *CONST_DOUBLE_REAL_VALUE (elt);

  if (negate_p)
    r = real_value_negate (&r);

  if (real_equal (&r, &dconst1))
    return true;
  if (real_equal (&r, &dconsthalf))
    return true;

  return false;
}

/* Return true if X is a valid immediate operand for an SVE FMUL
   instruction.  */

bool
aarch64_sve_float_mul_immediate_p (rtx x)
{
  rtx elt;

  return (const_vec_duplicate_p (x, &elt)
	  && GET_CODE (elt) == CONST_DOUBLE
	  && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
	      || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
}
17156 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
17157 for the Advanced SIMD operation described by WHICH and INSN. If INFO
17158 is nonnull, use it to describe valid immediates. */
17160 aarch64_advsimd_valid_immediate_hs (unsigned int val32
,
17161 simd_immediate_info
*info
,
17162 enum simd_immediate_check which
,
17163 simd_immediate_info::insn_type insn
)
17165 /* Try a 4-byte immediate with LSL. */
17166 for (unsigned int shift
= 0; shift
< 32; shift
+= 8)
17167 if ((val32
& (0xff << shift
)) == val32
)
17170 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
17171 simd_immediate_info::LSL
, shift
);
17175 /* Try a 2-byte immediate with LSL. */
17176 unsigned int imm16
= val32
& 0xffff;
17177 if (imm16
== (val32
>> 16))
17178 for (unsigned int shift
= 0; shift
< 16; shift
+= 8)
17179 if ((imm16
& (0xff << shift
)) == imm16
)
17182 *info
= simd_immediate_info (HImode
, imm16
>> shift
, insn
,
17183 simd_immediate_info::LSL
, shift
);
17187 /* Try a 4-byte immediate with MSL, except for cases that MVN
17189 if (which
== AARCH64_CHECK_MOV
)
17190 for (unsigned int shift
= 8; shift
< 24; shift
+= 8)
17192 unsigned int low
= (1 << shift
) - 1;
17193 if (((val32
& (0xff << shift
)) | low
) == val32
)
17196 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
17197 simd_immediate_info::MSL
, shift
);
17205 /* Return true if replicating VAL64 is a valid immediate for the
17206 Advanced SIMD operation described by WHICH. If INFO is nonnull,
17207 use it to describe valid immediates. */
17209 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64
,
17210 simd_immediate_info
*info
,
17211 enum simd_immediate_check which
)
17213 unsigned int val32
= val64
& 0xffffffff;
17214 unsigned int val16
= val64
& 0xffff;
17215 unsigned int val8
= val64
& 0xff;
17217 if (val32
== (val64
>> 32))
17219 if ((which
& AARCH64_CHECK_ORR
) != 0
17220 && aarch64_advsimd_valid_immediate_hs (val32
, info
, which
,
17221 simd_immediate_info::MOV
))
17224 if ((which
& AARCH64_CHECK_BIC
) != 0
17225 && aarch64_advsimd_valid_immediate_hs (~val32
, info
, which
,
17226 simd_immediate_info::MVN
))
17229 /* Try using a replicated byte. */
17230 if (which
== AARCH64_CHECK_MOV
17231 && val16
== (val32
>> 16)
17232 && val8
== (val16
>> 8))
17235 *info
= simd_immediate_info (QImode
, val8
);
17240 /* Try using a bit-to-bytemask. */
17241 if (which
== AARCH64_CHECK_MOV
)
17244 for (i
= 0; i
< 64; i
+= 8)
17246 unsigned char byte
= (val64
>> i
) & 0xff;
17247 if (byte
!= 0 && byte
!= 0xff)
17253 *info
= simd_immediate_info (DImode
, val64
);
/* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
   instruction.  If INFO is nonnull, use it to describe valid immediates.  */

static bool
aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
			     simd_immediate_info *info)
{
  scalar_int_mode mode = DImode;
  unsigned int val32 = val64 & 0xffffffff;
  if (val32 == (val64 >> 32))
    {
      mode = SImode;
      unsigned int val16 = val32 & 0xffff;
      if (val16 == (val32 >> 16))
	{
	  mode = HImode;
	  unsigned int val8 = val16 & 0xff;
	  if (val8 == (val16 >> 8))
	    mode = QImode;
	}
    }
  HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
  if (IN_RANGE (val, -0x80, 0x7f))
    {
      /* DUP with no shift.  */
      if (info)
	*info = simd_immediate_info (mode, val);
      return true;
    }
  if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
    {
      /* DUP with LSL #8.  */
      if (info)
	*info = simd_immediate_info (mode, val);
      return true;
    }
  if (aarch64_bitmask_imm (val64, mode))
    {
      /* DUPM.  */
      if (info)
	*info = simd_immediate_info (mode, val);
      return true;
    }
  return false;
}
17306 /* Return true if X is an UNSPEC_PTRUE constant of the form:
17308 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
17310 where PATTERN is the svpattern as a CONST_INT and where ZERO
17311 is a zero constant of the required PTRUE mode (which can have
17312 fewer elements than X's mode, if zero bits are significant).
17314 If so, and if INFO is nonnull, describe the immediate in INFO. */
17316 aarch64_sve_ptrue_svpattern_p (rtx x
, struct simd_immediate_info
*info
)
17318 if (GET_CODE (x
) != CONST
)
17322 if (GET_CODE (x
) != UNSPEC
|| XINT (x
, 1) != UNSPEC_PTRUE
)
17327 aarch64_svpattern pattern
17328 = (aarch64_svpattern
) INTVAL (XVECEXP (x
, 0, 0));
17329 machine_mode pred_mode
= GET_MODE (XVECEXP (x
, 0, 1));
17330 scalar_int_mode int_mode
= aarch64_sve_element_int_mode (pred_mode
);
17331 *info
= simd_immediate_info (int_mode
, pattern
);
17336 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
17337 it to describe valid immediates. */
17340 aarch64_sve_pred_valid_immediate (rtx x
, simd_immediate_info
*info
)
17342 if (aarch64_sve_ptrue_svpattern_p (x
, info
))
17345 if (x
== CONST0_RTX (GET_MODE (x
)))
17348 *info
= simd_immediate_info (DImode
, 0);
17352 /* Analyze the value as a VNx16BImode. This should be relatively
17353 efficient, since rtx_vector_builder has enough built-in capacity
17354 to store all VLA predicate constants without needing the heap. */
17355 rtx_vector_builder builder
;
17356 if (!aarch64_get_sve_pred_bits (builder
, x
))
17359 unsigned int elt_size
= aarch64_widest_sve_pred_elt_size (builder
);
17360 if (int vl
= aarch64_partial_ptrue_length (builder
, elt_size
))
17362 machine_mode mode
= aarch64_sve_pred_mode (elt_size
).require ();
17363 aarch64_svpattern pattern
= aarch64_svpattern_for_vl (mode
, vl
);
17364 if (pattern
!= AARCH64_NUM_SVPATTERNS
)
17368 scalar_int_mode int_mode
= aarch64_sve_element_int_mode (mode
);
17369 *info
= simd_immediate_info (int_mode
, pattern
);
17377 /* Return true if OP is a valid SIMD immediate for the operation
17378 described by WHICH. If INFO is nonnull, use it to describe valid
17381 aarch64_simd_valid_immediate (rtx op
, simd_immediate_info
*info
,
17382 enum simd_immediate_check which
)
17384 machine_mode mode
= GET_MODE (op
);
17385 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
17386 if (vec_flags
== 0 || vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
17389 if (vec_flags
& VEC_SVE_PRED
)
17390 return aarch64_sve_pred_valid_immediate (op
, info
);
17392 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
17394 unsigned int n_elts
;
17395 if (GET_CODE (op
) == CONST_VECTOR
17396 && CONST_VECTOR_DUPLICATE_P (op
))
17397 n_elts
= CONST_VECTOR_NPATTERNS (op
);
17398 else if ((vec_flags
& VEC_SVE_DATA
)
17399 && const_vec_series_p (op
, &base
, &step
))
17401 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
);
17402 if (!aarch64_sve_index_immediate_p (base
)
17403 || !aarch64_sve_index_immediate_p (step
))
17408 /* Get the corresponding container mode. E.g. an INDEX on V2SI
17409 should yield two integer values per 128-bit block, meaning
17410 that we need to treat it in the same way as V2DI and then
17411 ignore the upper 32 bits of each element. */
17412 elt_mode
= aarch64_sve_container_int_mode (mode
);
17413 *info
= simd_immediate_info (elt_mode
, base
, step
);
17417 else if (GET_CODE (op
) == CONST_VECTOR
17418 && CONST_VECTOR_NUNITS (op
).is_constant (&n_elts
))
17419 /* N_ELTS set above. */;
17423 scalar_float_mode elt_float_mode
;
17425 && is_a
<scalar_float_mode
> (elt_mode
, &elt_float_mode
))
17427 rtx elt
= CONST_VECTOR_ENCODED_ELT (op
, 0);
17428 if (aarch64_float_const_zero_rtx_p (elt
)
17429 || aarch64_float_const_representable_p (elt
))
17432 *info
= simd_immediate_info (elt_float_mode
, elt
);
17437 /* If all elements in an SVE vector have the same value, we have a free
17438 choice between using the element mode and using the container mode.
17439 Using the element mode means that unused parts of the vector are
17440 duplicates of the used elements, while using the container mode means
17441 that the unused parts are an extension of the used elements. Using the
17442 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
17443 for its container mode VNx4SI while 0x00000101 isn't.
17445 If not all elements in an SVE vector have the same value, we need the
17446 transition from one element to the next to occur at container boundaries.
17447 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
17448 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
17449 scalar_int_mode elt_int_mode
;
17450 if ((vec_flags
& VEC_SVE_DATA
) && n_elts
> 1)
17451 elt_int_mode
= aarch64_sve_container_int_mode (mode
);
17453 elt_int_mode
= int_mode_for_mode (elt_mode
).require ();
17455 unsigned int elt_size
= GET_MODE_SIZE (elt_int_mode
);
17459 /* Expand the vector constant out into a byte vector, with the least
17460 significant byte of the register first. */
17461 auto_vec
<unsigned char, 16> bytes
;
17462 bytes
.reserve (n_elts
* elt_size
);
17463 for (unsigned int i
= 0; i
< n_elts
; i
++)
17465 /* The vector is provided in gcc endian-neutral fashion.
17466 For aarch64_be Advanced SIMD, it must be laid out in the vector
17467 register in reverse order. */
17468 bool swap_p
= ((vec_flags
& VEC_ADVSIMD
) != 0 && BYTES_BIG_ENDIAN
);
17469 rtx elt
= CONST_VECTOR_ELT (op
, swap_p
? (n_elts
- 1 - i
) : i
);
17471 if (elt_mode
!= elt_int_mode
)
17472 elt
= gen_lowpart (elt_int_mode
, elt
);
17474 if (!CONST_INT_P (elt
))
17477 unsigned HOST_WIDE_INT elt_val
= INTVAL (elt
);
17478 for (unsigned int byte
= 0; byte
< elt_size
; byte
++)
17480 bytes
.quick_push (elt_val
& 0xff);
17481 elt_val
>>= BITS_PER_UNIT
;
17485 /* The immediate must repeat every eight bytes. */
17486 unsigned int nbytes
= bytes
.length ();
17487 for (unsigned i
= 8; i
< nbytes
; ++i
)
17488 if (bytes
[i
] != bytes
[i
- 8])
17491 /* Get the repeating 8-byte value as an integer. No endian correction
17492 is needed here because bytes is already in lsb-first order. */
17493 unsigned HOST_WIDE_INT val64
= 0;
17494 for (unsigned int i
= 0; i
< 8; i
++)
17495 val64
|= ((unsigned HOST_WIDE_INT
) bytes
[i
% nbytes
]
17496 << (i
* BITS_PER_UNIT
));
17498 if (vec_flags
& VEC_SVE_DATA
)
17499 return aarch64_sve_valid_immediate (val64
, info
);
17501 return aarch64_advsimd_valid_immediate (val64
, info
, which
);
/* Check whether X is a VEC_SERIES-like constant that starts at 0 and
   has a step in the range of INDEX.  Return the index expression if so,
   otherwise return null.  */
rtx
aarch64_check_zero_based_sve_index_immediate (rtx x)
{
  rtx base, step;
  if (const_vec_series_p (x, &base, &step)
      && base == const0_rtx
      && aarch64_sve_index_immediate_p (step))
    return step;
  return NULL_RTX;
}

/* Check whether immediate shift constants are within range.  */
bool
aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
{
  x = unwrap_const_vec_duplicate (x);
  if (!CONST_INT_P (x))
    return false;
  int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
  if (left)
    return IN_RANGE (INTVAL (x), 0, bit_width - 1);
  else
    return IN_RANGE (INTVAL (x), 1, bit_width);
}
/* Return the bitmask CONST_INT to select the bits required by a zero extract
   operation of width WIDTH at bit position POS.  */

rtx
aarch64_mask_from_zextract_ops (rtx width, rtx pos)
{
  gcc_assert (CONST_INT_P (width));
  gcc_assert (CONST_INT_P (pos));

  unsigned HOST_WIDE_INT mask
    = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
  return GEN_INT (mask << UINTVAL (pos));
}
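/* As an illustration: for WIDTH == 4 and POS == 8 the mask is
   (1 << 4) - 1 == 0xf, shifted left by 8 to give 0xf00, i.e. exactly the
   four bits that the zero_extract reads.  */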
bool
aarch64_mov_operand_p (rtx x, machine_mode mode)
{
  if (GET_CODE (x) == HIGH
      && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
    return true;

  if (CONST_INT_P (x))
    return true;

  if (VECTOR_MODE_P (GET_MODE (x)))
    {
      /* Require predicate constants to be VNx16BI before RA, so that we
	 force everything to have a canonical form.  */
      if (!lra_in_progress
	  && !reload_completed
	  && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
	  && GET_MODE (x) != VNx16BImode)
	return false;

      return aarch64_simd_valid_immediate (x, NULL);
    }

  if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
    return true;

  if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
    return true;

  return aarch64_classify_symbolic_expression (x)
    == SYMBOL_TINY_ABSOLUTE;
}
/* Return a const_int vector of VAL.  */
rtx
aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
{
  rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
  return gen_const_vec_duplicate (mode, c);
}

/* Check OP is a legal scalar immediate for the MOVI instruction.  */

bool
aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
{
  machine_mode vmode;

  vmode = aarch64_simd_container_mode (mode, 64);
  rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
  return aarch64_simd_valid_immediate (op_v, NULL);
}
17599 /* Construct and return a PARALLEL RTX vector with elements numbering the
17600 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
17601 the vector - from the perspective of the architecture. This does not
17602 line up with GCC's perspective on lane numbers, so we end up with
17603 different masks depending on our target endian-ness. The diagram
17604 below may help. We must draw the distinction when building masks
17605 which select one half of the vector. An instruction selecting
17606 architectural low-lanes for a big-endian target, must be described using
17607 a mask selecting GCC high-lanes.
17609 Big-Endian Little-Endian
17611 GCC 0 1 2 3 3 2 1 0
17612 | x | x | x | x | | x | x | x | x |
17613 Architecture 3 2 1 0 3 2 1 0
17615 Low Mask: { 2, 3 } { 0, 1 }
17616 High Mask: { 0, 1 } { 2, 3 }
17618 MODE Is the mode of the vector and NUNITS is the number of units in it. */
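   Worked example for V4SImode (NUNITS == 4): asking for the architectural
   low half (HIGH == false) yields (parallel [0 1]) on little-endian but
   (parallel [2 3]) on big-endian, matching the "Low Mask" row above.  */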
17621 aarch64_simd_vect_par_cnst_half (machine_mode mode
, int nunits
, bool high
)
17623 rtvec v
= rtvec_alloc (nunits
/ 2);
17624 int high_base
= nunits
/ 2;
17630 if (BYTES_BIG_ENDIAN
)
17631 base
= high
? low_base
: high_base
;
17633 base
= high
? high_base
: low_base
;
17635 for (i
= 0; i
< nunits
/ 2; i
++)
17636 RTVEC_ELT (v
, i
) = GEN_INT (base
+ i
);
17638 t1
= gen_rtx_PARALLEL (mode
, v
);
17642 /* Check OP for validity as a PARALLEL RTX vector with elements
17643 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
17644 from the perspective of the architecture. See the diagram above
17645 aarch64_simd_vect_par_cnst_half for more details. */
17648 aarch64_simd_check_vect_par_cnst_half (rtx op
, machine_mode mode
,
17652 if (!VECTOR_MODE_P (mode
) || !GET_MODE_NUNITS (mode
).is_constant (&nelts
))
17655 rtx ideal
= aarch64_simd_vect_par_cnst_half (mode
, nelts
, high
);
17656 HOST_WIDE_INT count_op
= XVECLEN (op
, 0);
17657 HOST_WIDE_INT count_ideal
= XVECLEN (ideal
, 0);
17660 if (count_op
!= count_ideal
)
17663 for (i
= 0; i
< count_ideal
; i
++)
17665 rtx elt_op
= XVECEXP (op
, 0, i
);
17666 rtx elt_ideal
= XVECEXP (ideal
, 0, i
);
17668 if (!CONST_INT_P (elt_op
)
17669 || INTVAL (elt_ideal
) != INTVAL (elt_op
))
/* Return a PARALLEL containing NELTS elements, with element I equal
   to BASE + I * STEP.  */

rtx
aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
{
  rtvec vec = rtvec_alloc (nelts);
  for (unsigned int i = 0; i < nelts; ++i)
    RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
  return gen_rtx_PARALLEL (VOIDmode, vec);
}

/* Return true if OP is a PARALLEL of CONST_INTs that form a linear
   series with step STEP.  */

bool
aarch64_stepped_int_parallel_p (rtx op, int step)
{
  if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
    return false;

  unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
  for (int i = 1; i < XVECLEN (op, 0); ++i)
    if (!CONST_INT_P (XVECEXP (op, 0, i))
	|| UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
      return false;

  return true;
}
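/* For example, aarch64_gen_stepped_int_parallel (4, 1, 2) yields
   (parallel [1 3 5 7]), and aarch64_stepped_int_parallel_p accepts that
   rtx for STEP == 2 but rejects it for any other step.  */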
/* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
   HIGH (exclusive).  */
void
aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
			  const_tree exp)
{
  HOST_WIDE_INT lane;
  gcc_assert (CONST_INT_P (operand));
  lane = INTVAL (operand);

  if (lane < low || lane >= high)
    {
      if (exp)
	error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
      else
	error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
    }
}

/* Perform endian correction on lane number N, which indexes a vector
   of mode MODE, and return the result as an SImode rtx.  */

rtx
aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
{
  return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
}
/* Return TRUE if OP is a valid vector addressing mode.  */

bool
aarch64_simd_mem_operand_p (rtx op)
{
  return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
			|| REG_P (XEXP (op, 0)));
}

/* Return true if OP is a valid MEM operand for an SVE LD1R instruction.  */

bool
aarch64_sve_ld1r_operand_p (rtx op)
{
  struct aarch64_address_info addr;
  scalar_mode mode;

  return (MEM_P (op)
	  && is_a <scalar_mode> (GET_MODE (op), &mode)
	  && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
	  && addr.type == ADDRESS_REG_IMM
	  && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
}
/* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
   where the size of the read data is specified by `mode` and the size of the
   vector elements are specified by `elem_mode`.  */

static bool
aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
				   scalar_mode elem_mode)
{
  struct aarch64_address_info addr;
  if (!MEM_P (op)
      || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
    return false;

  if (addr.type == ADDRESS_REG_IMM)
    return offset_4bit_signed_scaled_p (mode, addr.const_offset);

  if (addr.type == ADDRESS_REG_REG)
    return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);

  return false;
}

/* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction.  */
bool
aarch64_sve_ld1rq_operand_p (rtx op)
{
  return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
					    GET_MODE_INNER (GET_MODE (op)));
}

/* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
   accessing a vector where the element size is specified by `elem_mode`.  */
bool
aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
{
  return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
}
/* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction.  */
bool
aarch64_sve_ldff1_operand_p (rtx op)
{
  if (!MEM_P (op))
    return false;

  struct aarch64_address_info addr;
  if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
    return false;

  if (addr.type == ADDRESS_REG_IMM)
    return known_eq (addr.const_offset, 0);

  return addr.type == ADDRESS_REG_REG;
}

/* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction.  */
bool
aarch64_sve_ldnf1_operand_p (rtx op)
{
  struct aarch64_address_info addr;

  return (MEM_P (op)
	  && aarch64_classify_address (&addr, XEXP (op, 0),
				       GET_MODE (op), false)
	  && addr.type == ADDRESS_REG_IMM);
}

/* Return true if OP is a valid MEM operand for an SVE LDR instruction.
   The conditions for STR are the same.  */
bool
aarch64_sve_ldr_operand_p (rtx op)
{
  struct aarch64_address_info addr;

  return (MEM_P (op)
	  && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
				       false, ADDR_QUERY_ANY)
	  && addr.type == ADDRESS_REG_IMM);
}
/* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
   addressing memory of mode MODE.  */
bool
aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
{
  struct aarch64_address_info addr;
  if (!aarch64_classify_address (&addr, op, mode, false))
    return false;

  if (addr.type == ADDRESS_REG_IMM)
    return known_eq (addr.const_offset, 0);

  return addr.type == ADDRESS_REG_REG;
}

/* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
   We need to be able to access the individual pieces, so the range
   is different from LD[234] and ST[234].  */
bool
aarch64_sve_struct_memory_operand_p (rtx op)
{
  if (!MEM_P (op))
    return false;

  machine_mode mode = GET_MODE (op);
  struct aarch64_address_info addr;
  if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
				 ADDR_QUERY_ANY)
      || addr.type != ADDRESS_REG_IMM)
    return false;

  poly_int64 first = addr.const_offset;
  poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
  return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
	  && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
}
/* Emit a register copy from operand to operand, taking care not to
   early-clobber source registers in the process.

   COUNT is the number of components into which the copy needs to be
   decomposed.  */
void
aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
				unsigned int count)
{
  unsigned int i;
  int rdest = REGNO (operands[0]);
  int rsrc = REGNO (operands[1]);

  if (!reg_overlap_mentioned_p (operands[0], operands[1])
      || rdest < rsrc)
    for (i = 0; i < count; i++)
      emit_move_insn (gen_rtx_REG (mode, rdest + i),
		      gen_rtx_REG (mode, rsrc + i));
  else
    for (i = 0; i < count; i++)
      emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
		      gen_rtx_REG (mode, rsrc + count - i - 1));
}

/* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
   one of VSTRUCT modes: OI, CI, or XI.  */
int
aarch64_simd_attr_length_rglist (machine_mode mode)
{
  /* This is only used (and only meaningful) for Advanced SIMD, not SVE.  */
  return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
}
/* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
   alignment of a vector to 128 bits.  SVE predicates have an alignment of
   16 bits.  */
static HOST_WIDE_INT
aarch64_simd_vector_alignment (const_tree type)
{
  /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
     be set for non-predicate vectors of booleans.  Modes are the most
     direct way we have of identifying real SVE predicate types.  */
  if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
    return 16;
  widest_int min_size
    = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
  return wi::umin (min_size, 128).to_uhwi ();
}

/* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.  */
static poly_uint64
aarch64_vectorize_preferred_vector_alignment (const_tree type)
{
  if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
    {
      /* If the length of the vector is fixed, try to align to that length,
	 otherwise don't try to align at all.  */
      HOST_WIDE_INT result;
      if (!BITS_PER_SVE_VECTOR.is_constant (&result))
	result = TYPE_ALIGN (TREE_TYPE (type));
      return result;
    }
  return TYPE_ALIGN (type);
}
/* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
static bool
aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
{
  if (is_packed)
    return false;

  /* For fixed-length vectors, check that the vectorizer will aim for
     full-vector alignment.  This isn't true for generic GCC vectors
     that are wider than the ABI maximum of 128 bits.  */
  poly_uint64 preferred_alignment =
    aarch64_vectorize_preferred_vector_alignment (type);
  if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
      && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
		   preferred_alignment))
    return false;

  /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
  return true;
}

/* Return true if the vector misalignment factor is supported by the
   target.  */
static bool
aarch64_builtin_support_vector_misalignment (machine_mode mode,
					     const_tree type, int misalignment,
					     bool is_packed)
{
  if (TARGET_SIMD && STRICT_ALIGNMENT)
    {
      /* Return if movmisalign pattern is not supported for this mode.  */
      if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
	return false;

      /* Misalignment factor is unknown at compile time.  */
      if (misalignment == -1)
	return false;
    }
  return default_builtin_support_vector_misalignment (mode, type, misalignment,
						      is_packed);
}
/* If VALS is a vector constant that can be loaded into a register
   using DUP, generate instructions to do so and return an RTX to
   assign to the register.  Otherwise return NULL_RTX.  */
static rtx
aarch64_simd_dup_constant (rtx vals)
{
  machine_mode mode = GET_MODE (vals);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  rtx x;

  if (!const_vec_duplicate_p (vals, &x))
    return NULL_RTX;

  /* We can load this constant by using DUP and a constant in a
     single ARM register.  This will be cheaper than a vector
     load.  */
  x = copy_to_mode_reg (inner_mode, x);
  return gen_vec_duplicate (mode, x);
}
18001 /* Generate code to load VALS, which is a PARALLEL containing only
18002 constants (for vec_init) or CONST_VECTOR, efficiently into a
18003 register. Returns an RTX to copy into the register, or NULL_RTX
18004 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
18006 aarch64_simd_make_constant (rtx vals
)
18008 machine_mode mode
= GET_MODE (vals
);
18010 rtx const_vec
= NULL_RTX
;
18014 if (GET_CODE (vals
) == CONST_VECTOR
)
18016 else if (GET_CODE (vals
) == PARALLEL
)
18018 /* A CONST_VECTOR must contain only CONST_INTs and
18019 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
18020 Only store valid constants in a CONST_VECTOR. */
18021 int n_elts
= XVECLEN (vals
, 0);
18022 for (i
= 0; i
< n_elts
; ++i
)
18024 rtx x
= XVECEXP (vals
, 0, i
);
18025 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
18028 if (n_const
== n_elts
)
18029 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0));
18032 gcc_unreachable ();
18034 if (const_vec
!= NULL_RTX
18035 && aarch64_simd_valid_immediate (const_vec
, NULL
))
18036 /* Load using MOVI/MVNI. */
18038 else if ((const_dup
= aarch64_simd_dup_constant (vals
)) != NULL_RTX
)
18039 /* Loaded using DUP. */
18041 else if (const_vec
!= NULL_RTX
)
18042 /* Load from constant pool. We cannot take advantage of single-cycle
18043 LD1 because we need a PC-relative addressing mode. */
18046 /* A PARALLEL containing something not valid inside CONST_VECTOR.
18047 We cannot construct an initializer. */
18051 /* Expand a vector initialisation sequence, such that TARGET is
18052 initialised to contain VALS. */
18055 aarch64_expand_vector_init (rtx target
, rtx vals
)
18057 machine_mode mode
= GET_MODE (target
);
18058 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
18059 /* The number of vector elements. */
18060 int n_elts
= XVECLEN (vals
, 0);
18061 /* The number of vector elements which are not constant. */
18063 rtx any_const
= NULL_RTX
;
18064 /* The first element of vals. */
18065 rtx v0
= XVECEXP (vals
, 0, 0);
18066 bool all_same
= true;
18068 /* This is a special vec_init<M><N> where N is not an element mode but a
18069 vector mode with half the elements of M. We expect to find two entries
18070 of mode N in VALS and we must put their concatentation into TARGET. */
18071 if (XVECLEN (vals
, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals
, 0, 0))))
18073 gcc_assert (known_eq (GET_MODE_SIZE (mode
),
18074 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals
, 0, 0)))));
18075 rtx lo
= XVECEXP (vals
, 0, 0);
18076 rtx hi
= XVECEXP (vals
, 0, 1);
18077 machine_mode narrow_mode
= GET_MODE (lo
);
18078 gcc_assert (GET_MODE_INNER (narrow_mode
) == inner_mode
);
18079 gcc_assert (narrow_mode
== GET_MODE (hi
));
18081 /* When we want to concatenate a half-width vector with zeroes we can
18082 use the aarch64_combinez[_be] patterns. Just make sure that the
18083 zeroes are in the right half. */
18084 if (BYTES_BIG_ENDIAN
18085 && aarch64_simd_imm_zero (lo
, narrow_mode
)
18086 && general_operand (hi
, narrow_mode
))
18087 emit_insn (gen_aarch64_combinez_be (narrow_mode
, target
, hi
, lo
));
18088 else if (!BYTES_BIG_ENDIAN
18089 && aarch64_simd_imm_zero (hi
, narrow_mode
)
18090 && general_operand (lo
, narrow_mode
))
18091 emit_insn (gen_aarch64_combinez (narrow_mode
, target
, lo
, hi
));
18094 /* Else create the two half-width registers and combine them. */
18096 lo
= force_reg (GET_MODE (lo
), lo
);
18098 hi
= force_reg (GET_MODE (hi
), hi
);
18100 if (BYTES_BIG_ENDIAN
)
18101 std::swap (lo
, hi
);
18102 emit_insn (gen_aarch64_simd_combine (narrow_mode
, target
, lo
, hi
));
18107 /* Count the number of variable elements to initialise. */
18108 for (int i
= 0; i
< n_elts
; ++i
)
18110 rtx x
= XVECEXP (vals
, 0, i
);
18111 if (!(CONST_INT_P (x
) || CONST_DOUBLE_P (x
)))
18116 all_same
&= rtx_equal_p (x
, v0
);
18119 /* No variable elements, hand off to aarch64_simd_make_constant which knows
18120 how best to handle this. */
18123 rtx constant
= aarch64_simd_make_constant (vals
);
18124 if (constant
!= NULL_RTX
)
18126 emit_move_insn (target
, constant
);
18131 /* Splat a single non-constant element if we can. */
18134 rtx x
= copy_to_mode_reg (inner_mode
, v0
);
18135 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
18139 enum insn_code icode
= optab_handler (vec_set_optab
, mode
);
18140 gcc_assert (icode
!= CODE_FOR_nothing
);
18142 /* If there are only variable elements, try to optimize
18143 the insertion using dup for the most common element
18144 followed by insertions. */
18146 /* The algorithm will fill matches[*][0] with the earliest matching element,
18147 and matches[X][1] with the count of duplicate elements (if X is the
18148 earliest element which has duplicates). */
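   Worked example (added commentary, not from the original source): for
   VALS = {A, B, A, A}, matches[*][0] = {0, 1, 0, 0} (each element points
   at its earliest duplicate), matches[0][1] = 3 and matches[1][1] = 1,
   so element 0 (A) is chosen as the most common element to duplicate
   before inserting B into lane 1.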
18150 if (n_var
== n_elts
&& n_elts
<= 16)
18152 int matches
[16][2] = {0};
18153 for (int i
= 0; i
< n_elts
; i
++)
18155 for (int j
= 0; j
<= i
; j
++)
18157 if (rtx_equal_p (XVECEXP (vals
, 0, i
), XVECEXP (vals
, 0, j
)))
18165 int maxelement
= 0;
18167 for (int i
= 0; i
< n_elts
; i
++)
18168 if (matches
[i
][1] > maxv
)
18171 maxv
= matches
[i
][1];
18174 /* Create a duplicate of the most common element, unless all elements
18175 are equally useless to us, in which case just immediately set the
18176 vector register using the first element. */
18180 /* For vectors of two 64-bit elements, we can do even better. */
18182 && (inner_mode
== E_DImode
18183 || inner_mode
== E_DFmode
))
18186 rtx x0
= XVECEXP (vals
, 0, 0);
18187 rtx x1
= XVECEXP (vals
, 0, 1);
18188 /* Combine can pick up this case, but handling it directly
18189 here leaves clearer RTL.
18191 This is load_pair_lanes<mode>, and also gives us a clean-up
18192 for store_pair_lanes<mode>. */
18193 if (memory_operand (x0
, inner_mode
)
18194 && memory_operand (x1
, inner_mode
)
18195 && !STRICT_ALIGNMENT
18196 && rtx_equal_p (XEXP (x1
, 0),
18197 plus_constant (Pmode
,
18199 GET_MODE_SIZE (inner_mode
))))
18202 if (inner_mode
== DFmode
)
18203 t
= gen_load_pair_lanesdf (target
, x0
, x1
);
18205 t
= gen_load_pair_lanesdi (target
, x0
, x1
);
18210 /* The subreg-move sequence below will move into lane zero of the
18211 vector register. For big-endian we want that position to hold
18212 the last element of VALS. */
18213 maxelement
= BYTES_BIG_ENDIAN
? n_elts
- 1 : 0;
18214 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
18215 aarch64_emit_move (target
, lowpart_subreg (mode
, x
, inner_mode
));
18219 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
18220 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
18223 /* Insert the rest. */
18224 for (int i
= 0; i
< n_elts
; i
++)
18226 rtx x
= XVECEXP (vals
, 0, i
);
18227 if (matches
[i
][0] == maxelement
)
18229 x
= copy_to_mode_reg (inner_mode
, x
);
18230 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
18235 /* Initialise a vector which is part-variable. We want to first try
18236 to build those lanes which are constant in the most efficient way we
18238 if (n_var
!= n_elts
)
18240 rtx copy
= copy_rtx (vals
);
18242 /* Load constant part of vector. We really don't care what goes into the
18243 parts we will overwrite, but we're more likely to be able to load the
18244 constant efficiently if it has fewer, larger, repeating parts
18245 (see aarch64_simd_valid_immediate). */
18246 for (int i
= 0; i
< n_elts
; i
++)
18248 rtx x
= XVECEXP (vals
, 0, i
);
18249 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
18251 rtx subst
= any_const
;
18252 for (int bit
= n_elts
/ 2; bit
> 0; bit
/= 2)
18254 /* Look in the copied vector, as more elements are const. */
18255 rtx test
= XVECEXP (copy
, 0, i
^ bit
);
18256 if (CONST_INT_P (test
) || CONST_DOUBLE_P (test
))
18262 XVECEXP (copy
, 0, i
) = subst
;
18264 aarch64_expand_vector_init (target
, copy
);
18267 /* Insert the variable lanes directly. */
18268 for (int i
= 0; i
< n_elts
; i
++)
18270 rtx x
= XVECEXP (vals
, 0, i
);
18271 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
18273 x
= copy_to_mode_reg (inner_mode
, x
);
18274 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
/* Emit RTL corresponding to:
   insr TARGET, ELEM.  */

static void
emit_insr (rtx target, rtx elem)
{
  machine_mode mode = GET_MODE (target);
  scalar_mode elem_mode = GET_MODE_INNER (mode);
  elem = force_reg (elem_mode, elem);

  insn_code icode = optab_handler (vec_shl_insert_optab, mode);
  gcc_assert (icode != CODE_FOR_nothing);
  emit_insn (GEN_FCN (icode) (target, target, elem));
}
/* Subroutine of aarch64_sve_expand_vector_init for handling
   trailing constants.
   This function works as follows:
   (a) Create a new vector consisting of trailing constants.
   (b) Initialize TARGET with the constant vector using emit_move_insn.
   (c) Insert remaining elements in TARGET using insr.
   NELTS is the total number of elements in original vector while
   NELTS_REQD is the number of elements that are actually significant.

   ??? The heuristic used is to do above only if number of constants
   is at least half the total number of elements.  May need fine tuning.  */

static bool
aarch64_sve_expand_vector_init_handle_trailing_constants
 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
{
  machine_mode mode = GET_MODE (target);
  scalar_mode elem_mode = GET_MODE_INNER (mode);
  int n_trailing_constants = 0;

  for (int i = nelts_reqd - 1;
       i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
       i--)
    n_trailing_constants++;

  if (n_trailing_constants >= nelts_reqd / 2)
    {
      rtx_vector_builder v (mode, 1, nelts);
      for (int i = 0; i < nelts; i++)
	v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
      rtx const_vec = v.build ();
      emit_move_insn (target, const_vec);

      for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
	emit_insr (target, builder.elt (i));

      return true;
    }

  return false;
}
/* Subroutine of aarch64_sve_expand_vector_init.
   Works as follows:
   (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
   (b) Skip trailing elements from BUILDER, which are the same as
       element NELTS_REQD - 1.
   (c) Insert earlier elements in reverse order in TARGET using insr.  */

static void
aarch64_sve_expand_vector_init_insert_elems (rtx target,
					     const rtx_vector_builder &builder,
					     int nelts_reqd)
{
  machine_mode mode = GET_MODE (target);
  scalar_mode elem_mode = GET_MODE_INNER (mode);

  struct expand_operand ops[2];
  enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
  gcc_assert (icode != CODE_FOR_nothing);

  create_output_operand (&ops[0], target, mode);
  create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
  expand_insn (icode, 2, ops);

  int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
  for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
    emit_insr (target, builder.elt (i));
}
/* Subroutine of aarch64_sve_expand_vector_init to handle case
   when all trailing elements of builder are same.
   This works as follows:
   (a) Use expand_insn interface to broadcast last vector element in TARGET.
   (b) Insert remaining elements in TARGET using insr.

   ??? The heuristic used is to do above if number of same trailing elements
   is at least 3/4 of total number of elements, loosely based on
   heuristic from mostly_zeros_p.  May need fine-tuning.  */

static bool
aarch64_sve_expand_vector_init_handle_trailing_same_elem
 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
{
  int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
  if (ndups >= (3 * nelts_reqd) / 4)
    {
      aarch64_sve_expand_vector_init_insert_elems (target, builder,
						   nelts_reqd - ndups + 1);
      return true;
    }

  return false;
}
/* Initialize register TARGET from BUILDER.  NELTS is the constant number
   of elements in BUILDER.

   The function tries to initialize TARGET from BUILDER if it fits one
   of the special cases outlined below.

   Failing that, the function divides BUILDER into two sub-vectors:
   v_even = even elements of BUILDER;
   v_odd = odd elements of BUILDER;

   and recursively calls itself with v_even and v_odd.

   if (recursive call succeeded for v_even or v_odd)
     TARGET = zip (v_even, v_odd)

   The function returns true if it managed to build TARGET from BUILDER
   with one of the special cases, false otherwise.

   Example: {a, 1, b, 2, c, 3, d, 4}

   The vector gets divided into:
   v_even = {a, b, c, d}
   v_odd = {1, 2, 3, 4}

   aarch64_sve_expand_vector_init(v_odd) hits case 1 and
   initializes tmp2 from constant vector v_odd using emit_move_insn.

   aarch64_sve_expand_vector_init(v_even) fails since v_even contains
   4 elements, so we construct tmp1 from v_even using insr:
   tmp1 = dup(d)
   insr tmp1, c
   insr tmp1, b
   insr tmp1, a

   And finally:
   TARGET = zip (tmp1, tmp2)
   which sets TARGET to {a, 1, b, 2, c, 3, d, 4}.  */

static bool
aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
				int nelts, int nelts_reqd)
{
  machine_mode mode = GET_MODE (target);

  /* Case 1: Vector contains trailing constants.  */

  if (aarch64_sve_expand_vector_init_handle_trailing_constants
      (target, builder, nelts, nelts_reqd))
    return true;

  /* Case 2: Vector contains leading constants.  */

  rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
  for (int i = 0; i < nelts_reqd; i++)
    rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
  rev_builder.finalize ();

  if (aarch64_sve_expand_vector_init_handle_trailing_constants
      (target, rev_builder, nelts, nelts_reqd))
    {
      emit_insn (gen_aarch64_sve_rev (mode, target, target));
      return true;
    }

  /* Case 3: Vector contains trailing same element.  */

  if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
      (target, builder, nelts_reqd))
    return true;

  /* Case 4: Vector contains leading same element.  */

  if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
      (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
    {
      emit_insn (gen_aarch64_sve_rev (mode, target, target));
      return true;
    }

  /* Avoid recursing below 4-elements.
     ??? The threshold 4 may need fine-tuning.  */

  if (nelts_reqd <= 4)
    return false;

  rtx_vector_builder v_even (mode, 1, nelts);
  rtx_vector_builder v_odd (mode, 1, nelts);

  for (int i = 0; i < nelts * 2; i += 2)
    {
      v_even.quick_push (builder.elt (i));
      v_odd.quick_push (builder.elt (i + 1));
    }

  v_even.finalize ();
  v_odd.finalize ();

  rtx tmp1 = gen_reg_rtx (mode);
  bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
						    nelts, nelts_reqd / 2);

  rtx tmp2 = gen_reg_rtx (mode);
  bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
						   nelts, nelts_reqd / 2);

  if (!did_even_p && !did_odd_p)
    return false;

  /* Initialize v_even and v_odd using INSR if it didn't match any of the
     special cases and zip v_even, v_odd.  */

  if (!did_even_p)
    aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);

  if (!did_odd_p)
    aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);

  rtvec v = gen_rtvec (2, tmp1, tmp2);
  emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));

  return true;
}
/* Initialize register TARGET from the elements in PARALLEL rtx VALS.  */

void
aarch64_sve_expand_vector_init (rtx target, rtx vals)
{
  machine_mode mode = GET_MODE (target);
  int nelts = XVECLEN (vals, 0);

  rtx_vector_builder v (mode, 1, nelts);
  for (int i = 0; i < nelts; i++)
    v.quick_push (XVECEXP (vals, 0, i));
  v.finalize ();

  /* If neither sub-vectors of v could be initialized specially,
     then use INSR to insert all elements from v into TARGET.
     ??? This might not be optimal for vectors with large
     initializers like 16-element or above.
     For nelts < 4, it probably isn't useful to handle specially.  */

  if (nelts < 4
      || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
    aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
}
18535 /* Check whether VALUE is a vector constant in which every element
18536 is either a power of 2 or a negated power of 2. If so, return
18537 a constant vector of log2s, and flip CODE between PLUS and MINUS
18538 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
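   Illustrative example (added commentary, not from the original source):
   for a multiply-add x + y * {4, 4, 4, 4}, VALUE = {4, 4, 4, 4} yields
   the shift vector {2, 2, 2, 2} and CODE stays PLUS, so the operation
   can be expanded as x + (y << 2).  For y * {-8, -8, -8, -8} the result
   is {3, 3, 3, 3} and CODE flips from PLUS to MINUS, i.e. x - (y << 3).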
18541 aarch64_convert_mult_to_shift (rtx value
, rtx_code
&code
)
18543 if (GET_CODE (value
) != CONST_VECTOR
)
18546 rtx_vector_builder builder
;
18547 if (!builder
.new_unary_operation (GET_MODE (value
), value
, false))
18550 scalar_mode int_mode
= GET_MODE_INNER (GET_MODE (value
));
18551 /* 1 if the result of the multiplication must be negated,
18552 0 if it mustn't, or -1 if we don't yet care. */
18554 unsigned int encoded_nelts
= const_vector_encoded_nelts (value
);
18555 for (unsigned int i
= 0; i
< encoded_nelts
; ++i
)
18557 rtx elt
= CONST_VECTOR_ENCODED_ELT (value
, i
);
18558 if (!CONST_SCALAR_INT_P (elt
))
18560 rtx_mode_t
val (elt
, int_mode
);
18561 wide_int pow2
= wi::neg (val
);
18564 /* It matters whether we negate or not. Make that choice,
18565 and make sure that it's consistent with previous elements. */
18566 if (negate
== !wi::neg_p (val
))
18568 negate
= wi::neg_p (val
);
18572 /* POW2 is now the value that we want to be a power of 2. */
18573 int shift
= wi::exact_log2 (pow2
);
18576 builder
.quick_push (gen_int_mode (shift
, int_mode
));
18579 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
18581 else if (negate
== 1)
18582 code
= code
== PLUS
? MINUS
: PLUS
;
18583 return builder
.build ();
18586 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
18587 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
18588 operands array, in the same order as for fma_optab. Return true if
18589 the function emitted all the necessary instructions, false if the caller
18590 should generate the pattern normally with the new OPERANDS array. */
18593 aarch64_prepare_sve_int_fma (rtx
*operands
, rtx_code code
)
18595 machine_mode mode
= GET_MODE (operands
[0]);
18596 if (rtx shifts
= aarch64_convert_mult_to_shift (operands
[2], code
))
18598 rtx product
= expand_binop (mode
, vashl_optab
, operands
[1], shifts
,
18599 NULL_RTX
, true, OPTAB_DIRECT
);
18600 force_expand_binop (mode
, code
== PLUS
? add_optab
: sub_optab
,
18601 operands
[3], product
, operands
[0], true,
18605 operands
[2] = force_reg (mode
, operands
[2]);
18609 /* Likewise, but for a conditional pattern. */
18612 aarch64_prepare_sve_cond_int_fma (rtx
*operands
, rtx_code code
)
18614 machine_mode mode
= GET_MODE (operands
[0]);
18615 if (rtx shifts
= aarch64_convert_mult_to_shift (operands
[3], code
))
18617 rtx product
= expand_binop (mode
, vashl_optab
, operands
[2], shifts
,
18618 NULL_RTX
, true, OPTAB_DIRECT
);
18619 emit_insn (gen_cond (code
, mode
, operands
[0], operands
[1],
18620 operands
[4], product
, operands
[5]));
18623 operands
[3] = force_reg (mode
, operands
[3]);
/* Implement TARGET_SHIFT_TRUNCATION_MASK.  */
static unsigned HOST_WIDE_INT
aarch64_shift_truncation_mask (machine_mode mode)
{
  if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
    return 0;
  return GET_MODE_UNIT_BITSIZE (mode) - 1;
}
/* Select a format to encode pointers in exception handling data.  */
int
aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
{
  int type;
  switch (aarch64_cmodel)
    {
    case AARCH64_CMODEL_TINY:
    case AARCH64_CMODEL_TINY_PIC:
    case AARCH64_CMODEL_SMALL:
    case AARCH64_CMODEL_SMALL_PIC:
    case AARCH64_CMODEL_SMALL_SPIC:
      /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
	 for everything.  */
      type = DW_EH_PE_sdata4;
      break;
    default:
      /* No assumptions here.  8-byte relocs required.  */
      type = DW_EH_PE_sdata8;
      break;
    }
  return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
}
/* Output .variant_pcs for aarch64_vector_pcs function symbols.  */

static void
aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
{
  if (TREE_CODE (decl) == FUNCTION_DECL)
    {
      arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
      if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
	{
	  fprintf (stream, "\t.variant_pcs\t");
	  assemble_name (stream, name);
	  fprintf (stream, "\n");
	}
    }
}
18676 /* The last .arch and .tune assembly strings that we printed. */
18677 static std::string aarch64_last_printed_arch_string
;
18678 static std::string aarch64_last_printed_tune_string
;
18680 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
18681 by the function fndecl. */
18684 aarch64_declare_function_name (FILE *stream
, const char* name
,
18687 tree target_parts
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
18689 struct cl_target_option
*targ_options
;
18691 targ_options
= TREE_TARGET_OPTION (target_parts
);
18693 targ_options
= TREE_TARGET_OPTION (target_option_current_node
);
18694 gcc_assert (targ_options
);
18696 const struct processor
*this_arch
18697 = aarch64_get_arch (targ_options
->x_explicit_arch
);
18699 uint64_t isa_flags
= targ_options
->x_aarch64_isa_flags
;
18700 std::string extension
18701 = aarch64_get_extension_string_for_isa_flags (isa_flags
,
18703 /* Only update the assembler .arch string if it is distinct from the last
18704 such string we printed. */
18705 std::string to_print
= this_arch
->name
+ extension
;
18706 if (to_print
!= aarch64_last_printed_arch_string
)
18708 asm_fprintf (asm_out_file
, "\t.arch %s\n", to_print
.c_str ());
18709 aarch64_last_printed_arch_string
= to_print
;
18712 /* Print the cpu name we're tuning for in the comments, might be
18713 useful to readers of the generated asm. Do it only when it changes
18714 from function to function and verbose assembly is requested. */
18715 const struct processor
*this_tune
18716 = aarch64_get_tune_cpu (targ_options
->x_explicit_tune_core
);
18718 if (flag_debug_asm
&& aarch64_last_printed_tune_string
!= this_tune
->name
)
18720 asm_fprintf (asm_out_file
, "\t" ASM_COMMENT_START
".tune %s\n",
18722 aarch64_last_printed_tune_string
= this_tune
->name
;
18725 aarch64_asm_output_variant_pcs (stream
, fndecl
, name
);
18727 /* Don't forget the type directive for ELF. */
18728 ASM_OUTPUT_TYPE_DIRECTIVE (stream
, name
, "function");
18729 ASM_OUTPUT_LABEL (stream
, name
);
18731 cfun
->machine
->label_is_assembled
= true;
18734 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. Check if the patch area is after
18735 the function label and emit a BTI if necessary. */
18738 aarch64_print_patchable_function_entry (FILE *file
,
18739 unsigned HOST_WIDE_INT patch_area_size
,
18742 if (cfun
->machine
->label_is_assembled
18743 && aarch64_bti_enabled ()
18744 && !cgraph_node::get (cfun
->decl
)->only_called_directly_p ())
18746 /* Remove the BTI that follows the patch area and insert a new BTI
18747 before the patch area right after the function label. */
18748 rtx_insn
*insn
= next_real_nondebug_insn (get_insns ());
18751 && GET_CODE (PATTERN (insn
)) == UNSPEC_VOLATILE
18752 && XINT (PATTERN (insn
), 1) == UNSPECV_BTI_C
)
18753 delete_insn (insn
);
18754 asm_fprintf (file
, "\thint\t34 // bti c\n");
18757 default_print_patchable_function_entry (file
, patch_area_size
, record_p
);
/* Implement ASM_OUTPUT_DEF_FROM_DECLS.  Output .variant_pcs for aliases.  */

void
aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
{
  const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
  const char *value = IDENTIFIER_POINTER (target);
  aarch64_asm_output_variant_pcs (stream, decl, name);
  ASM_OUTPUT_DEF (stream, name, value);
}
/* Implement ASM_OUTPUT_EXTERNAL.  Output .variant_pcs for undefined
   function symbol references.  */

void
aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
{
  default_elf_asm_output_external (stream, decl, name);
  aarch64_asm_output_variant_pcs (stream, decl, name);
}
/* Triggered after a .cfi_startproc directive is emitted into the assembly file.
   Used to output the .cfi_b_key_frame directive when signing the current
   function with the B key.  */

void
aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
{
  if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
      && aarch64_ra_sign_key == AARCH64_KEY_B)
    asm_fprintf (f, "\t.cfi_b_key_frame\n");
}
18793 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
18796 aarch64_start_file (void)
18798 struct cl_target_option
*default_options
18799 = TREE_TARGET_OPTION (target_option_default_node
);
18801 const struct processor
*default_arch
18802 = aarch64_get_arch (default_options
->x_explicit_arch
);
18803 uint64_t default_isa_flags
= default_options
->x_aarch64_isa_flags
;
18804 std::string extension
18805 = aarch64_get_extension_string_for_isa_flags (default_isa_flags
,
18806 default_arch
->flags
);
18808 aarch64_last_printed_arch_string
= default_arch
->name
+ extension
;
18809 aarch64_last_printed_tune_string
= "";
18810 asm_fprintf (asm_out_file
, "\t.arch %s\n",
18811 aarch64_last_printed_arch_string
.c_str ());
18813 default_file_start ();
/* Emit load exclusive.  */

static void
aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
			     rtx mem, rtx model_rtx)
{
  if (mode == TImode)
    emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
						gen_highpart (DImode, rval),
						mem, model_rtx));
  else
    emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
}
/* Emit store exclusive.  */

static void
aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
			      rtx mem, rtx rval, rtx model_rtx)
{
  if (mode == TImode)
    emit_insn (gen_aarch64_store_exclusive_pair
	       (bval, mem, operand_subword (rval, 0, 0, TImode),
		operand_subword (rval, 1, 0, TImode), model_rtx));
  else
    emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
}
/* Mark the previous jump instruction as unlikely.  */

static void
aarch64_emit_unlikely_jump (rtx insn)
{
  rtx_insn *jump = emit_jump_insn (insn);
  add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
}
/* We store the names of the various atomic helpers in a 5x4 array.
   Return the libcall function given MODE, MODEL and NAMES.  */

rtx
aarch64_atomic_ool_func (machine_mode mode, rtx model_rtx,
			 const atomic_ool_names *names)
{
  memmodel model = memmodel_base (INTVAL (model_rtx));
  int mode_idx, model_idx;

  switch (mode)
    {
    case E_QImode:
      mode_idx = 0;
      break;
    case E_HImode:
      mode_idx = 1;
      break;
    case E_SImode:
      mode_idx = 2;
      break;
    case E_DImode:
      mode_idx = 3;
      break;
    case E_TImode:
      mode_idx = 4;
      break;
    default:
      gcc_unreachable ();
    }

  switch (model)
    {
    case MEMMODEL_RELAXED:
      model_idx = 0;
      break;
    case MEMMODEL_CONSUME:
    case MEMMODEL_ACQUIRE:
      model_idx = 1;
      break;
    case MEMMODEL_RELEASE:
      model_idx = 2;
      break;
    case MEMMODEL_ACQ_REL:
    case MEMMODEL_SEQ_CST:
      model_idx = 3;
      break;
    default:
      gcc_unreachable ();
    }

  return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
				      VISIBILITY_HIDDEN);
}
#define DEF0(B, N) \
  { "__aarch64_" #B #N "_relax", \
    "__aarch64_" #B #N "_acq", \
    "__aarch64_" #B #N "_rel", \
    "__aarch64_" #B #N "_acq_rel" }

#define DEF4(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
		 { NULL, NULL, NULL, NULL }
#define DEF5(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)

static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
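
/* As an illustration (added commentary, not part of the original source):
   DEF4(swp) expands to rows for the 1-, 2-, 4- and 8-byte helpers, e.g.
     { "__aarch64_swp1_relax", "__aarch64_swp1_acq",
       "__aarch64_swp1_rel", "__aarch64_swp1_acq_rel" },
   followed by a NULL row for the unsupported 16-byte case, while DEF5(cas)
   provides a real 16-byte row.  aarch64_atomic_ool_func indexes these rows
   by mode and the columns by memory model.  */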
18929 /* Expand a compare and swap pattern. */
18932 aarch64_expand_compare_and_swap (rtx operands
[])
18934 rtx bval
, rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
, x
, cc_reg
;
18935 machine_mode mode
, r_mode
;
18937 bval
= operands
[0];
18938 rval
= operands
[1];
18940 oldval
= operands
[3];
18941 newval
= operands
[4];
18942 is_weak
= operands
[5];
18943 mod_s
= operands
[6];
18944 mod_f
= operands
[7];
18945 mode
= GET_MODE (mem
);
18947 /* Normally the succ memory model must be stronger than fail, but in the
18948 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
18949 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
18950 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f
)))
18951 && is_mm_release (memmodel_from_int (INTVAL (mod_s
))))
18952 mod_s
= GEN_INT (MEMMODEL_ACQ_REL
);
18955 if (mode
== QImode
|| mode
== HImode
)
18958 rval
= gen_reg_rtx (r_mode
);
18963 /* The CAS insn requires oldval and rval overlap, but we need to
18964 have a copy of oldval saved across the operation to tell if
18965 the operation is successful. */
18966 if (reg_overlap_mentioned_p (rval
, oldval
))
18967 rval
= copy_to_mode_reg (r_mode
, oldval
);
18969 emit_move_insn (rval
, gen_lowpart (r_mode
, oldval
));
18971 emit_insn (gen_aarch64_compare_and_swap_lse (mode
, rval
, mem
,
18973 cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
18975 else if (TARGET_OUTLINE_ATOMICS
)
18977 /* Oldval must satisfy compare afterward. */
18978 if (!aarch64_plus_operand (oldval
, mode
))
18979 oldval
= force_reg (mode
, oldval
);
18980 rtx func
= aarch64_atomic_ool_func (mode
, mod_s
, &aarch64_ool_cas_names
);
18981 rval
= emit_library_call_value (func
, NULL_RTX
, LCT_NORMAL
, r_mode
,
18982 oldval
, mode
, newval
, mode
,
18983 XEXP (mem
, 0), Pmode
);
18984 cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
18988 /* The oldval predicate varies by mode. Test it and force to reg. */
18989 insn_code code
= code_for_aarch64_compare_and_swap (mode
);
18990 if (!insn_data
[code
].operand
[2].predicate (oldval
, mode
))
18991 oldval
= force_reg (mode
, oldval
);
18993 emit_insn (GEN_FCN (code
) (rval
, mem
, oldval
, newval
,
18994 is_weak
, mod_s
, mod_f
));
18995 cc_reg
= gen_rtx_REG (CCmode
, CC_REGNUM
);
18998 if (r_mode
!= mode
)
18999 rval
= gen_lowpart (mode
, rval
);
19000 emit_move_insn (operands
[1], rval
);
19002 x
= gen_rtx_EQ (SImode
, cc_reg
, const0_rtx
);
19003 emit_insn (gen_rtx_SET (bval
, x
));
/* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
   sequence implementing an atomic operation.  */

static void
aarch64_emit_post_barrier (enum memmodel model)
{
  const enum memmodel base_model = memmodel_base (model);

  if (is_mm_sync (model)
      && (base_model == MEMMODEL_ACQUIRE
	  || base_model == MEMMODEL_ACQ_REL
	  || base_model == MEMMODEL_SEQ_CST))
    emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
}
19023 /* Split a compare and swap pattern. */
19026 aarch64_split_compare_and_swap (rtx operands
[])
19028 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
19029 gcc_assert (epilogue_completed
);
19031 rtx rval
, mem
, oldval
, newval
, scratch
, x
, model_rtx
;
19034 rtx_code_label
*label1
, *label2
;
19035 enum memmodel model
;
19037 rval
= operands
[0];
19039 oldval
= operands
[2];
19040 newval
= operands
[3];
19041 is_weak
= (operands
[4] != const0_rtx
);
19042 model_rtx
= operands
[5];
19043 scratch
= operands
[7];
19044 mode
= GET_MODE (mem
);
19045 model
= memmodel_from_int (INTVAL (model_rtx
));
19047 /* When OLDVAL is zero and we want the strong version we can emit a tighter
19050 LD[A]XR rval, [mem]
19052 ST[L]XR scratch, newval, [mem]
19053 CBNZ scratch, .label1
19056 bool strong_zero_p
= (!is_weak
&& !aarch64_track_speculation
&&
19057 oldval
== const0_rtx
&& mode
!= TImode
);
19062 label1
= gen_label_rtx ();
19063 emit_label (label1
);
19065 label2
= gen_label_rtx ();
19067 /* The initial load can be relaxed for a __sync operation since a final
19068 barrier will be emitted to stop code hoisting. */
19069 if (is_mm_sync (model
))
19070 aarch64_emit_load_exclusive (mode
, rval
, mem
, GEN_INT (MEMMODEL_RELAXED
));
19072 aarch64_emit_load_exclusive (mode
, rval
, mem
, model_rtx
);
19075 x
= gen_rtx_NE (VOIDmode
, rval
, const0_rtx
);
19078 rtx cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
19079 x
= gen_rtx_NE (VOIDmode
, cc_reg
, const0_rtx
);
19081 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
19082 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
19083 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
19085 aarch64_emit_store_exclusive (mode
, scratch
, mem
, newval
, model_rtx
);
19089 if (aarch64_track_speculation
)
19091 /* Emit an explicit compare instruction, so that we can correctly
19092 track the condition codes. */
19093 rtx cc_reg
= aarch64_gen_compare_reg (NE
, scratch
, const0_rtx
);
19094 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
19097 x
= gen_rtx_NE (VOIDmode
, scratch
, const0_rtx
);
19099 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
19100 gen_rtx_LABEL_REF (Pmode
, label1
), pc_rtx
);
19101 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
19104 aarch64_gen_compare_reg (NE
, scratch
, const0_rtx
);
19106 emit_label (label2
);
19108 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
19109 to set the condition flags. If this is not used it will be removed by
19112 aarch64_gen_compare_reg (NE
, rval
, const0_rtx
);
19114 /* Emit any final barrier needed for a __sync operation. */
19115 if (is_mm_sync (model
))
19116 aarch64_emit_post_barrier (model
);
19119 /* Split an atomic operation. */
19122 aarch64_split_atomic_op (enum rtx_code code
, rtx old_out
, rtx new_out
, rtx mem
,
19123 rtx value
, rtx model_rtx
, rtx cond
)
19125 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
19126 gcc_assert (epilogue_completed
);
19128 machine_mode mode
= GET_MODE (mem
);
19129 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
19130 const enum memmodel model
= memmodel_from_int (INTVAL (model_rtx
));
19131 const bool is_sync
= is_mm_sync (model
);
19132 rtx_code_label
*label
;
19135 /* Split the atomic operation into a sequence. */
19136 label
= gen_label_rtx ();
19137 emit_label (label
);
19140 new_out
= gen_lowpart (wmode
, new_out
);
19142 old_out
= gen_lowpart (wmode
, old_out
);
19145 value
= simplify_gen_subreg (wmode
, value
, mode
, 0);
19147 /* The initial load can be relaxed for a __sync operation since a final
19148 barrier will be emitted to stop code hoisting. */
19150 aarch64_emit_load_exclusive (mode
, old_out
, mem
,
19151 GEN_INT (MEMMODEL_RELAXED
));
19153 aarch64_emit_load_exclusive (mode
, old_out
, mem
, model_rtx
);
19162 x
= gen_rtx_AND (wmode
, old_out
, value
);
19163 emit_insn (gen_rtx_SET (new_out
, x
));
19164 x
= gen_rtx_NOT (wmode
, new_out
);
19165 emit_insn (gen_rtx_SET (new_out
, x
));
19169 if (CONST_INT_P (value
))
19171 value
= GEN_INT (-INTVAL (value
));
19174 /* Fall through. */
19177 x
= gen_rtx_fmt_ee (code
, wmode
, old_out
, value
);
19178 emit_insn (gen_rtx_SET (new_out
, x
));
19182 aarch64_emit_store_exclusive (mode
, cond
, mem
,
19183 gen_lowpart (mode
, new_out
), model_rtx
);
19185 if (aarch64_track_speculation
)
19187 /* Emit an explicit compare instruction, so that we can correctly
19188 track the condition codes. */
19189 rtx cc_reg
= aarch64_gen_compare_reg (NE
, cond
, const0_rtx
);
19190 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
19193 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
19195 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
19196 gen_rtx_LABEL_REF (Pmode
, label
), pc_rtx
);
19197 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
19199 /* Emit any final barrier needed for a __sync operation. */
19201 aarch64_emit_post_barrier (model
);
static void
aarch64_init_libfuncs (void)
{
  /* Half-precision float operations.  The compiler handles all operations
     with NULL libfuncs by converting to SFmode.  */

  /* Conversions.  */
  set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
  set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");

  /* Arithmetic.  */
  set_optab_libfunc (add_optab, HFmode, NULL);
  set_optab_libfunc (sdiv_optab, HFmode, NULL);
  set_optab_libfunc (smul_optab, HFmode, NULL);
  set_optab_libfunc (neg_optab, HFmode, NULL);
  set_optab_libfunc (sub_optab, HFmode, NULL);

  /* Comparisons.  */
  set_optab_libfunc (eq_optab, HFmode, NULL);
  set_optab_libfunc (ne_optab, HFmode, NULL);
  set_optab_libfunc (lt_optab, HFmode, NULL);
  set_optab_libfunc (le_optab, HFmode, NULL);
  set_optab_libfunc (ge_optab, HFmode, NULL);
  set_optab_libfunc (gt_optab, HFmode, NULL);
  set_optab_libfunc (unord_optab, HFmode, NULL);
}
19231 /* Target hook for c_mode_for_suffix. */
19232 static machine_mode
19233 aarch64_c_mode_for_suffix (char suffix
)
/* We can only represent floating point constants which will fit in
   "quarter-precision" values.  These values are characterised by
   a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
   by:

   (-1)^s * (n/16) * 2^r

   Where:
   's' is the sign bit.
   'n' is an integer in the range 16 <= n <= 31.
   'r' is an integer in the range -3 <= r <= 4.  */
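
/* Illustrative examples (added commentary, not from the original source):
   1.0   = (16/16) * 2^0   (n = 16, r = 0)  -> representable
   0.125 = (16/16) * 2^-3  (n = 16, r = -3) -> smallest positive value
   31.0  = (31/16) * 2^4   (n = 31, r = 4)  -> largest value
   0.2 has no integer n in [16, 31] for any r in [-3, 4], so it cannot be
   encoded as a floating-point immediate.  */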
19253 /* Return true iff X can be represented by a quarter-precision
19254 floating point immediate operand X. Note, we cannot represent 0.0. */
19256 aarch64_float_const_representable_p (rtx x
)
19258 /* This represents our current view of how many bits
19259 make up the mantissa. */
19260 int point_pos
= 2 * HOST_BITS_PER_WIDE_INT
- 1;
19262 unsigned HOST_WIDE_INT mantissa
, mask
;
19263 REAL_VALUE_TYPE r
, m
;
19266 x
= unwrap_const_vec_duplicate (x
);
19267 if (!CONST_DOUBLE_P (x
))
19270 if (GET_MODE (x
) == VOIDmode
19271 || (GET_MODE (x
) == HFmode
&& !TARGET_FP_F16INST
))
19274 r
= *CONST_DOUBLE_REAL_VALUE (x
);
19276 /* We cannot represent infinities, NaNs or +/-zero. We won't
19277 know if we have +zero until we analyse the mantissa, but we
19278 can reject the other invalid values. */
19279 if (REAL_VALUE_ISINF (r
) || REAL_VALUE_ISNAN (r
)
19280 || REAL_VALUE_MINUS_ZERO (r
))
19283 /* Extract exponent. */
19284 r
= real_value_abs (&r
);
19285 exponent
= REAL_EXP (&r
);
19287 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
19288 highest (sign) bit, with a fixed binary point at bit point_pos.
19289 m1 holds the low part of the mantissa, m2 the high part.
19290 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
19291 bits for the mantissa, this can fail (low bits will be lost). */
19292 real_ldexp (&m
, &r
, point_pos
- exponent
);
19293 wide_int w
= real_to_integer (&m
, &fail
, HOST_BITS_PER_WIDE_INT
* 2);
19295 /* If the low part of the mantissa has bits set we cannot represent
19297 if (w
.ulow () != 0)
19299 /* We have rejected the lower HOST_WIDE_INT, so update our
19300 understanding of how many bits lie in the mantissa and
19301 look only at the high HOST_WIDE_INT. */
19302 mantissa
= w
.elt (1);
19303 point_pos
-= HOST_BITS_PER_WIDE_INT
;
19305 /* We can only represent values with a mantissa of the form 1.xxxx. */
19306 mask
= ((unsigned HOST_WIDE_INT
)1 << (point_pos
- 5)) - 1;
19307 if ((mantissa
& mask
) != 0)
19310 /* Having filtered unrepresentable values, we may now remove all
19311 but the highest 5 bits. */
19312 mantissa
>>= point_pos
- 5;
19314 /* We cannot represent the value 0.0, so reject it. This is handled
19319 /* Then, as bit 4 is always set, we can mask it off, leaving
19320 the mantissa in the range [0, 15]. */
19321 mantissa
&= ~(1 << 4);
19322 gcc_assert (mantissa
<= 15);
19324 /* GCC internally does not use IEEE754-like encoding (where normalized
19325 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
19326 Our mantissa values are shifted 4 places to the left relative to
19327 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
19328 by 5 places to correct for GCC's representation. */
19329 exponent
= 5 - exponent
;
19331 return (exponent
>= 0 && exponent
<= 7);
19334 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
19335 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
19336 output MOVI/MVNI, ORR or BIC immediate. */
19338 aarch64_output_simd_mov_immediate (rtx const_vector
, unsigned width
,
19339 enum simd_immediate_check which
)
19342 static char templ
[40];
19343 const char *mnemonic
;
19344 const char *shift_op
;
19345 unsigned int lane_count
= 0;
19348 struct simd_immediate_info info
;
19350 /* This will return true to show const_vector is legal for use as either
19351 a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
19352 It will also update INFO to show how the immediate should be generated.
19353 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
19354 is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
, which
);
19355 gcc_assert (is_valid
);
19357 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
19358 lane_count
= width
/ GET_MODE_BITSIZE (info
.elt_mode
);
19360 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
19362 gcc_assert (info
.insn
== simd_immediate_info::MOV
19363 && info
.u
.mov
.shift
== 0);
19364 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
19365 move immediate path. */
19366 if (aarch64_float_const_zero_rtx_p (info
.u
.mov
.value
))
19367 info
.u
.mov
.value
= GEN_INT (0);
19370 const unsigned int buf_size
= 20;
19371 char float_buf
[buf_size
] = {'\0'};
19372 real_to_decimal_for_mode (float_buf
,
19373 CONST_DOUBLE_REAL_VALUE (info
.u
.mov
.value
),
19374 buf_size
, buf_size
, 1, info
.elt_mode
);
19376 if (lane_count
== 1)
19377 snprintf (templ
, sizeof (templ
), "fmov\t%%d0, %s", float_buf
);
19379 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%d%c, %s",
19380 lane_count
, element_char
, float_buf
);
19385 gcc_assert (CONST_INT_P (info
.u
.mov
.value
));
19387 if (which
== AARCH64_CHECK_MOV
)
19389 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "mvni" : "movi";
19390 shift_op
= (info
.u
.mov
.modifier
== simd_immediate_info::MSL
19392 if (lane_count
== 1)
19393 snprintf (templ
, sizeof (templ
), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX
,
19394 mnemonic
, UINTVAL (info
.u
.mov
.value
));
19395 else if (info
.u
.mov
.shift
)
19396 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
19397 HOST_WIDE_INT_PRINT_HEX
", %s %d", mnemonic
, lane_count
,
19398 element_char
, UINTVAL (info
.u
.mov
.value
), shift_op
,
19401 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
19402 HOST_WIDE_INT_PRINT_HEX
, mnemonic
, lane_count
,
19403 element_char
, UINTVAL (info
.u
.mov
.value
));
19407 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
19408 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "bic" : "orr";
19409 if (info
.u
.mov
.shift
)
19410 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
19411 HOST_WIDE_INT_PRINT_DEC
", %s #%d", mnemonic
, lane_count
,
19412 element_char
, UINTVAL (info
.u
.mov
.value
), "lsl",
19415 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
19416 HOST_WIDE_INT_PRINT_DEC
, mnemonic
, lane_count
,
19417 element_char
, UINTVAL (info
.u
.mov
.value
));
19423 aarch64_output_scalar_simd_mov_immediate (rtx immediate
, scalar_int_mode mode
)
19426 /* If a floating point number was passed and we desire to use it in an
19427 integer mode do the conversion to integer. */
19428 if (CONST_DOUBLE_P (immediate
) && GET_MODE_CLASS (mode
) == MODE_INT
)
19430 unsigned HOST_WIDE_INT ival
;
19431 if (!aarch64_reinterpret_float_as_int (immediate
, &ival
))
19432 gcc_unreachable ();
19433 immediate
= gen_int_mode (ival
, mode
);
19436 machine_mode vmode
;
19437 /* use a 64 bit mode for everything except for DI/DF mode, where we use
19438 a 128 bit vector mode. */
19439 int width
= GET_MODE_BITSIZE (mode
) == 64 ? 128 : 64;
19441 vmode
= aarch64_simd_container_mode (mode
, width
);
19442 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (immediate
));
19443 return aarch64_output_simd_mov_immediate (v_op
, width
);
19446 /* Return the output string to use for moving immediate CONST_VECTOR
19447 into an SVE register. */
19450 aarch64_output_sve_mov_immediate (rtx const_vector
)
19452 static char templ
[40];
19453 struct simd_immediate_info info
;
19456 bool is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
);
19457 gcc_assert (is_valid
);
19459 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
19461 machine_mode vec_mode
= GET_MODE (const_vector
);
19462 if (aarch64_sve_pred_mode_p (vec_mode
))
19464 static char buf
[sizeof ("ptrue\t%0.N, vlNNNNN")];
19465 if (info
.insn
== simd_immediate_info::MOV
)
19467 gcc_assert (info
.u
.mov
.value
== const0_rtx
);
19468 snprintf (buf
, sizeof (buf
), "pfalse\t%%0.b");
19472 gcc_assert (info
.insn
== simd_immediate_info::PTRUE
);
19473 unsigned int total_bytes
;
19474 if (info
.u
.pattern
== AARCH64_SV_ALL
19475 && BYTES_PER_SVE_VECTOR
.is_constant (&total_bytes
))
19476 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, vl%d", element_char
,
19477 total_bytes
/ GET_MODE_SIZE (info
.elt_mode
));
19479 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, %s", element_char
,
19480 svpattern_token (info
.u
.pattern
));
19485 if (info
.insn
== simd_immediate_info::INDEX
)
19487 snprintf (templ
, sizeof (templ
), "index\t%%0.%c, #"
19488 HOST_WIDE_INT_PRINT_DEC
", #" HOST_WIDE_INT_PRINT_DEC
,
19489 element_char
, INTVAL (info
.u
.index
.base
),
19490 INTVAL (info
.u
.index
.step
));
19494 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
19496 if (aarch64_float_const_zero_rtx_p (info
.u
.mov
.value
))
19497 info
.u
.mov
.value
= GEN_INT (0);
19500 const int buf_size
= 20;
19501 char float_buf
[buf_size
] = {};
19502 real_to_decimal_for_mode (float_buf
,
19503 CONST_DOUBLE_REAL_VALUE (info
.u
.mov
.value
),
19504 buf_size
, buf_size
, 1, info
.elt_mode
);
19506 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%c, #%s",
19507 element_char
, float_buf
);
19512 snprintf (templ
, sizeof (templ
), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC
,
19513 element_char
, INTVAL (info
.u
.mov
.value
));
19517 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
19518 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
19522 aarch64_output_sve_ptrues (rtx const_unspec
)
19524 static char templ
[40];
19526 struct simd_immediate_info info
;
19527 bool is_valid
= aarch64_simd_valid_immediate (const_unspec
, &info
);
19528 gcc_assert (is_valid
&& info
.insn
== simd_immediate_info::PTRUE
);
19530 char element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
19531 snprintf (templ
, sizeof (templ
), "ptrues\t%%0.%c, %s", element_char
,
19532 svpattern_token (info
.u
.pattern
));
19536 /* Split operands into moves from op[1] + op[2] into op[0]. */
19539 aarch64_split_combinev16qi (rtx operands
[3])
19541 unsigned int dest
= REGNO (operands
[0]);
19542 unsigned int src1
= REGNO (operands
[1]);
19543 unsigned int src2
= REGNO (operands
[2]);
19544 machine_mode halfmode
= GET_MODE (operands
[1]);
19545 unsigned int halfregs
= REG_NREGS (operands
[1]);
19546 rtx destlo
, desthi
;
19548 gcc_assert (halfmode
== V16QImode
);
19550 if (src1
== dest
&& src2
== dest
+ halfregs
)
19552 /* No-op move. Can't split to nothing; emit something. */
19553 emit_note (NOTE_INSN_DELETED
);
19557 /* Preserve register attributes for variable tracking. */
19558 destlo
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
, 0);
19559 desthi
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
+ halfregs
,
19560 GET_MODE_SIZE (halfmode
));
19562 /* Special case of reversed high/low parts. */
19563 if (reg_overlap_mentioned_p (operands
[2], destlo
)
19564 && reg_overlap_mentioned_p (operands
[1], desthi
))
19566 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
19567 emit_insn (gen_xorv16qi3 (operands
[2], operands
[1], operands
[2]));
19568 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
19570 else if (!reg_overlap_mentioned_p (operands
[2], destlo
))
19572 /* Try to avoid unnecessary moves if part of the result
19573 is in the right place already. */
19575 emit_move_insn (destlo
, operands
[1]);
19576 if (src2
!= dest
+ halfregs
)
19577 emit_move_insn (desthi
, operands
[2]);
19581 if (src2
!= dest
+ halfregs
)
19582 emit_move_insn (desthi
, operands
[2]);
19584 emit_move_insn (destlo
, operands
[1]);
19588 /* vec_perm support. */
19590 struct expand_vec_perm_d
19592 rtx target
, op0
, op1
;
19593 vec_perm_indices perm
;
19594 machine_mode vmode
;
19595 unsigned int vec_flags
;
19600 /* Generate a variable permutation. */
19603 aarch64_expand_vec_perm_1 (rtx target
, rtx op0
, rtx op1
, rtx sel
)
19605 machine_mode vmode
= GET_MODE (target
);
19606 bool one_vector_p
= rtx_equal_p (op0
, op1
);
19608 gcc_checking_assert (vmode
== V8QImode
|| vmode
== V16QImode
);
19609 gcc_checking_assert (GET_MODE (op0
) == vmode
);
19610 gcc_checking_assert (GET_MODE (op1
) == vmode
);
19611 gcc_checking_assert (GET_MODE (sel
) == vmode
);
19612 gcc_checking_assert (TARGET_SIMD
);
19616 if (vmode
== V8QImode
)
19618 /* Expand the argument to a V16QI mode by duplicating it. */
19619 rtx pair
= gen_reg_rtx (V16QImode
);
19620 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op0
));
19621 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
19625 emit_insn (gen_aarch64_tbl1v16qi (target
, op0
, sel
));
19632 if (vmode
== V8QImode
)
19634 pair
= gen_reg_rtx (V16QImode
);
19635 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op1
));
19636 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
19640 pair
= gen_reg_rtx (OImode
);
19641 emit_insn (gen_aarch64_combinev16qi (pair
, op0
, op1
));
19642 emit_insn (gen_aarch64_tbl2v16qi (target
, pair
, sel
));
19647 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
19648 NELT is the number of elements in the vector. */
19651 aarch64_expand_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
,
19654 machine_mode vmode
= GET_MODE (target
);
19655 bool one_vector_p
= rtx_equal_p (op0
, op1
);
19658 /* The TBL instruction does not use a modulo index, so we must take care
19659 of that ourselves. */
19660 mask
= aarch64_simd_gen_const_vector_dup (vmode
,
19661 one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
19662 sel
= expand_simple_binop (vmode
, AND
, sel
, mask
, NULL
, 0, OPTAB_LIB_WIDEN
);
19664 /* For big-endian, we also need to reverse the index within the vector
19665 (but not which vector). */
19666 if (BYTES_BIG_ENDIAN
)
19668 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
19670 mask
= aarch64_simd_gen_const_vector_dup (vmode
, nelt
- 1);
19671 sel
= expand_simple_binop (vmode
, XOR
, sel
, mask
,
19672 NULL
, 0, OPTAB_LIB_WIDEN
);
19674 aarch64_expand_vec_perm_1 (target
, op0
, op1
, sel
);
/* Generate (set TARGET (unspec [OP0 OP1] CODE)).  */

static void
emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
{
  emit_insn (gen_rtx_SET (target,
			  gen_rtx_UNSPEC (GET_MODE (target),
					  gen_rtvec (2, op0, op1), code)));
}
19687 /* Expand an SVE vec_perm with the given operands. */
19690 aarch64_expand_sve_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
)
19692 machine_mode data_mode
= GET_MODE (target
);
19693 machine_mode sel_mode
= GET_MODE (sel
);
19694 /* Enforced by the pattern condition. */
19695 int nunits
= GET_MODE_NUNITS (sel_mode
).to_constant ();
19697 /* Note: vec_perm indices are supposed to wrap when they go beyond the
19698 size of the two value vectors, i.e. the upper bits of the indices
19699 are effectively ignored. SVE TBL instead produces 0 for any
19700 out-of-range indices, so we need to modulo all the vec_perm indices
19701 to ensure they are all in range. */
19702 rtx sel_reg
= force_reg (sel_mode
, sel
);
19704 /* Check if the sel only references the first values vector. */
19705 if (GET_CODE (sel
) == CONST_VECTOR
19706 && aarch64_const_vec_all_in_range_p (sel
, 0, nunits
- 1))
19708 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_reg
);
19712 /* Check if the two values vectors are the same. */
19713 if (rtx_equal_p (op0
, op1
))
19715 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
, nunits
- 1);
19716 rtx sel_mod
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
19717 NULL
, 0, OPTAB_DIRECT
);
19718 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_mod
);
19722 /* Run TBL on for each value vector and combine the results. */
19724 rtx res0
= gen_reg_rtx (data_mode
);
19725 rtx res1
= gen_reg_rtx (data_mode
);
19726 rtx neg_num_elems
= aarch64_simd_gen_const_vector_dup (sel_mode
, -nunits
);
19727 if (GET_CODE (sel
) != CONST_VECTOR
19728 || !aarch64_const_vec_all_in_range_p (sel
, 0, 2 * nunits
- 1))
19730 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
,
19732 sel_reg
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
19733 NULL
, 0, OPTAB_DIRECT
);
19735 emit_unspec2 (res0
, UNSPEC_TBL
, op0
, sel_reg
);
19736 rtx sel_sub
= expand_simple_binop (sel_mode
, PLUS
, sel_reg
, neg_num_elems
,
19737 NULL
, 0, OPTAB_DIRECT
);
19738 emit_unspec2 (res1
, UNSPEC_TBL
, op1
, sel_sub
);
19739 if (GET_MODE_CLASS (data_mode
) == MODE_VECTOR_INT
)
19740 emit_insn (gen_rtx_SET (target
, gen_rtx_IOR (data_mode
, res0
, res1
)));
19742 emit_unspec2 (target
, UNSPEC_IORF
, res0
, res1
);
19745 /* Recognize patterns suitable for the TRN instructions. */
19747 aarch64_evpc_trn (struct expand_vec_perm_d
*d
)
19750 poly_uint64 nelt
= d
->perm
.length ();
19751 rtx out
, in0
, in1
, x
;
19752 machine_mode vmode
= d
->vmode
;
19754 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
19757 /* Note that these are little-endian tests.
19758 We correct for big-endian later. */
19759 if (!d
->perm
[0].is_constant (&odd
)
19760 || (odd
!= 0 && odd
!= 1)
19761 || !d
->perm
.series_p (0, 2, odd
, 2)
19762 || !d
->perm
.series_p (1, 2, nelt
+ odd
, 2))
19771 /* We don't need a big-endian lane correction for SVE; see the comment
19772 at the head of aarch64-sve.md for details. */
19773 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
19775 x
= in0
, in0
= in1
, in1
= x
;
19780 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
19781 odd
? UNSPEC_TRN2
: UNSPEC_TRN1
));
19785 /* Recognize patterns suitable for the UZP instructions. */
19787 aarch64_evpc_uzp (struct expand_vec_perm_d
*d
)
19790 rtx out
, in0
, in1
, x
;
19791 machine_mode vmode
= d
->vmode
;
19793 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
19796 /* Note that these are little-endian tests.
19797 We correct for big-endian later. */
19798 if (!d
->perm
[0].is_constant (&odd
)
19799 || (odd
!= 0 && odd
!= 1)
19800 || !d
->perm
.series_p (0, 1, odd
, 2))
19809 /* We don't need a big-endian lane correction for SVE; see the comment
19810 at the head of aarch64-sve.md for details. */
19811 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
19813 x
= in0
, in0
= in1
, in1
= x
;
19818 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
19819 odd
? UNSPEC_UZP2
: UNSPEC_UZP1
));
19823 /* Recognize patterns suitable for the ZIP instructions. */
19825 aarch64_evpc_zip (struct expand_vec_perm_d
*d
)
19828 poly_uint64 nelt
= d
->perm
.length ();
19829 rtx out
, in0
, in1
, x
;
19830 machine_mode vmode
= d
->vmode
;
19832 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
19835 /* Note that these are little-endian tests.
19836 We correct for big-endian later. */
19837 poly_uint64 first
= d
->perm
[0];
19838 if ((maybe_ne (first
, 0U) && maybe_ne (first
* 2, nelt
))
19839 || !d
->perm
.series_p (0, 2, first
, 1)
19840 || !d
->perm
.series_p (1, 2, first
+ nelt
, 1))
19842 high
= maybe_ne (first
, 0U);
19850 /* We don't need a big-endian lane correction for SVE; see the comment
19851 at the head of aarch64-sve.md for details. */
19852 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
19854 x
= in0
, in0
= in1
, in1
= x
;
19859 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
19860 high
? UNSPEC_ZIP2
: UNSPEC_ZIP1
));
19864 /* Recognize patterns for the EXT insn. */
19867 aarch64_evpc_ext (struct expand_vec_perm_d
*d
)
19869 HOST_WIDE_INT location
;
19872 /* The first element always refers to the first vector.
19873 Check if the extracted indices are increasing by one. */
19874 if (d
->vec_flags
== VEC_SVE_PRED
19875 || !d
->perm
[0].is_constant (&location
)
19876 || !d
->perm
.series_p (0, 1, location
, 1))
19883 /* The case where (location == 0) is a no-op for both big- and little-endian,
19884 and is removed by the mid-end at optimization levels -O1 and higher.
19886 We don't need a big-endian lane correction for SVE; see the comment
19887 at the head of aarch64-sve.md for details. */
19888 if (BYTES_BIG_ENDIAN
&& location
!= 0 && d
->vec_flags
== VEC_ADVSIMD
)
19890 /* After setup, we want the high elements of the first vector (stored
19891 at the LSB end of the register), and the low elements of the second
19892 vector (stored at the MSB end of the register). So swap. */
19893 std::swap (d
->op0
, d
->op1
);
19894 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
19895 to_constant () is safe since this is restricted to Advanced SIMD
19897 location
= d
->perm
.length ().to_constant () - location
;
19900 offset
= GEN_INT (location
);
19901 emit_set_insn (d
->target
,
19902 gen_rtx_UNSPEC (d
->vmode
,
19903 gen_rtvec (3, d
->op0
, d
->op1
, offset
),
19908 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
19909 within each 64-bit, 32-bit or 16-bit granule. */
19912 aarch64_evpc_rev_local (struct expand_vec_perm_d
*d
)
19914 HOST_WIDE_INT diff
;
19915 unsigned int i
, size
, unspec
;
19916 machine_mode pred_mode
;
19918 if (d
->vec_flags
== VEC_SVE_PRED
19919 || !d
->one_vector_p
19920 || !d
->perm
[0].is_constant (&diff
))
19923 size
= (diff
+ 1) * GET_MODE_UNIT_SIZE (d
->vmode
);
19926 unspec
= UNSPEC_REV64
;
19927 pred_mode
= VNx2BImode
;
19929 else if (size
== 4)
19931 unspec
= UNSPEC_REV32
;
19932 pred_mode
= VNx4BImode
;
19934 else if (size
== 2)
19936 unspec
= UNSPEC_REV16
;
19937 pred_mode
= VNx8BImode
;
19942 unsigned int step
= diff
+ 1;
19943 for (i
= 0; i
< step
; ++i
)
19944 if (!d
->perm
.series_p (i
, step
, diff
- i
, step
))
19951 if (d
->vec_flags
== VEC_SVE_DATA
)
19953 machine_mode int_mode
= aarch64_sve_int_mode (pred_mode
);
19954 rtx target
= gen_reg_rtx (int_mode
);
19955 if (BYTES_BIG_ENDIAN
)
19956 /* The act of taking a subreg between INT_MODE and d->vmode
19957 is itself a reversing operation on big-endian targets;
19958 see the comment at the head of aarch64-sve.md for details.
19959 First reinterpret OP0 as INT_MODE without using a subreg
19960 and without changing the contents. */
19961 emit_insn (gen_aarch64_sve_reinterpret (int_mode
, target
, d
->op0
));
19964 /* For SVE we use REV[BHW] unspecs derived from the element size
19965 of v->mode and vector modes whose elements have SIZE bytes.
19966 This ensures that the vector modes match the predicate modes. */
19967 int unspec
= aarch64_sve_rev_unspec (d
->vmode
);
19968 rtx pred
= aarch64_ptrue_reg (pred_mode
);
19969 emit_insn (gen_aarch64_pred (unspec
, int_mode
, target
, pred
,
19970 gen_lowpart (int_mode
, d
->op0
)));
19972 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
19975 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), unspec
);
19976 emit_set_insn (d
->target
, src
);
19980 /* Recognize patterns for the REV insn, which reverses elements within
19984 aarch64_evpc_rev_global (struct expand_vec_perm_d
*d
)
19986 poly_uint64 nelt
= d
->perm
.length ();
19988 if (!d
->one_vector_p
|| d
->vec_flags
== VEC_ADVSIMD
)
19991 if (!d
->perm
.series_p (0, 1, nelt
- 1, -1))
19998 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), UNSPEC_REV
);
19999 emit_set_insn (d
->target
, src
);
static bool
aarch64_evpc_dup (struct expand_vec_perm_d *d)
{
  rtx out = d->target;
  rtx in0;
  HOST_WIDE_INT elt;
  machine_mode vmode = d->vmode;
  rtx lane;

  if (d->vec_flags == VEC_SVE_PRED
      || d->perm.encoding ().encoded_nelts () != 1
      || !d->perm[0].is_constant (&elt))
    return false;

  if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
    return false;

  if (d->testing_p)
    return true;

  /* The generic preparation in aarch64_expand_vec_perm_const_1
     swaps the operand order and the permute indices if it finds
     d->perm[0] to be in the second operand.  Thus, we can always
     use d->op0 and need not do any extra arithmetic to get the
     correct lane number.  */
  in0 = d->op0;
  lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */

  rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
  rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
  emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
  return true;
}

static bool
aarch64_evpc_tbl (struct expand_vec_perm_d *d)
{
  rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
  machine_mode vmode = d->vmode;

  /* Make sure that the indices are constant.  */
  unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
  for (unsigned int i = 0; i < encoded_nelts; ++i)
    if (!d->perm[i].is_constant ())
      return false;

  if (d->testing_p)
    return true;

  /* Generic code will try constant permutation twice.  Once with the
     original mode and again with the elements lowered to QImode.
     So wait and don't do the selector expansion ourselves.  */
  if (vmode != V8QImode && vmode != V16QImode)
    return false;

  /* to_constant is safe since this routine is specific to Advanced SIMD
     vectors.  */
  unsigned int nelt = d->perm.length ().to_constant ();
  for (unsigned int i = 0; i < nelt; ++i)
    /* If big-endian and two vectors we end up with a weird mixed-endian
       mode on NEON.  Reverse the index within each word but not the word
       itself.  to_constant is safe because we checked is_constant above.  */
    rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
			? d->perm[i].to_constant () ^ (nelt - 1)
			: d->perm[i].to_constant ());

  sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
  sel = force_reg (vmode, sel);

  aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
  return true;
}
/* Try to implement D using an SVE TBL instruction.  */

static bool
aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
{
  unsigned HOST_WIDE_INT nelt;

  /* Permuting two variable-length vectors could overflow the
     index range.  */
  if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
    return false;

  if (d->testing_p)
    return true;

  machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
  rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
  if (d->one_vector_p)
    emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
  else
    aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
  return true;
}

/* Try to implement D using SVE SEL instruction.  */

static bool
aarch64_evpc_sel (struct expand_vec_perm_d *d)
{
  machine_mode vmode = d->vmode;
  int unit_size = GET_MODE_UNIT_SIZE (vmode);

  if (d->vec_flags != VEC_SVE_DATA
      || unit_size > 8)
    return false;

  int n_patterns = d->perm.encoding ().npatterns ();
  poly_int64 vec_len = d->perm.length ();

  for (int i = 0; i < n_patterns; ++i)
    if (!known_eq (d->perm[i], i)
	&& !known_eq (d->perm[i], vec_len + i))
      return false;

  for (int i = n_patterns; i < n_patterns * 2; i++)
    if (!d->perm.series_p (i, n_patterns, i, n_patterns)
	&& !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
      return false;

  if (d->testing_p)
    return true;

  machine_mode pred_mode = aarch64_sve_pred_mode (vmode);

  /* Build a predicate that is true when op0 elements should be used.  */
  rtx_vector_builder builder (pred_mode, n_patterns, 2);
  for (int i = 0; i < n_patterns * 2; i++)
    {
      rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
					  : CONST0_RTX (BImode);
      builder.quick_push (elem);
    }

  rtx const_vec = builder.build ();
  rtx pred = force_reg (pred_mode, const_vec);
  /* TARGET = PRED ? OP0 : OP1.  */
  emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
  return true;
}
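
/* As an illustration: with two patterns, the encoded permutation
   {0, len + 1, 2, len + 3, ...} takes the even elements from op0 and the
   odd elements from op1, so the code above builds the predicate
   {1, 0, 1, 0, ...} and emits a single predicated SEL.  */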
static bool
aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
  /* The pattern matching functions above are written to look for a small
     number to begin the sequence (0, 1, N/2).  If we begin with an index
     from the second operand, we can swap the operands.  */
  poly_int64 nelt = d->perm.length ();
  if (known_ge (d->perm[0], nelt))
    {
      d->perm.rotate_inputs (1);
      std::swap (d->op0, d->op1);
    }

  if ((d->vec_flags == VEC_ADVSIMD
       || d->vec_flags == VEC_SVE_DATA
       || d->vec_flags == VEC_SVE_PRED)
      && known_gt (nelt, 1))
    {
      if (aarch64_evpc_rev_local (d))
	return true;
      else if (aarch64_evpc_rev_global (d))
	return true;
      else if (aarch64_evpc_ext (d))
	return true;
      else if (aarch64_evpc_dup (d))
	return true;
      else if (aarch64_evpc_zip (d))
	return true;
      else if (aarch64_evpc_uzp (d))
	return true;
      else if (aarch64_evpc_trn (d))
	return true;
      else if (aarch64_evpc_sel (d))
	return true;
      if (d->vec_flags == VEC_SVE_DATA)
	return aarch64_evpc_sve_tbl (d);
      else if (d->vec_flags == VEC_ADVSIMD)
	return aarch64_evpc_tbl (d);
    }
  return false;
}

/* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */

static bool
aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
				  rtx op1, const vec_perm_indices &sel)
{
  struct expand_vec_perm_d d;

  /* Check whether the mask can be applied to a single vector.  */
  if (sel.ninputs () == 1
      || (op0 && rtx_equal_p (op0, op1)))
    d.one_vector_p = true;
  else if (sel.all_from_input_p (0))
    {
      d.one_vector_p = true;
      op1 = op0;
    }
  else if (sel.all_from_input_p (1))
    {
      d.one_vector_p = true;
      op0 = op1;
    }
  else
    d.one_vector_p = false;

  d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
		     sel.nelts_per_input ());
  d.vmode = vmode;
  d.vec_flags = aarch64_classify_vector_mode (d.vmode);
  d.target = target;
  d.op0 = op0;
  d.op1 = op1;
  d.testing_p = !target;

  if (!d.testing_p)
    return aarch64_expand_vec_perm_const_1 (&d);

  rtx_insn *last = get_last_insn ();
  bool ret = aarch64_expand_vec_perm_const_1 (&d);
  gcc_assert (last == get_last_insn ());

  return ret;
}
/* Generate a byte permute mask for a register of mode MODE,
   which has NUNITS units.  */

rtx
aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
{
  /* We have to reverse each vector because we don't have
     a permuted load that can reverse-load according to ABI rules.  */
  rtx mask;
  rtvec v = rtvec_alloc (16);
  unsigned int i, j;
  unsigned int usize = GET_MODE_UNIT_SIZE (mode);

  gcc_assert (BYTES_BIG_ENDIAN);
  gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));

  for (i = 0; i < nunits; i++)
    for (j = 0; j < usize; j++)
      RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
  mask = gen_rtx_CONST_VECTOR (V16QImode, v);
  return force_reg (V16QImode, mask);
}
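
/* As an illustration: for a V8HImode register (usize == 2, nunits == 8)
   the loop above produces the byte selector {1, 0, 3, 2, 5, 4, ...},
   i.e. the bytes within each element are swapped while the element
   order itself is preserved.  */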
/* Expand an SVE integer comparison using the SVE equivalent of:

     (set TARGET (CODE OP0 OP1)).  */

void
aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
{
  machine_mode pred_mode = GET_MODE (target);
  machine_mode data_mode = GET_MODE (op0);
  rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
				      op0, op1);
  if (!rtx_equal_p (target, res))
    emit_move_insn (target, res);
}

/* Return the UNSPEC_COND_* code for comparison CODE.  */

static unsigned int
aarch64_unspec_cond_code (rtx_code code)
{
  switch (code)
    {
    case NE:
      return UNSPEC_COND_FCMNE;
    case EQ:
      return UNSPEC_COND_FCMEQ;
    case LT:
      return UNSPEC_COND_FCMLT;
    case GT:
      return UNSPEC_COND_FCMGT;
    case LE:
      return UNSPEC_COND_FCMLE;
    case GE:
      return UNSPEC_COND_FCMGE;
    case UNORDERED:
      return UNSPEC_COND_FCMUO;
    default:
      gcc_unreachable ();
    }
}

/* Emit the SVE equivalent of:

     (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))

   where <X> is the operation associated with comparison CODE.
   KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */

static void
aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
			  bool known_ptrue_p, rtx op0, rtx op1)
{
  rtx flag = gen_int_mode (known_ptrue_p, SImode);
  rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
			       gen_rtvec (4, pred, flag, op0, op1),
			       aarch64_unspec_cond_code (code));
  emit_set_insn (target, unspec);
}

/* Emit the SVE equivalent of:

     (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
     (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
     (set TARGET (ior:PRED_MODE TMP1 TMP2))

   where <Xi> is the operation associated with comparison CODEi.
   KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */

static void
aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
			      rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
{
  machine_mode pred_mode = GET_MODE (pred);
  rtx tmp1 = gen_reg_rtx (pred_mode);
  aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
  rtx tmp2 = gen_reg_rtx (pred_mode);
  aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
  aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
}

/* Emit the SVE equivalent of:

     (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
     (set TARGET (not TMP))

   where <X> is the operation associated with comparison CODE.
   KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */

static void
aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
				 bool known_ptrue_p, rtx op0, rtx op1)
{
  machine_mode pred_mode = GET_MODE (pred);
  rtx tmp = gen_reg_rtx (pred_mode);
  aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
  aarch64_emit_unop (target, one_cmpl_optab, tmp);
}

/* Expand an SVE floating-point comparison using the SVE equivalent of:

     (set TARGET (CODE OP0 OP1))

   If CAN_INVERT_P is true, the caller can also handle inverted results;
   return true if the result is in fact inverted.  */

bool
aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
				  rtx op0, rtx op1, bool can_invert_p)
{
  machine_mode pred_mode = GET_MODE (target);
  machine_mode data_mode = GET_MODE (op0);

  rtx ptrue = aarch64_ptrue_reg (pred_mode);
  switch (code)
    {
    case UNORDERED:
      /* UNORDERED has no immediate form.  */
      op1 = force_reg (data_mode, op1);
      /* fall through */
    case LT:
    case LE:
    case GT:
    case GE:
    case EQ:
    case NE:
      /* There is native support for the comparison.  */
      aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
      return false;

    case LTGT:
      /* This is a trapping operation (LT or GT).  */
      aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
      return false;

    case UNEQ:
      if (!flag_trapping_math)
	{
	  /* This would trap for signaling NaNs.  */
	  op1 = force_reg (data_mode, op1);
	  aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
					ptrue, true, op0, op1);
	  return false;
	}
      /* fall through */
    case UNLT:
    case UNLE:
    case UNGT:
    case UNGE:
      if (flag_trapping_math)
	{
	  /* Work out which elements are ordered.  */
	  rtx ordered = gen_reg_rtx (pred_mode);
	  op1 = force_reg (data_mode, op1);
	  aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
					   ptrue, true, op0, op1);

	  /* Test the opposite condition for the ordered elements,
	     then invert the result.  */
	  if (code == UNEQ)
	    code = NE;
	  else
	    code = reverse_condition_maybe_unordered (code);
	  if (can_invert_p)
	    {
	      aarch64_emit_sve_fp_cond (target, code,
					ordered, false, op0, op1);
	      return true;
	    }
	  aarch64_emit_sve_invert_fp_cond (target, code,
					   ordered, false, op0, op1);
	  return false;
	}
      break;

    case ORDERED:
      /* ORDERED has no immediate form.  */
      op1 = force_reg (data_mode, op1);
      break;

    default:
      gcc_unreachable ();
    }

  /* There is native support for the inverse comparison.  */
  code = reverse_condition_maybe_unordered (code);
  if (can_invert_p)
    {
      aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
      return true;
    }
  aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
  return false;
}
/* Expand an SVE vcond pattern with operands OPS.  DATA_MODE is the mode
   of the data being selected and CMP_MODE is the mode of the values being
   compared.  */

void
aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
			  rtx *ops)
{
  machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
  rtx pred = gen_reg_rtx (pred_mode);
  if (FLOAT_MODE_P (cmp_mode))
    {
      if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
					    ops[4], ops[5], true))
	std::swap (ops[1], ops[2]);
    }
  else
    aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);

  if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
    ops[1] = force_reg (data_mode, ops[1]);
  /* The "false" value can only be zero if the "true" value is a constant.  */
  if (register_operand (ops[1], data_mode)
      || !aarch64_simd_reg_or_zero (ops[2], data_mode))
    ops[2] = force_reg (data_mode, ops[2]);

  rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
  emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
}

/* Implement TARGET_MODES_TIEABLE_P.  In principle we should always return
   true.  However due to issues with register allocation it is preferable
   to avoid tying integer scalar and FP scalar modes.  Executing integer
   operations in general registers is better than treating them as scalar
   vector operations.  This reduces latency and avoids redundant int<->FP
   moves.  So tie modes if they are either the same class, or vector modes
   with other vector modes, vector structs or any scalar mode.  */

static bool
aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
{
  if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
    return true;

  /* We specifically want to allow elements of "structure" modes to
     be tieable to the structure.  This more general condition allows
     other rarer situations too.  The reason we don't extend this to
     predicate modes is that there are no predicate structure modes
     nor any specific instructions for extracting part of a predicate
     register.  */
  if (aarch64_vector_data_mode_p (mode1)
      && aarch64_vector_data_mode_p (mode2))
    return true;

  /* Also allow any scalar modes with vectors.  */
  if (aarch64_vector_mode_supported_p (mode1)
      || aarch64_vector_mode_supported_p (mode2))
    return true;

  return false;
}

/* Return a new RTX holding the result of moving POINTER forward by
   AMOUNT bytes.  */

static rtx
aarch64_move_pointer (rtx pointer, poly_int64 amount)
{
  rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);

  return adjust_automodify_address (pointer, GET_MODE (pointer),
				    next, amount);
}

/* Return a new RTX holding the result of moving POINTER forward by the
   size of the mode it points to.  */

static rtx
aarch64_progress_pointer (rtx pointer)
{
  return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
}
/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
   MODE bytes.  */

static void
aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
					      machine_mode mode)
{
  rtx reg = gen_reg_rtx (mode);

  /* "Cast" the pointers to the correct mode.  */
  *src = adjust_address (*src, mode, 0);
  *dst = adjust_address (*dst, mode, 0);
  /* Emit the memcpy.  */
  emit_move_insn (reg, *src);
  emit_move_insn (*dst, reg);
  /* Move the pointers forward.  */
  *src = aarch64_progress_pointer (*src);
  *dst = aarch64_progress_pointer (*dst);
}

/* Expand cpymem, as if from a __builtin_memcpy.  Return true if
   we succeed, otherwise return false.  */

bool
aarch64_expand_cpymem (rtx *operands)
{
  int n, mode_bits;
  rtx dst = operands[0];
  rtx src = operands[1];
  rtx base;
  machine_mode cur_mode = BLKmode, next_mode;
  bool speed_p = !optimize_function_for_size_p (cfun);

  /* When optimizing for size, give a better estimate of the length of a
     memcpy call, but use the default otherwise.  Moves larger than 8 bytes
     will always require an even number of instructions to do now.  And each
     operation requires both a load+store, so divide the max number by 2.  */
  int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;

  /* We can't do anything smart if the amount to copy is not constant.  */
  if (!CONST_INT_P (operands[2]))
    return false;

  n = INTVAL (operands[2]);

  /* Try to keep the number of instructions low.  For all cases we will do at
     most two moves for the residual amount, since we'll always overlap the
     remainder.  */
  if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
    return false;

  base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
  dst = adjust_automodify_address (dst, VOIDmode, base, 0);

  base = copy_to_mode_reg (Pmode, XEXP (src, 0));
  src = adjust_automodify_address (src, VOIDmode, base, 0);

  /* Convert n to bits to make the rest of the code simpler.  */
  n = n * BITS_PER_UNIT;

  /* Maximum amount to copy in one go.  The AArch64 back-end has integer modes
     larger than TImode, but we should not use them for loads/stores here.  */
  const int copy_limit = GET_MODE_BITSIZE (TImode);

  while (n > 0)
    {
      /* Find the largest mode in which to do the copy in without over reading
	 or writing.  */
      opt_scalar_int_mode mode_iter;
      FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
	if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
	  cur_mode = mode_iter.require ();

      gcc_assert (cur_mode != BLKmode);

      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
      aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);

      n -= mode_bits;

      /* Do certain trailing copies as overlapping if it's going to be
	 cheaper.  i.e. less instructions to do so.  For instance doing a 15
	 byte copy it's more efficient to do two overlapping 8 byte copies than
	 8 + 4 + 2 + 1.  */
      if (n > 0 && n <= 8 * BITS_PER_UNIT)
	{
	  next_mode = smallest_mode_for_size (n, MODE_INT);
	  int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
	  src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
	  dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
	  n = n_bits;
	}
    }

  return true;
}
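
/* As a worked example: a 15-byte copy is expanded as one 8-byte (DImode)
   copy at offset 0 followed by an overlapping 8-byte copy at offset 7,
   rather than the 8 + 4 + 2 + 1 sequence a naive expansion would use.  */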
/* Split a DImode store of a CONST_INT SRC to MEM DST as two
   SImode stores.  Handle the case when the constant has identical
   bottom and top halves.  This is beneficial when the two stores can be
   merged into an STP and we avoid synthesising potentially expensive
   immediates twice.  Return true if such a split is possible.  */

bool
aarch64_split_dimode_const_store (rtx dst, rtx src)
{
  rtx lo = gen_lowpart (SImode, src);
  rtx hi = gen_highpart_mode (SImode, DImode, src);

  bool size_p = optimize_function_for_size_p (cfun);

  if (!rtx_equal_p (lo, hi))
    return false;

  unsigned int orig_cost
    = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
  unsigned int lo_cost
    = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);

  /* We want to transform:
     MOV	x1, 49370
     MOVK	x1, 0x140, lsl 16
     MOVK	x1, 0xc0da, lsl 32
     MOVK	x1, 0x140, lsl 48
     STR	x1, [x0]
   into:
     MOV	w1, 49370
     MOVK	w1, 0x140, lsl 16
     STP	w1, w1, [x0]
   So we want to perform this only when we save two instructions
   or more.  When optimizing for size, however, accept any code size
   savings we can.  */
  if (size_p && orig_cost <= lo_cost)
    return false;

  if (!size_p
      && (orig_cost <= lo_cost + 1))
    return false;

  rtx mem_lo = adjust_address (dst, SImode, 0);
  if (!aarch64_mem_pair_operand (mem_lo, SImode))
    return false;

  rtx tmp_reg = gen_reg_rtx (SImode);
  aarch64_expand_mov_immediate (tmp_reg, lo);
  rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
  /* Don't emit an explicit store pair as this may not be always profitable.
     Let the sched-fusion logic decide whether to merge them.  */
  emit_move_insn (mem_lo, tmp_reg);
  emit_move_insn (mem_hi, tmp_reg);

  return true;
}
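
/* For instance, the DImode constant 0x0140c0da0140c0da has equal 32-bit
   halves (0x0140c0da, i.e. MOV 49370 plus MOVK 0x140, lsl 16), so one
   SImode immediate and two SImode stores are enough.  */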
/* Generate RTL for a conditional branch with rtx comparison CODE in
   mode CC_MODE.  The destination of the unlikely conditional branch
   is LABEL_REF.  */

void
aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
			      rtx label_ref)
{
  rtx x;
  x = gen_rtx_fmt_ee (code, VOIDmode,
		      gen_rtx_REG (cc_mode, CC_REGNUM),
		      const0_rtx);

  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
			    gen_rtx_LABEL_REF (VOIDmode, label_ref),
			    pc_rtx);
  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
}

/* Generate DImode scratch registers for 128-bit (TImode) addition.

   OP1 represents the TImode destination operand 1
   OP2 represents the TImode destination operand 2
   LOW_DEST represents the low half (DImode) of TImode operand 0
   LOW_IN1 represents the low half (DImode) of TImode operand 1
   LOW_IN2 represents the low half (DImode) of TImode operand 2
   HIGH_DEST represents the high half (DImode) of TImode operand 0
   HIGH_IN1 represents the high half (DImode) of TImode operand 1
   HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */

void
aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
			    rtx *low_in1, rtx *low_in2,
			    rtx *high_dest, rtx *high_in1,
			    rtx *high_in2)
{
  *low_dest = gen_reg_rtx (DImode);
  *low_in1 = gen_lowpart (DImode, op1);
  *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
				  subreg_lowpart_offset (DImode, TImode));
  *high_dest = gen_reg_rtx (DImode);
  *high_in1 = gen_highpart (DImode, op1);
  *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
				   subreg_highpart_offset (DImode, TImode));
}

/* Generate DImode scratch registers for 128-bit (TImode) subtraction.

   This function differs from 'aarch64_addti_scratch_regs' in that
   OP1 can be an immediate constant (zero).  We must call
   subreg_highpart_offset with DImode and TImode arguments, otherwise
   VOIDmode will be used for the const_int which generates an internal
   error from subreg_size_highpart_offset which does not expect a size of zero.

   OP1 represents the TImode destination operand 1
   OP2 represents the TImode destination operand 2
   LOW_DEST represents the low half (DImode) of TImode operand 0
   LOW_IN1 represents the low half (DImode) of TImode operand 1
   LOW_IN2 represents the low half (DImode) of TImode operand 2
   HIGH_DEST represents the high half (DImode) of TImode operand 0
   HIGH_IN1 represents the high half (DImode) of TImode operand 1
   HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */

void
aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
			     rtx *low_in1, rtx *low_in2,
			     rtx *high_dest, rtx *high_in1,
			     rtx *high_in2)
{
  *low_dest = gen_reg_rtx (DImode);
  *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
				  subreg_lowpart_offset (DImode, TImode));

  *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
				  subreg_lowpart_offset (DImode, TImode));
  *high_dest = gen_reg_rtx (DImode);

  *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
				   subreg_highpart_offset (DImode, TImode));
  *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
				   subreg_highpart_offset (DImode, TImode));
}

/* Generate RTL for 128-bit (TImode) subtraction with overflow.

   OP0 represents the TImode destination operand 0
   LOW_DEST represents the low half (DImode) of TImode operand 0
   LOW_IN1 represents the low half (DImode) of TImode operand 1
   LOW_IN2 represents the low half (DImode) of TImode operand 2
   HIGH_DEST represents the high half (DImode) of TImode operand 0
   HIGH_IN1 represents the high half (DImode) of TImode operand 1
   HIGH_IN2 represents the high half (DImode) of TImode operand 2
   UNSIGNED_P is true if the operation is being performed on unsigned
   values.  */

void
aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
		       rtx low_in2, rtx high_dest, rtx high_in1,
		       rtx high_in2, bool unsigned_p)
{
  if (low_in2 == const0_rtx)
    {
      low_dest = low_in1;
      high_in2 = force_reg (DImode, high_in2);
      if (unsigned_p)
	emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
      else
	emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
    }
  else
    {
      if (aarch64_plus_immediate (low_in2, DImode))
	emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
					    GEN_INT (-INTVAL (low_in2))));
      else
	{
	  low_in2 = force_reg (DImode, low_in2);
	  emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
	}
      high_in2 = force_reg (DImode, high_in2);

      if (unsigned_p)
	emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
      else
	emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
    }

  emit_move_insn (gen_lowpart (DImode, op0), low_dest);
  emit_move_insn (gen_highpart (DImode, op0), high_dest);
}

/* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */

static unsigned HOST_WIDE_INT
aarch64_asan_shadow_offset (void)
{
  if (TARGET_ILP32)
    return (HOST_WIDE_INT_1 << 29);
  else
    return (HOST_WIDE_INT_1 << 36);
}
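
/* For reference: with ASan's standard 1-in-8 shadow mapping, the shadow
   address is (address >> 3) plus the offset returned above (1 << 29 for
   ILP32, 1 << 36 for LP64).  */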
static rtx
aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
			int code, tree treeop0, tree treeop1)
{
  machine_mode op_mode, cmp_mode, cc_mode = CCmode;
  rtx op0, op1;
  int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
  insn_code icode;
  struct expand_operand ops[4];

  start_sequence ();
  expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);

  op_mode = GET_MODE (op0);
  if (op_mode == VOIDmode)
    op_mode = GET_MODE (op1);

  switch (op_mode)
    {
    case E_QImode:
    case E_HImode:
    case E_SImode:
      cmp_mode = SImode;
      icode = CODE_FOR_cmpsi;
      break;

    case E_DImode:
      cmp_mode = DImode;
      icode = CODE_FOR_cmpdi;
      break;

    case E_SFmode:
      cmp_mode = SFmode;
      cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
      icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
      break;

    case E_DFmode:
      cmp_mode = DFmode;
      cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
      icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
      break;

    default:
      end_sequence ();
      return NULL_RTX;
    }

  op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
  op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
  if (!op0 || !op1)
    {
      end_sequence ();
      return NULL_RTX;
    }
  *prep_seq = get_insns ();
  end_sequence ();

  create_fixed_operand (&ops[0], op0);
  create_fixed_operand (&ops[1], op1);

  start_sequence ();
  if (!maybe_expand_insn (icode, 2, ops))
    {
      end_sequence ();
      return NULL_RTX;
    }
  *gen_seq = get_insns ();
  end_sequence ();

  return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
			 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
}

static rtx
aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
		       int cmp_code, tree treeop0, tree treeop1, int bit_code)
{
  rtx op0, op1, target;
  machine_mode op_mode, cmp_mode, cc_mode = CCmode;
  int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
  insn_code icode;
  struct expand_operand ops[6];
  int aarch64_cond;

  push_to_sequence (*prep_seq);
  expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);

  op_mode = GET_MODE (op0);
  if (op_mode == VOIDmode)
    op_mode = GET_MODE (op1);

  switch (op_mode)
    {
    case E_QImode:
    case E_HImode:
    case E_SImode:
      cmp_mode = SImode;
      break;

    case E_DImode:
      cmp_mode = DImode;
      break;

    case E_SFmode:
      cmp_mode = SFmode;
      cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
      break;

    case E_DFmode:
      cmp_mode = DFmode;
      cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
      break;

    default:
      end_sequence ();
      return NULL_RTX;
    }

  icode = code_for_ccmp (cc_mode, cmp_mode);

  op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
  op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
  if (!op0 || !op1)
    {
      end_sequence ();
      return NULL_RTX;
    }
  *prep_seq = get_insns ();
  end_sequence ();

  target = gen_rtx_REG (cc_mode, CC_REGNUM);
  aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);

  if (bit_code != AND)
    {
      /* Treat the ccmp patterns as canonical and use them where possible,
	 but fall back to ccmp_rev patterns if there's no other option.  */
      rtx_code prev_code = GET_CODE (prev);
      machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
      if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
	  && !(prev_code == EQ
	       || prev_code == NE
	       || prev_code == ORDERED
	       || prev_code == UNORDERED))
	icode = code_for_ccmp_rev (cc_mode, cmp_mode);
      else
	{
	  rtx_code code = reverse_condition (prev_code);
	  prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
	}
      aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
    }

  create_fixed_operand (&ops[0], XEXP (prev, 0));
  create_fixed_operand (&ops[1], target);
  create_fixed_operand (&ops[2], op0);
  create_fixed_operand (&ops[3], op1);
  create_fixed_operand (&ops[4], prev);
  create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));

  push_to_sequence (*gen_seq);
  if (!maybe_expand_insn (icode, 6, ops))
    {
      end_sequence ();
      return NULL_RTX;
    }

  *gen_seq = get_insns ();
  end_sequence ();

  return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
}

#undef TARGET_GEN_CCMP_FIRST
#define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first

#undef TARGET_GEN_CCMP_NEXT
#define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next

/* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
   instruction fusion of some sort.  */

static bool
aarch64_macro_fusion_p (void)
{
  return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
}
/* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
   should be kept together during scheduling.  */

static bool
aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
{
  rtx prev_set = single_set (prev);
  rtx curr_set = single_set (curr);
  /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
  bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);

  if (!aarch64_macro_fusion_p ())
    return false;

  if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
    {
      /* We are trying to match:
	 prev (mov)  == (set (reg r0) (const_int imm16))
	 curr (movk) == (set (zero_extract (reg r0)
					   (const_int 16)
					   (const_int 16))
			     (const_int imm16_1))  */

      rtx set_dest = SET_DEST (curr_set);

      if (GET_CODE (set_dest) == ZERO_EXTRACT
	  && CONST_INT_P (SET_SRC (curr_set))
	  && CONST_INT_P (SET_SRC (prev_set))
	  && CONST_INT_P (XEXP (set_dest, 2))
	  && INTVAL (XEXP (set_dest, 2)) == 16
	  && REG_P (XEXP (set_dest, 0))
	  && REG_P (SET_DEST (prev_set))
	  && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
	return true;
    }

  if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
    {
      /* We're trying to match:
	 prev (adrp) == (set (reg r1)
			     (high (symbol_ref ("SYM"))))
	 curr (add) == (set (reg r0)
			    (lo_sum (reg r1)
				    (symbol_ref ("SYM"))))
	 Note that r0 need not necessarily be the same as r1, especially
	 during pre-regalloc scheduling.  */

      if (satisfies_constraint_Ush (SET_SRC (prev_set))
	  && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
	{
	  if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
	      && REG_P (XEXP (SET_SRC (curr_set), 0))
	      && REGNO (XEXP (SET_SRC (curr_set), 0))
		 == REGNO (SET_DEST (prev_set))
	      && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
			      XEXP (SET_SRC (curr_set), 1)))
	    return true;
	}
    }

  if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
    {
      /* We're trying to match:
	 prev (movk) == (set (zero_extract (reg r0)
					   (const_int 16)
					   (const_int 32))
			     (const_int imm16_1))
	 curr (movk) == (set (zero_extract (reg r0)
					   (const_int 16)
					   (const_int 48))
			     (const_int imm16_2))  */

      if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
	  && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
	  && REG_P (XEXP (SET_DEST (prev_set), 0))
	  && REG_P (XEXP (SET_DEST (curr_set), 0))
	  && REGNO (XEXP (SET_DEST (prev_set), 0))
	     == REGNO (XEXP (SET_DEST (curr_set), 0))
	  && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
	  && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
	  && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
	  && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
	  && CONST_INT_P (SET_SRC (prev_set))
	  && CONST_INT_P (SET_SRC (curr_set)))
	return true;
    }

  if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
    {
      /* We're trying to match:
	 prev (adrp) == (set (reg r0)
			     (high (symbol_ref ("SYM"))))
	 curr (ldr) == (set (reg r1)
			    (mem (lo_sum (reg r0)
					 (symbol_ref ("SYM")))))
		or
	 curr (ldr) == (set (reg r1)
			    (zero_extend (mem
					  (lo_sum (reg r0)
						  (symbol_ref ("SYM"))))))  */
      if (satisfies_constraint_Ush (SET_SRC (prev_set))
	  && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
	{
	  rtx curr_src = SET_SRC (curr_set);

	  if (GET_CODE (curr_src) == ZERO_EXTEND)
	    curr_src = XEXP (curr_src, 0);

	  if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
	      && REG_P (XEXP (XEXP (curr_src, 0), 0))
	      && REGNO (XEXP (XEXP (curr_src, 0), 0))
		 == REGNO (SET_DEST (prev_set))
	      && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
			      XEXP (SET_SRC (prev_set), 0)))
	    return true;
	}
    }

  /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch.  */
  if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
      && prev_set && curr_set && any_condjump_p (curr)
      && GET_CODE (SET_SRC (prev_set)) == COMPARE
      && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
      && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
    return true;

  /* Fuse flag-setting ALU instructions and conditional branch.  */
  if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
      && any_condjump_p (curr))
    {
      unsigned int condreg1, condreg2;
      rtx cc_reg_1;
      aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
      cc_reg_1 = gen_rtx_REG (CCmode, condreg1);

      if (reg_referenced_p (cc_reg_1, PATTERN (curr))
	  && prev
	  && modified_in_p (cc_reg_1, prev))
	{
	  enum attr_type prev_type = get_attr_type (prev);

	  /* FIXME: this misses some instructions that are considered simple
	     arithmetic instructions for ThunderX.  Simple shifts are missed
	     here.  */
	  if (prev_type == TYPE_ALUS_SREG
	      || prev_type == TYPE_ALUS_IMM
	      || prev_type == TYPE_LOGICS_REG
	      || prev_type == TYPE_LOGICS_IMM)
	    return true;
	}
    }

  /* Fuse ALU instructions and CBZ/CBNZ.  */
  if (prev_set && curr_set
      && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
      && any_condjump_p (curr))
    {
      /* We're trying to match:
	 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
	 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
						(const_int 0))
				  (label_ref ("SYM"))
				  (pc))  */
      if (SET_DEST (curr_set) == (pc_rtx)
	  && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
	  && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
	  && REG_P (SET_DEST (prev_set))
	  && REGNO (SET_DEST (prev_set))
	     == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
	{
	  /* Fuse ALU operations followed by conditional branch instruction.  */
	  switch (get_attr_type (prev))
	    {
	    case TYPE_ALU_IMM:
	    case TYPE_ALU_SREG:
	    case TYPE_ADC_REG:
	    case TYPE_ADC_IMM:
	    case TYPE_ADCS_REG:
	    case TYPE_ADCS_IMM:
	    case TYPE_LOGIC_REG:
	    case TYPE_LOGIC_IMM:
	    case TYPE_CSEL:
	    case TYPE_ADR:
	    case TYPE_MOV_IMM:
	    case TYPE_SHIFT_REG:
	    case TYPE_SHIFT_IMM:
	    case TYPE_BFM:
	    case TYPE_RBIT:
	    case TYPE_REV:
	    case TYPE_EXTEND:
	      return true;

	    default:;
	    }
	}
    }

  return false;
}
/* Return true iff the instruction fusion described by OP is enabled.  */

bool
aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
{
  return (aarch64_tune_params.fusible_ops & op) != 0;
}

/* If MEM is in the form of [base+offset], extract the two parts
   of address and set to BASE and OFFSET, otherwise return false
   after clearing BASE and OFFSET.  */

bool
extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
{
  rtx addr;

  gcc_assert (MEM_P (mem));

  addr = XEXP (mem, 0);

  if (REG_P (addr))
    {
      *base = addr;
      *offset = const0_rtx;
      return true;
    }

  if (GET_CODE (addr) == PLUS
      && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
    {
      *base = XEXP (addr, 0);
      *offset = XEXP (addr, 1);
      return true;
    }

  *base = NULL_RTX;
  *offset = NULL_RTX;

  return false;
}

/* Types for scheduling fusion.  */
enum sched_fusion_type
{
  SCHED_FUSION_NONE = 0,
  SCHED_FUSION_LD_SIGN_EXTEND,
  SCHED_FUSION_LD_ZERO_EXTEND,
  SCHED_FUSION_LD,
  SCHED_FUSION_ST,
  SCHED_FUSION_NUM
};

/* If INSN is a load or store of address in the form of [base+offset],
   extract the two parts and set to BASE and OFFSET.  Return scheduling
   fusion type this INSN is.  */

static enum sched_fusion_type
fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
{
  rtx x, dest, src;
  enum sched_fusion_type fusion = SCHED_FUSION_LD;

  gcc_assert (INSN_P (insn));
  x = PATTERN (insn);
  if (GET_CODE (x) != SET)
    return SCHED_FUSION_NONE;

  src = SET_SRC (x);
  dest = SET_DEST (x);

  machine_mode dest_mode = GET_MODE (dest);

  if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
    return SCHED_FUSION_NONE;

  if (GET_CODE (src) == SIGN_EXTEND)
    {
      fusion = SCHED_FUSION_LD_SIGN_EXTEND;
      src = XEXP (src, 0);
      if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
	return SCHED_FUSION_NONE;
    }
  else if (GET_CODE (src) == ZERO_EXTEND)
    {
      fusion = SCHED_FUSION_LD_ZERO_EXTEND;
      src = XEXP (src, 0);
      if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
	return SCHED_FUSION_NONE;
    }

  if (GET_CODE (src) == MEM && REG_P (dest))
    extract_base_offset_in_addr (src, base, offset);
  else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
    {
      fusion = SCHED_FUSION_ST;
      extract_base_offset_in_addr (dest, base, offset);
    }
  else
    return SCHED_FUSION_NONE;

  if (*base == NULL_RTX || *offset == NULL_RTX)
    fusion = SCHED_FUSION_NONE;

  return fusion;
}

/* Implement the TARGET_SCHED_FUSION_PRIORITY hook.

   Currently we only support fusing ldr and str instructions, so FUSION_PRI
   and PRI are only calculated for these instructions.  For other
   instructions, FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the
   future, other kinds of instruction fusion can be added by returning
   different priorities.

   It's important that irrelevant instructions get the largest FUSION_PRI.  */

static void
aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
			       int *fusion_pri, int *pri)
{
  int tmp, off_val;
  rtx base, offset;
  enum sched_fusion_type fusion;

  gcc_assert (INSN_P (insn));

  tmp = max_pri - 1;
  fusion = fusion_load_store (insn, &base, &offset);
  if (fusion == SCHED_FUSION_NONE)
    {
      *pri = tmp;
      *fusion_pri = tmp;
      return;
    }

  /* Set FUSION_PRI according to fusion type and base register.  */
  *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);

  /* Calculate PRI.  */
  tmp /= 2;

  /* INSN with smaller offset goes first.  */
  off_val = (int)(INTVAL (offset));
  if (off_val >= 0)
    tmp -= (off_val & 0xfffff);
  else
    tmp += ((- off_val) & 0xfffff);

  *pri = tmp;
  return;
}

/* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
   Adjust priority of sha1h instructions so they are scheduled before
   other SHA1 instructions.  */

static int
aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
{
  rtx x = PATTERN (insn);

  if (GET_CODE (x) == SET)
    {
      x = SET_SRC (x);

      if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
	return priority + 10;
    }

  return priority;
}
/* Given OPERANDS of consecutive load/store, check if we can merge
   them into ldp/stp.  LOAD is true if they are load instructions.
   MODE is the mode of memory operands.  */

bool
aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
				machine_mode mode)
{
  HOST_WIDE_INT offval_1, offval_2, msize;
  enum reg_class rclass_1, rclass_2;
  rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;

  if (load)
    {
      mem_1 = operands[1];
      mem_2 = operands[3];
      reg_1 = operands[0];
      reg_2 = operands[2];
      gcc_assert (REG_P (reg_1) && REG_P (reg_2));
      if (REGNO (reg_1) == REGNO (reg_2))
	return false;
    }
  else
    {
      mem_1 = operands[0];
      mem_2 = operands[2];
      reg_1 = operands[1];
      reg_2 = operands[3];
    }

  /* The mems cannot be volatile.  */
  if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
    return false;

  /* If we have SImode and slow unaligned ldp,
     check the alignment to be at least 8 bytes.  */
  if (mode == SImode
      && (aarch64_tune_params.extra_tuning_flags
	  & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
      && !optimize_size
      && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
    return false;

  /* Check if the addresses are in the form of [base+offset].  */
  extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
  if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
    return false;
  extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
  if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
    return false;

  /* Check if the bases are same.  */
  if (!rtx_equal_p (base_1, base_2))
    return false;

  /* The operands must be of the same size.  */
  gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
			GET_MODE_SIZE (GET_MODE (mem_2))));

  offval_1 = INTVAL (offset_1);
  offval_2 = INTVAL (offset_2);
  /* We should only be trying this for fixed-sized modes.  There is no
     SVE LDP/STP instruction.  */
  msize = GET_MODE_SIZE (mode).to_constant ();
  /* Check if the offsets are consecutive.  */
  if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
    return false;

  /* Check if the addresses are clobbered by load.  */
  if (load)
    {
      if (reg_mentioned_p (reg_1, mem_1))
	return false;

      /* In increasing order, the last load can clobber the address.  */
      if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
	return false;
    }

  /* One of the memory accesses must be a mempair operand.
     If it is not the first one, they need to be swapped by the
     peephole.  */
  if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
      && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
    return false;

  if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
    rclass_1 = FP_REGS;
  else
    rclass_1 = GENERAL_REGS;

  if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
    rclass_2 = FP_REGS;
  else
    rclass_2 = GENERAL_REGS;

  /* Check if the registers are of same class.  */
  if (rclass_1 != rclass_2)
    return false;

  return true;
}

/* Given OPERANDS of consecutive load/store that can be merged,
   swap them if they are not in ascending order.  */
void
aarch64_swap_ldrstr_operands (rtx *operands, bool load)
{
  rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
  HOST_WIDE_INT offval_1, offval_2;

  if (load)
    {
      mem_1 = operands[1];
      mem_2 = operands[3];
    }
  else
    {
      mem_1 = operands[0];
      mem_2 = operands[2];
    }

  extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
  extract_base_offset_in_addr (mem_2, &base_2, &offset_2);

  offval_1 = INTVAL (offset_1);
  offval_2 = INTVAL (offset_2);

  if (offval_1 > offval_2)
    {
      /* Irrespective of whether this is a load or a store,
	 we do the same swap.  */
      std::swap (operands[0], operands[2]);
      std::swap (operands[1], operands[3]);
    }
}

/* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
   comparison between the two.  */
int
aarch64_host_wide_int_compare (const void *x, const void *y)
{
  return wi::cmps (* ((const HOST_WIDE_INT *) x),
		   * ((const HOST_WIDE_INT *) y));
}

/* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
   other pointing to a REG rtx containing an offset, compare the offsets
   of the two pairs.

   Return:

	1 iff offset (X) > offset (Y)
	0 iff offset (X) == offset (Y)
	-1 iff offset (X) < offset (Y)  */
int
aarch64_ldrstr_offset_compare (const void *x, const void *y)
{
  const rtx * operands_1 = (const rtx *) x;
  const rtx * operands_2 = (const rtx *) y;
  rtx mem_1, mem_2, base, offset_1, offset_2;

  if (MEM_P (operands_1[0]))
    mem_1 = operands_1[0];
  else
    mem_1 = operands_1[1];

  if (MEM_P (operands_2[0]))
    mem_2 = operands_2[0];
  else
    mem_2 = operands_2[1];

  /* Extract the offsets.  */
  extract_base_offset_in_addr (mem_1, &base, &offset_1);
  extract_base_offset_in_addr (mem_2, &base, &offset_2);

  gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);

  return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
}
/* Given OPERANDS of consecutive load/store, check if we can merge
   them into ldp/stp by adjusting the offset.  LOAD is true if they
   are load instructions.  MODE is the mode of memory operands.

   Given below consecutive stores:

     str  w1, [xb, 0x100]
     str  w1, [xb, 0x104]
     str  w1, [xb, 0x108]
     str  w1, [xb, 0x10c]

   Though the offsets are out of the range supported by stp, we can
   still pair them after adjusting the offset, like:

     add  scratch, xb, 0x100
     stp  w1, w1, [scratch]
     stp  w1, w1, [scratch, 0x8]

   The peephole patterns detecting this opportunity should guarantee
   the scratch register is available.  */

bool
aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
				       scalar_mode mode)
{
  const int num_insns = 4;
  enum reg_class rclass;
  HOST_WIDE_INT offvals[num_insns], msize;
  rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];

  if (load)
    {
      for (int i = 0; i < num_insns; i++)
	{
	  reg[i] = operands[2 * i];
	  mem[i] = operands[2 * i + 1];

	  gcc_assert (REG_P (reg[i]));
	}

      /* Do not attempt to merge the loads if the loads clobber each other.  */
      for (int i = 0; i < 8; i += 2)
	for (int j = i + 2; j < 8; j += 2)
	  if (reg_overlap_mentioned_p (operands[i], operands[j]))
	    return false;
    }
  else
    for (int i = 0; i < num_insns; i++)
      {
	mem[i] = operands[2 * i];
	reg[i] = operands[2 * i + 1];
      }

  /* Skip if memory operand is by itself valid for ldp/stp.  */
  if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
    return false;

  for (int i = 0; i < num_insns; i++)
    {
      /* The mems cannot be volatile.  */
      if (MEM_VOLATILE_P (mem[i]))
	return false;

      /* Check if the addresses are in the form of [base+offset].  */
      extract_base_offset_in_addr (mem[i], base + i, offset + i);
      if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
	return false;
    }

  /* Check if the registers are of same class.  */
  rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
    ? FP_REGS : GENERAL_REGS;

  for (int i = 1; i < num_insns; i++)
    if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
      {
	if (rclass != FP_REGS)
	  return false;
      }
    else
      {
	if (rclass != GENERAL_REGS)
	  return false;
      }

  /* Only the last register in the order in which they occur
     may be clobbered by the load.  */
  if (rclass == GENERAL_REGS && load)
    for (int i = 0; i < num_insns - 1; i++)
      if (reg_mentioned_p (reg[i], mem[i]))
	return false;

  /* Check if the bases are same.  */
  for (int i = 0; i < num_insns - 1; i++)
    if (!rtx_equal_p (base[i], base[i + 1]))
      return false;

  for (int i = 0; i < num_insns; i++)
    offvals[i] = INTVAL (offset[i]);

  msize = GET_MODE_SIZE (mode);

  /* Check if the offsets can be put in the right order to do a ldp/stp.  */
  qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
	 aarch64_host_wide_int_compare);

  if (!(offvals[1] == offvals[0] + msize
	&& offvals[3] == offvals[2] + msize))
    return false;

  /* Check that offsets are within range of each other.  The ldp/stp
     instructions have 7 bit immediate offsets, so use 0x80.  */
  if (offvals[2] - offvals[0] >= msize * 0x80)
    return false;

  /* The offsets must be aligned with respect to each other.  */
  if (offvals[0] % msize != offvals[2] % msize)
    return false;

  /* If we have SImode and slow unaligned ldp,
     check the alignment to be at least 8 bytes.  */
  if (mode == SImode
      && (aarch64_tune_params.extra_tuning_flags
	  & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
      && !optimize_size
      && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
    return false;

  return true;
}
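
/* As a worked example of the range check above: for SImode accesses
   (msize == 4) the four offsets must all lie within 4 * 0x80 = 512 bytes
   of each other, since each resulting LDP/STP immediate is a 7-bit
   scaled offset.  */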
/* Given OPERANDS of consecutive load/store, this function pairs them
   into LDP/STP after adjusting the offset.  It depends on the fact
   that the operands can be sorted so the offsets are correct for STP.
   MODE is the mode of memory operands.  CODE is the rtl operator
   which should be applied to all memory operands, it's SIGN_EXTEND,
   ZERO_EXTEND or UNKNOWN.  */

bool
aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
			     scalar_mode mode, RTX_CODE code)
{
  rtx base, offset_1, offset_3, t1, t2;
  rtx mem_1, mem_2, mem_3, mem_4;
  rtx temp_operands[8];
  HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
		stp_off_upper_limit, stp_off_lower_limit, msize;

  /* We make changes on a copy as we may still bail out.  */
  for (int i = 0; i < 8; i ++)
    temp_operands[i] = operands[i];

  /* Sort the operands.  */
  qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);

  /* Copy the memory operands so that if we have to bail for some
     reason the original addresses are unchanged.  */
  if (load)
    {
      mem_1 = copy_rtx (temp_operands[1]);
      mem_2 = copy_rtx (temp_operands[3]);
      mem_3 = copy_rtx (temp_operands[5]);
      mem_4 = copy_rtx (temp_operands[7]);
    }
  else
    {
      mem_1 = copy_rtx (temp_operands[0]);
      mem_2 = copy_rtx (temp_operands[2]);
      mem_3 = copy_rtx (temp_operands[4]);
      mem_4 = copy_rtx (temp_operands[6]);
      gcc_assert (code == UNKNOWN);
    }

  extract_base_offset_in_addr (mem_1, &base, &offset_1);
  extract_base_offset_in_addr (mem_3, &base, &offset_3);
  gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
	      && offset_3 != NULL_RTX);

  /* Adjust offset so it can fit in LDP/STP instruction.  */
  msize = GET_MODE_SIZE (mode);
  stp_off_upper_limit = msize * (0x40 - 1);
  stp_off_lower_limit = - msize * 0x40;

  off_val_1 = INTVAL (offset_1);
  off_val_3 = INTVAL (offset_3);

  /* The base offset is optimally half way between the two STP/LDP offsets.  */
  if (msize <= 4)
    base_off = (off_val_1 + off_val_3) / 2;
  else
    /* However, due to issues with negative LDP/STP offset generation for
       larger modes, for DF, DI and vector modes, we must not use negative
       addresses smaller than 9 signed unadjusted bits can store.  This
       provides the most range in this case.  */
    base_off = off_val_1;

  /* Adjust the base so that it is aligned with the addresses but still
     optimal.  */
  if (base_off % msize != off_val_1 % msize)
    /* Fix the offset, bearing in mind we want to make it bigger not
       smaller.  */
    base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
  else if (msize <= 4)
    /* The negative range of LDP/STP is one larger than the positive range.  */
    base_off += msize;

  /* Check if base offset is too big or too small.  We can attempt to resolve
     this issue by setting it to the maximum value and seeing if the offsets
     still fit.  */
  if (base_off >= 0x1000)
    {
      base_off = 0x1000 - 1;
      /* We must still make sure that the base offset is aligned with respect
	 to the address.  But it may not be made any bigger.  */
      base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
    }

  /* Likewise for the case where the base is too small.  */
  if (base_off <= -0x1000)
    {
      base_off = -0x1000 + 1;
      base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
    }

  /* Offset of the first STP/LDP.  */
  new_off_1 = off_val_1 - base_off;

  /* Offset of the second STP/LDP.  */
  new_off_3 = off_val_3 - base_off;

  /* The offsets must be within the range of the LDP/STP instructions.  */
  if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
      || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
    return false;

  replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
						  new_off_1), true);
  replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
						  new_off_1 + msize), true);
  replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
						  new_off_3), true);
  replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
						  new_off_3 + msize), true);

  if (!aarch64_mem_pair_operand (mem_1, mode)
      || !aarch64_mem_pair_operand (mem_3, mode))
    return false;

  if (code == ZERO_EXTEND)
    {
      mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
      mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
      mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
      mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
    }
  else if (code == SIGN_EXTEND)
    {
      mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
      mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
      mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
      mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
    }

  if (load)
    {
      operands[0] = temp_operands[0];
      operands[1] = mem_1;
      operands[2] = temp_operands[2];
      operands[3] = mem_2;
      operands[4] = temp_operands[4];
      operands[5] = mem_3;
      operands[6] = temp_operands[6];
      operands[7] = mem_4;
    }
  else
    {
      operands[0] = mem_1;
      operands[1] = temp_operands[1];
      operands[2] = mem_2;
      operands[3] = temp_operands[3];
      operands[4] = mem_3;
      operands[5] = temp_operands[5];
      operands[6] = mem_4;
      operands[7] = temp_operands[7];
    }

  /* Emit adjusting instruction.  */
  emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
  /* Emit ldp/stp instructions.  */
  t1 = gen_rtx_SET (operands[0], operands[1]);
  t2 = gen_rtx_SET (operands[2], operands[3]);
  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
  t1 = gen_rtx_SET (operands[4], operands[5]);
  t2 = gen_rtx_SET (operands[6], operands[7]);
  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
  return true;
}
/* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.  Assume for now that
   it isn't worth branching around empty masked ops (including masked
   stores).  */

static bool
aarch64_empty_mask_is_expensive (unsigned)
{
  return false;
}

/* Return 1 if pseudo register should be created and used to hold
   GOT address for PIC code.  */

bool
aarch64_use_pseudo_pic_reg (void)
{
  return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
}

/* Implement TARGET_UNSPEC_MAY_TRAP_P.  */

static int
aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
{
  switch (XINT (x, 1))
    {
    case UNSPEC_GOTSMALLPIC:
    case UNSPEC_GOTSMALLPIC28K:
    case UNSPEC_GOTTINYPIC:
      return 0;
    default:
      break;
    }

  return default_unspec_may_trap_p (x, flags);
}

/* If X is a positive CONST_DOUBLE with a value that is a power of 2
   return the log2 of that value.  Otherwise return -1.  */

int
aarch64_fpconst_pow_of_2 (rtx x)
{
  const REAL_VALUE_TYPE *r;

  if (!CONST_DOUBLE_P (x))
    return -1;

  r = CONST_DOUBLE_REAL_VALUE (x);

  if (REAL_VALUE_NEGATIVE (*r)
      || REAL_VALUE_ISNAN (*r)
      || REAL_VALUE_ISINF (*r)
      || !real_isinteger (r, DFmode))
    return -1;

  return exact_log2 (real_to_integer (r));
}
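
/* As an illustration: 8.0 yields 3 and 1.0 yields 0, while 3.0, -4.0 and
   0.5 all yield -1 (not a non-negative integral power of 2).  */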
/* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
   power of 2 (i.e. 1/2^n), return the number of float bits, e.g. for
   x == 1/2^n return n.  Otherwise return -1.  */
int
aarch64_fpconst_pow2_recip (rtx x)
{
  REAL_VALUE_TYPE r0;

  if (!CONST_DOUBLE_P (x))
    return -1;

  r0 = *CONST_DOUBLE_REAL_VALUE (x);
  if (exact_real_inverse (DFmode, &r0)
      && !REAL_VALUE_NEGATIVE (r0))
    {
      int ret = exact_log2 (real_to_integer (&r0));
      if (ret >= 1 && ret <= 32)
	return ret;
    }
  return -1;
}
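
/* For example, 0.25 yields 2 (its reciprocal is 4 == 2^2) and 0.5 yields 1,
   whereas 1.0 (log2 of its reciprocal is 0) and 2^-33 fall outside the
   accepted 1..32 range and yield -1.  */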
/* If X is a vector of equal CONST_DOUBLE values and that value is
   Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */

int
aarch64_vec_fpconst_pow_of_2 (rtx x)
{
  int nelts;
  if (GET_CODE (x) != CONST_VECTOR
      || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
    return -1;

  if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
    return -1;

  int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
  if (firstval <= 0)
    return -1;

  for (int i = 1; i < nelts; i++)
    if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
      return -1;

  return firstval;
}

/* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
   to float.

   __fp16 always promotes through this hook.
   _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
   through the generic excess precision logic rather than here.  */

static tree
aarch64_promoted_type (const_tree t)
{
  if (SCALAR_FLOAT_TYPE_P (t)
      && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
    return float_type_node;

  return NULL_TREE;
}

/* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */

static bool
aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
			   optimization_type opt_type)
{
  switch (op)
    {
    case rsqrt_optab:
      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);

    default:
      return true;
    }
}

/* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook.  */

static unsigned int
aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
					int *offset)
{
  /* Polynomial invariant 1 == (VG / 2) - 1.  */
  gcc_assert (i == 1);
  *factor = 2;
  *offset = 1;
  return AARCH64_DWARF_VG;
}

/* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
   if MODE is HFmode, and punt to the generic implementation otherwise.  */

static bool
aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
{
  return (mode == HFmode
	  ? true
	  : default_libgcc_floating_mode_supported_p (mode));
}

/* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
   if MODE is HFmode, and punt to the generic implementation otherwise.  */

static bool
aarch64_scalar_mode_supported_p (scalar_mode mode)
{
  return (mode == HFmode
	  ? true
	  : default_scalar_mode_supported_p (mode));
}

/* Set the value of FLT_EVAL_METHOD.
   ISO/IEC TS 18661-3 defines two values that we'd like to make use of:

    0: evaluate all operations and constants, whose semantic type has at
       most the range and precision of type float, to the range and
       precision of float; evaluate all other operations and constants to
       the range and precision of the semantic type;

    N, where _FloatN is a supported interchange floating type
       evaluate all operations and constants, whose semantic type has at
       most the range and precision of _FloatN type, to the range and
       precision of the _FloatN type; evaluate all other operations and
       constants to the range and precision of the semantic type;

   If we have the ARMv8.2-A extensions then we support _Float16 in native
   precision, so we should set this to 16.  Otherwise, we support the type,
   but want to evaluate expressions in float precision, so set this to
   0.  */

static enum flt_eval_method
aarch64_excess_precision (enum excess_precision_type type)
{
  switch (type)
    {
    case EXCESS_PRECISION_TYPE_FAST:
    case EXCESS_PRECISION_TYPE_STANDARD:
      /* We can calculate either in 16-bit range and precision or
	 32-bit range and precision.  Make that decision based on whether
	 we have native support for the ARMv8.2-A 16-bit floating-point
	 instructions or not.  */
      return (TARGET_FP_F16INST
	      ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
	      : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
    case EXCESS_PRECISION_TYPE_IMPLICIT:
      return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
    default:
      gcc_unreachable ();
    }
  return FLT_EVAL_METHOD_UNPREDICTABLE;
}
22095 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
22096 scheduled for speculative execution. Reject the long-running division
22097 and square-root instructions. */
22100 aarch64_sched_can_speculate_insn (rtx_insn
*insn
)
22102 switch (get_attr_type (insn
))
22110 case TYPE_NEON_FP_SQRT_S
:
22111 case TYPE_NEON_FP_SQRT_D
:
22112 case TYPE_NEON_FP_SQRT_S_Q
:
22113 case TYPE_NEON_FP_SQRT_D_Q
:
22114 case TYPE_NEON_FP_DIV_S
:
22115 case TYPE_NEON_FP_DIV_D
:
22116 case TYPE_NEON_FP_DIV_S_Q
:
22117 case TYPE_NEON_FP_DIV_D_Q
:
/* Implement TARGET_COMPUTE_PRESSURE_CLASSES.  */

static int
aarch64_compute_pressure_classes (reg_class *classes)
{
  int i = 0;
  classes[i++] = GENERAL_REGS;
  classes[i++] = FP_REGS;
  /* PR_REGS isn't a useful pressure class because many predicate pseudo
     registers need to go in PR_LO_REGS at some point during their
     lifetime.  Splitting it into two halves has the effect of making
     all predicates count against PR_LO_REGS, so that we try whenever
     possible to restrict the number of live predicates to 8.  This
     greatly reduces the amount of spilling in certain loops.  */
  classes[i++] = PR_LO_REGS;
  classes[i++] = PR_HI_REGS;
  return i;
}
/* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */

static bool
aarch64_can_change_mode_class (machine_mode from,
			       machine_mode to, reg_class_t)
{
  unsigned int from_flags = aarch64_classify_vector_mode (from);
  unsigned int to_flags = aarch64_classify_vector_mode (to);

  bool from_sve_p = (from_flags & VEC_ANY_SVE);
  bool to_sve_p = (to_flags & VEC_ANY_SVE);

  bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
  bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);

  bool from_pred_p = (from_flags & VEC_SVE_PRED);
  bool to_pred_p = (to_flags & VEC_SVE_PRED);

  /* Don't allow changes between predicate modes and other modes.
     Only predicate registers can hold predicate modes and only
     non-predicate registers can hold non-predicate modes, so any
     attempt to mix them would require a round trip through memory.  */
  if (from_pred_p != to_pred_p)
    return false;

  /* Don't allow changes between partial SVE modes and other modes.
     The contents of partial SVE modes are distributed evenly across
     the register, whereas GCC expects them to be clustered together.  */
  if (from_partial_sve_p != to_partial_sve_p)
    return false;

  /* Similarly reject changes between partial SVE modes that have
     different patterns of significant and insignificant bits.  */
  if (from_partial_sve_p
      && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
	  || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
    return false;

  if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
    {
      /* Don't allow changes between SVE modes and other modes that might
	 be bigger than 128 bits.  In particular, OImode, CImode and XImode
	 divide into 128-bit quantities while SVE modes divide into
	 BITS_PER_SVE_VECTOR quantities.  */
      if (from_sve_p && !to_sve_p && maybe_gt (GET_MODE_BITSIZE (to), 128))
	return false;
      if (to_sve_p && !from_sve_p && maybe_gt (GET_MODE_BITSIZE (from), 128))
	return false;
    }

  if (BYTES_BIG_ENDIAN)
    {
      /* Don't allow changes between SVE data modes and non-SVE modes.
	 See the comment at the head of aarch64-sve.md for details.  */
      if (from_sve_p != to_sve_p)
	return false;

      /* Don't allow changes in element size: lane 0 of the new vector
	 would not then be lane 0 of the old vector.  See the comment
	 above aarch64_maybe_expand_sve_subreg_move for a more detailed
	 description.

	 In the worst case, this forces a register to be spilled in
	 one mode and reloaded in the other, which handles the
	 endianness correctly.  */
      if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
	return false;
    }
  return true;
}
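
/* Illustrative example (editor's note, not part of the original source):
   on a big-endian target the hook above rejects an in-place mode change
   such as viewing a VNx8HI value as VNx4SI, because the two SVE modes
   have different element sizes.  As the comment above says, the register
   is then spilled in one mode and reloaded in the other rather than
   being reinterpreted in place.  */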
/* Implement TARGET_SELECT_EARLY_REMAT_MODES.  */

static void
aarch64_select_early_remat_modes (sbitmap modes)
{
  /* SVE values are not normally live across a call, so it should be
     worth doing early rematerialization even in VL-specific mode.  */
  for (int i = 0; i < NUM_MACHINE_MODES; ++i)
    if (aarch64_sve_mode_p ((machine_mode) i))
      bitmap_set_bit (modes, i);
}
/* Override the default target speculation_safe_value.  */
static rtx
aarch64_speculation_safe_value (machine_mode mode,
				rtx result, rtx val, rtx failval)
{
  /* Maybe we should warn if falling back to hard barriers.  They are
     likely to be noticeably more expensive than the alternative below.  */
  if (!aarch64_track_speculation)
    return default_speculation_safe_value (mode, result, val, failval);

  if (!register_operand (val, mode))
    val = copy_to_mode_reg (mode, val);

  if (!aarch64_reg_or_zero (failval, mode))
    failval = copy_to_mode_reg (mode, failval);

  emit_insn (gen_despeculate_copy (mode, result, val, failval));
  return result;
}
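
/* Illustrative note (editor's sketch, not part of the original source):
   for a call such as

     v = __builtin_speculation_safe_value (v, 0);

   compiling with -mtrack-speculation lets the despeculate_copy pattern
   used above choose between V and the fail value according to the
   speculation-tracking state, whereas without -mtrack-speculation the
   generic fallback emits a full speculation barrier instead.  */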
/* Implement TARGET_ESTIMATED_POLY_VALUE.
   Look into the tuning structure for an estimate.
   VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
   Advanced SIMD 128 bits.  */

static HOST_WIDE_INT
aarch64_estimated_poly_value (poly_int64 val)
{
  enum aarch64_sve_vector_bits_enum width_source
    = aarch64_tune_params.sve_width;

  /* If we still don't have an estimate, use the default.  */
  if (width_source == SVE_SCALABLE)
    return default_estimated_poly_value (val);

  HOST_WIDE_INT over_128 = width_source - 128;
  return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
}
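
/* Worked example (editor's illustration, not part of the original source):
   if the selected tuning reports an SVE width of 256 bits, a poly_int64
   of [16, 16], i.e. 16 + 16 * x where x counts the whole 128-bit chunks
   beyond the first, is estimated as 16 + 16 * (256 - 128) / 128 = 32.  */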
/* Return true for types that could be supported as SIMD return or
   argument types.  */

static bool
supported_simd_type (tree t)
{
  if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
    {
      HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
      return s == 1 || s == 2 || s == 4 || s == 8;
    }
  return false;
}

/* Return true for types that currently are supported as SIMD return
   or argument types.  */

static bool
currently_supported_simd_type (tree t, tree b)
{
  if (COMPLEX_FLOAT_TYPE_P (t))
    return false;

  if (TYPE_SIZE (t) != TYPE_SIZE (b))
    return false;

  return supported_simd_type (t);
}
/* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN.  */

static int
aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
						struct cgraph_simd_clone *clonei,
						tree base_type, int num)
{
  tree t, ret_type, arg_type;
  unsigned int elt_bits, vec_bits, count;

  if (!TARGET_SIMD)
    return 0;

  if (clonei->simdlen
      && (clonei->simdlen < 2
	  || clonei->simdlen > 1024
	  || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
    {
      warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
		  "unsupported simdlen %d", clonei->simdlen);
      return 0;
    }

  ret_type = TREE_TYPE (TREE_TYPE (node->decl));
  if (TREE_CODE (ret_type) != VOID_TYPE
      && !currently_supported_simd_type (ret_type, base_type))
    {
      if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
	warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
		    "GCC does not currently support mixed size types "
		    "for %<simd%> functions");
      else if (supported_simd_type (ret_type))
	warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
		    "GCC does not currently support return type %qT "
		    "for %<simd%> functions", ret_type);
      else
	warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
		    "unsupported return type %qT for %<simd%> functions",
		    ret_type);
      return 0;
    }

  for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
    {
      arg_type = TREE_TYPE (t);

      if (!currently_supported_simd_type (arg_type, base_type))
	{
	  if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
	    warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
			"GCC does not currently support mixed size types "
			"for %<simd%> functions");
	  else
	    warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
			"GCC does not currently support argument type %qT "
			"for %<simd%> functions", arg_type);
	  return 0;
	}
    }

  clonei->vecsize_mangle = 'n';
  clonei->mask_mode = VOIDmode;
  elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
  if (clonei->simdlen == 0)
    {
      count = 2;
      vec_bits = (num == 0 ? 64 : 128);
      clonei->simdlen = vec_bits / elt_bits;
    }
  else
    {
      count = 1;
      vec_bits = clonei->simdlen * elt_bits;
      if (vec_bits != 64 && vec_bits != 128)
	{
	  warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
		      "GCC does not currently support simdlen %d for type %qT",
		      clonei->simdlen, base_type);
	  return 0;
	}
    }
  clonei->vecsize_int = vec_bits;
  clonei->vecsize_float = vec_bits;
  return count;
}
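
/* Illustrative example (editor's sketch, not part of the original source):
   for

     #pragma omp declare simd
     float scale (float x) { return 2.0f * x; }

   BASE_TYPE is float (elt_bits == 32) and no simdlen clause is given, so
   the hook above requests two Advanced SIMD clones: one built for 64-bit
   vectors (simdlen 2) and one for 128-bit vectors (simdlen 4), both using
   the 'n' mangling letter.  */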
/* Implement TARGET_SIMD_CLONE_ADJUST.  */

static void
aarch64_simd_clone_adjust (struct cgraph_node *node)
{
  /* Add aarch64_vector_pcs target attribute to SIMD clones so they
     use the correct ABI.  */

  tree t = TREE_TYPE (node->decl);
  TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
					TYPE_ATTRIBUTES (t));
}
/* Implement TARGET_SIMD_CLONE_USABLE.  */

static int
aarch64_simd_clone_usable (struct cgraph_node *node)
{
  switch (node->simdclone->vecsize_mangle)
    {
    case 'n':
      if (!TARGET_SIMD)
	return -1;
      return 0;
    default:
      gcc_unreachable ();
    }
}
/* Implement TARGET_COMP_TYPE_ATTRIBUTES.  */

static int
aarch64_comp_type_attributes (const_tree type1, const_tree type2)
{
  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
      != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
    return 0;
  return 1;
}
/* Implement TARGET_GET_MULTILIB_ABI_NAME.  */

static const char *
aarch64_get_multilib_abi_name (void)
{
  if (TARGET_BIG_END)
    return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
  return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
}
/* Implement TARGET_STACK_PROTECT_GUARD.  Use the default implementation
   when the guard is a global variable; otherwise return a null tree.  */
static tree
aarch64_stack_protect_guard (void)
{
  if (aarch64_stack_protector_guard == SSP_GLOBAL)
    return default_stack_protect_guard ();

  return NULL_TREE;
}
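
/* Illustrative note (editor's sketch, not part of the original source):
   with options along the lines of

     -fstack-protector-strong -mstack-protector-guard=sysreg
	-mstack-protector-guard-reg=sp_el0 -mstack-protector-guard-offset=8

   the hook above returns NULL_TREE, and the stack-protector patterns load
   the canary from the named system register plus offset instead of from
   the global __stack_chk_guard variable.  */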
/* Return the diagnostic message string if conversion from FROMTYPE to
   TOTYPE is not allowed, NULL otherwise.  */

static const char *
aarch64_invalid_conversion (const_tree fromtype, const_tree totype)
{
  if (element_mode (fromtype) != element_mode (totype))
    {
      /* Do not allow conversions to/from BFmode scalar types.  */
      if (TYPE_MODE (fromtype) == BFmode)
	return N_("invalid conversion from type %<bfloat16_t%>");
      if (TYPE_MODE (totype) == BFmode)
	return N_("invalid conversion to type %<bfloat16_t%>");
    }

  /* Conversion allowed.  */
  return NULL;
}
/* Return the diagnostic message string if the unary operation OP is
   not permitted on TYPE, NULL otherwise.  */

static const char *
aarch64_invalid_unary_op (int op, const_tree type)
{
  /* Reject all single-operand operations on BFmode except for &.  */
  if (element_mode (type) == BFmode && op != ADDR_EXPR)
    return N_("operation not permitted on type %<bfloat16_t%>");

  /* Operation allowed.  */
  return NULL;
}
/* Return the diagnostic message string if the binary operation OP is
   not permitted on TYPE1 and TYPE2, NULL otherwise.  */

static const char *
aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
			   const_tree type2)
{
  /* Reject all 2-operand operations on BFmode.  */
  if (element_mode (type1) == BFmode
      || element_mode (type2) == BFmode)
    return N_("operation not permitted on type %<bfloat16_t%>");

  if (VECTOR_TYPE_P (type1)
      && VECTOR_TYPE_P (type2)
      && !TYPE_INDIVISIBLE_P (type1)
      && !TYPE_INDIVISIBLE_P (type2)
      && (aarch64_sve::builtin_type_p (type1)
	  != aarch64_sve::builtin_type_p (type2)))
    return N_("cannot combine GNU and SVE vectors in a binary operation");

  /* Operation allowed.  */
  return NULL;
}
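
/* Illustrative example (editor's sketch, not part of the original source):
   for scalar bfloat16_t values x and y, an expression such as "x + y" is
   rejected by the hook above with the "operation not permitted on type
   bfloat16_t" message (only taking the address of such a value is
   allowed), and a binary operation whose operands mix a GNU vector type
   with an SVE ACLE vector type draws the "cannot combine GNU and SVE
   vectors" message instead.  */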
/* Implement TARGET_ASM_FILE_END for AArch64.  This adds the AArch64 GNU NOTE
   section at the end if needed.  */
#define GNU_PROPERTY_AARCH64_FEATURE_1_AND	0xc0000000
#define GNU_PROPERTY_AARCH64_FEATURE_1_BTI	(1U << 0)
#define GNU_PROPERTY_AARCH64_FEATURE_1_PAC	(1U << 1)
void
aarch64_file_end_indicate_exec_stack ()
{
  file_end_indicate_exec_stack ();

  unsigned feature_1_and = 0;
  if (aarch64_bti_enabled ())
    feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;

  if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
    feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;

  if (feature_1_and)
    {
      /* Generate .note.gnu.property section.  */
      switch_to_section (get_section (".note.gnu.property",
				      SECTION_NOTYPE, NULL));

      /* PT_NOTE header: namesz, descsz, type.
	 namesz = 4 ("GNU\0")
	 descsz = 16 (Size of the program property array)
		  [(12 + padding) * Number of array elements]
	 type   = 5 (NT_GNU_PROPERTY_TYPE_0).  */
      assemble_align (POINTER_SIZE);
      assemble_integer (GEN_INT (4), 4, 32, 1);
      assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
      assemble_integer (GEN_INT (5), 4, 32, 1);

      /* PT_NOTE name.  */
      assemble_string ("GNU", 4);

      /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
	 type   = GNU_PROPERTY_AARCH64_FEATURE_1_AND
	 datasz = 4
	 data   = feature_1_and.  */
      assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
      assemble_integer (GEN_INT (4), 4, 32, 1);
      assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);

      /* Pad the size of the note to the required alignment.  */
      assemble_align (POINTER_SIZE);
    }
}
#undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
#undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
#undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
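
/* For reference (editor's illustration, not part of the original source),
   the note emitted above for -mbranch-protection=standard on an LP64
   target looks roughly like:

       .section	.note.gnu.property,"a"
       .align	3
       .word	4		// namesz ("GNU\0")
       .word	16		// descsz
       .word	5		// NT_GNU_PROPERTY_TYPE_0
       .string	"GNU"
       .word	0xc0000000	// GNU_PROPERTY_AARCH64_FEATURE_1_AND
       .word	4		// pr_datasz
       .word	3		// BTI | PAC
       .align	3  */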
/* Target-specific selftests.  */

#if CHECKING_P

namespace selftest {

/* Selftest for the RTL loader.
   Verify that the RTL loader copes with a dump from
   print_rtx_function.  This is essentially just a test that class
   function_reader can handle a real dump, but it also verifies
   that lookup_reg_by_dump_name correctly handles hard regs.
   The presence of hard reg names in the dump means that the test is
   target-specific, hence it is in this file.  */

static void
aarch64_test_loading_full_dump ()
{
  rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));

  ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));

  rtx_insn *insn_1 = get_insn_by_uid (1);
  ASSERT_EQ (NOTE, GET_CODE (insn_1));

  rtx_insn *insn_15 = get_insn_by_uid (15);
  ASSERT_EQ (INSN, GET_CODE (insn_15));
  ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));

  /* Verify crtl->return_rtx.  */
  ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
  ASSERT_EQ (0, REGNO (crtl->return_rtx));
  ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
}

/* Run all target-specific selftests.  */

static void
aarch64_run_selftests (void)
{
  aarch64_test_loading_full_dump ();
}

} // namespace selftest

#endif /* #if CHECKING_P */
#undef TARGET_STACK_PROTECT_GUARD
#define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard

#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST aarch64_address_cost
/* This hook determines whether unnamed bitfields affect the alignment
   of the containing structure.  The hook returns true if the structure
   should inherit the alignment requirements of an unnamed bitfield's
   type.  */
#undef TARGET_ALIGN_ANON_BITFIELD
#define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
#undef TARGET_ASM_ALIGNED_DI_OP
#define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"

#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"

#undef TARGET_ASM_ALIGNED_SI_OP
#define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"

#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
  hook_bool_const_tree_hwi_hwi_const_tree_true

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START aarch64_start_file

#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk

#undef TARGET_ASM_SELECT_RTX_SECTION
#define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section

#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
#define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template

#undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
#define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list

#undef TARGET_CALLEE_COPIES
#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE aarch64_can_eliminate

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P aarch64_can_inline_p

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem

#undef TARGET_CASE_VALUES_THRESHOLD
#define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage

#undef TARGET_MEMBER_TYPE_FORCES_BLK
#define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
/* Only the least significant bit is used for initialization guard
   variables.  */
#undef TARGET_CXX_GUARD_MASK_BIT
#define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix

#ifdef TARGET_BIG_ENDIAN_DEFAULT
#undef TARGET_DEFAULT_TARGET_FLAGS
#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
#endif
#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs

#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL aarch64_builtin_decl

#undef TARGET_BUILTIN_RECIPROCAL
#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal

#undef TARGET_C_EXCESS_PRECISION
#define TARGET_C_EXCESS_PRECISION aarch64_excess_precision

#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN aarch64_fold_builtin

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG aarch64_function_arg

#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance

#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary

#undef TARGET_FUNCTION_ARG_PADDING
#define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding

#undef TARGET_GET_RAW_RESULT_MODE
#define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
#undef TARGET_GET_RAW_ARG_MODE
#define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE aarch64_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS aarch64_init_builtins

#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
  aarch64_ira_change_pseudo_allocno_class

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p

#undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
#define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
  aarch64_legitimize_address_displacement

#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode

#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
  aarch64_libgcc_floating_mode_supported_p

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE aarch64_mangle_type

#undef TARGET_INVALID_CONVERSION
#define TARGET_INVALID_CONVERSION aarch64_invalid_conversion

#undef TARGET_INVALID_UNARY_OP
#define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op

#undef TARGET_INVALID_BINARY_OP
#define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op

#undef TARGET_VERIFY_TYPE_CONTEXT
#define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context

#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost

#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul

#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size

/* This target hook should return true if accesses to volatile bitfields
   should use the narrowest mode possible.  It should return false if these
   accesses should use the bitfield container type.  */
#undef TARGET_NARROW_VOLATILE_BITFIELD
#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE aarch64_override_options

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
  aarch64_override_options_after_change

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE aarch64_option_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE aarch64_option_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT aarch64_option_print

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE aarch64_promoted_type

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_VARIABLE_ISSUE
#define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
  aarch64_get_separate_components

#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
  aarch64_components_for_bb

#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
  aarch64_disqualify_components

#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
  aarch64_emit_prologue_components

#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
  aarch64_emit_epilogue_components

#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
  aarch64_set_handled_components

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_COMPATIBLE_VECTOR_TYPES_P
#define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p

#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
  aarch64_builtin_support_vector_misalignment

#undef TARGET_ARRAY_MODE
#define TARGET_ARRAY_MODE aarch64_array_mode

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
  aarch64_autovectorize_vector_modes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv

/* Section anchor support.  */

#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
  aarch64_vectorize_preferred_vector_alignment
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST \
  aarch64_vectorize_vec_perm_const

#undef TARGET_VECTORIZE_RELATED_MODE
#define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
  aarch64_empty_mask_is_expensive
#undef TARGET_PREFERRED_ELSE_VALUE
#define TARGET_PREFERRED_ELSE_VALUE \
  aarch64_preferred_else_value

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
  aarch64_dwarf_poly_indeterminate_value

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_INSN_CALLEE_ABI
#define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi

#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

#undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
#define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
  aarch64_stack_clash_protection_alloca_probe_range

#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class

#undef TARGET_SELECT_EARLY_REMAT_MODES
#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes

#undef TARGET_SPECULATION_SAFE_VALUE
#define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value

#undef TARGET_ESTIMATED_POLY_VALUE
#define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table

#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
  aarch64_simd_clone_compute_vecsize_and_simdlen

#undef TARGET_SIMD_CLONE_ADJUST
#define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust

#undef TARGET_SIMD_CLONE_USABLE
#define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable

#undef TARGET_COMP_TYPE_ATTRIBUTES
#define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes

#undef TARGET_GET_MULTILIB_ABI_NAME
#define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name

#undef TARGET_FNTYPE_ABI
#define TARGET_FNTYPE_ABI aarch64_fntype_abi
#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */
#undef TARGET_ASM_POST_CFI_STARTPROC
#define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc

#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true

#undef TARGET_MD_ASM_ADJUST
#define TARGET_MD_ASM_ADJUST arm_md_asm_adjust

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-aarch64.h"