1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
24 #define INCLUDE_STRING
26 #include "coretypes.h"
37 #include "stringpool.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
56 #include "langhooks.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
77 /* This file should be included last. */
78 #include "target-def.h"
80 /* Defined for convenience. */
81 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
83 /* Information about a legitimate vector immediate operand. */
/* NOTE(review): this declaration is mangled -- the embedded original line
   numbers jump (85, 88, 93, 96, 98-99, 101-104, ...), so the struct's
   braces and several member declarations are missing from this copy.
   Restore from pristine sources; only comments are added here.  */
84 struct simd_immediate_info
/* Which instruction form moves the immediate into a vector register.  */
86 enum insn_type
{ MOV
, MVN
, INDEX
, PTRUE
};
/* Shift modifier applied to MOV/MVN immediates.  */
87 enum modifier_type
{ LSL
, MSL
};
89 simd_immediate_info () {}
/* Floating-point constant: every element has the given mode/value.  */
90 simd_immediate_info (scalar_float_mode
, rtx
);
/* Integer constant, with optional insn/modifier/shift (defaults MOV/LSL).
   NOTE(review): the declaration appears truncated here (original line 93
   with the trailing parameter and ");" is missing).  */
91 simd_immediate_info (scalar_int_mode
, unsigned HOST_WIDE_INT
,
92 insn_type
= MOV
, modifier_type
= LSL
,
/* INDEX-style constant: first element plus per-element step.  */
94 simd_immediate_info (scalar_mode
, rtx
, rtx
);
/* Predicate constant described by an SVE PTRUE pattern.  */
95 simd_immediate_info (scalar_int_mode
, aarch64_svpattern
);
97 /* The mode of the elements. */
100 /* The instruction to use to move the immediate into a vector. */
105 /* For MOV and MVN. */
108 /* The value of each element. */
111 /* The kind of shift modifier to use, and the number of bits to shift.
112 This is (LSL, 0) if no shift is needed. */
113 modifier_type modifier
;
120 /* The value of the first element and the step to be added for each
121 subsequent element. */
/* For PTRUE: the predicate pattern.  */
126 aarch64_svpattern pattern
;
130 /* Construct a floating-point immediate in which each element has mode
131 ELT_MODE_IN and value VALUE_IN. */
/* NOTE(review): mangled copy -- original lines 135 and 138-139 (the body
   braces and, presumably, "u.mov.shift = 0;") are missing; confirm against
   pristine sources before compiling.  Only comments added here.  */
132 inline simd_immediate_info
133 ::simd_immediate_info (scalar_float_mode elt_mode_in
, rtx value_in
)
134 : elt_mode (elt_mode_in
), insn (MOV
)
/* Record the per-element value; a plain FP move takes no shift.  */
136 u
.mov
.value
= value_in
;
137 u
.mov
.modifier
= LSL
;
141 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
142 and value VALUE_IN. The other parameters are as for the structure
/* NOTE(review): mangled copy -- original lines 143, 150 and 154-155 (rest of
   the comment and the body braces) are missing from this extraction.  */
144 inline simd_immediate_info
145 ::simd_immediate_info (scalar_int_mode elt_mode_in
,
146 unsigned HOST_WIDE_INT value_in
,
147 insn_type insn_in
, modifier_type modifier_in
,
148 unsigned int shift_in
)
149 : elt_mode (elt_mode_in
), insn (insn_in
)
/* Canonicalise the raw HOST_WIDE_INT into an rtx of the element mode and
   record the requested modifier/shift.  */
151 u
.mov
.value
= gen_int_mode (value_in
, elt_mode_in
);
152 u
.mov
.modifier
= modifier_in
;
153 u
.mov
.shift
= shift_in
;
156 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
157 and where element I is equal to BASE_IN + I * STEP_IN. */
/* NOTE(review): mangled copy -- original lines 161 and 164-165 (the body
   braces) are missing from this extraction.  */
158 inline simd_immediate_info
159 ::simd_immediate_info (scalar_mode elt_mode_in
, rtx base_in
, rtx step_in
)
160 : elt_mode (elt_mode_in
), insn (INDEX
)
/* Record the INDEX-form parameters: first element and per-element step.  */
162 u
.index
.base
= base_in
;
163 u
.index
.step
= step_in
;
166 /* Construct a predicate that controls elements of mode ELT_MODE_IN
167 and has PTRUE pattern PATTERN_IN. */
/* NOTE(review): mangled copy -- original lines 172 and 174 (the body braces)
   are missing from this extraction.  */
168 inline simd_immediate_info
169 ::simd_immediate_info (scalar_int_mode elt_mode_in
,
170 aarch64_svpattern pattern_in
)
171 : elt_mode (elt_mode_in
), insn (PTRUE
)
/* Record which SVE PTRUE pattern describes the predicate.  */
173 u
.pattern
= pattern_in
;
/* NOTE(review): global target state and forward declarations.  Lines are
   split mid-declaration by the extraction and some originals (e.g. 189,
   191, 198-200) are missing; comments only are added below.  */
176 /* The current code model. */
177 enum aarch64_code_model aarch64_cmodel
;
179 /* The number of 64-bit elements in an SVE vector. */
180 poly_uint16 aarch64_sve_vg
;
183 #undef TARGET_HAVE_TLS
184 #define TARGET_HAVE_TLS 1
/* Forward declarations for target hooks and helpers defined later in the
   file (bodies not visible in this chunk).  */
187 static bool aarch64_composite_type_p (const_tree
, machine_mode
);
188 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode
,
190 machine_mode
*, int *,
192 static void aarch64_elf_asm_constructor (rtx
, int) ATTRIBUTE_UNUSED
;
193 static void aarch64_elf_asm_destructor (rtx
, int) ATTRIBUTE_UNUSED
;
194 static void aarch64_override_options_after_change (void);
195 static bool aarch64_vector_mode_supported_p (machine_mode
);
196 static int aarch64_address_cost (rtx
, machine_mode
, addr_space_t
, bool);
197 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode
,
201 static machine_mode
aarch64_simd_container_mode (scalar_mode
, poly_int64
);
202 static bool aarch64_print_address_internal (FILE*, machine_mode
, rtx
,
203 aarch64_addr_query_type
);
204 static HOST_WIDE_INT
aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val
);
206 /* Major revision number of the ARM Architecture implemented by the target. */
207 unsigned aarch64_architecture_version
;
209 /* The processor for which instructions should be scheduled. */
210 enum aarch64_processor aarch64_tune
= cortexa53
;
212 /* Mask to specify which instruction scheduling options should be used. */
213 uint64_t aarch64_tune_flags
= 0;
215 /* Global flag for PC relative loads. */
216 bool aarch64_pcrelative_literal_loads
;
218 /* Global flag for whether frame pointer is enabled. */
219 bool aarch64_use_frame_pointer
;
/* Maximum accepted length of the -mbranch-protection= option string.  */
221 #define BRANCH_PROTECT_STR_MAX 255
/* Copy of the last successfully parsed -mbranch-protection string, or NULL
   if none has been accepted yet.  */
222 char *accepted_branch_protection_string
= NULL
;
224 static enum aarch64_parse_opt_result
225 aarch64_parse_branch_protection (const char*, char**);
227 /* Support for command line parsing of boolean flags in the tuning
/* NOTE(review): mangled -- the rest of this comment, the struct body of
   aarch64_flag_desc, and the tables' braces (original lines 228, 230-234,
   238, 243-244, 248, 253) are missing from this copy.  */
229 struct aarch64_flag_desc
/* Name/flag pairs for fusible instruction pairs; the real entries are
   expanded from aarch64-fusion-pairs.def via the macro below, bracketed by
   "none"/"all" pseudo-entries and a NULL terminator.  */
235 #define AARCH64_FUSION_PAIR(name, internal_name) \
236 { name, AARCH64_FUSE_##internal_name },
237 static const struct aarch64_flag_desc aarch64_fusible_pairs
[] =
239 { "none", AARCH64_FUSE_NOTHING
},
240 #include "aarch64-fusion-pairs.def"
241 { "all", AARCH64_FUSE_ALL
},
242 { NULL
, AARCH64_FUSE_NOTHING
}
/* Same scheme for the extra tuning options, expanded from
   aarch64-tuning-flags.def.  */
245 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
246 { name, AARCH64_EXTRA_TUNE_##internal_name },
247 static const struct aarch64_flag_desc aarch64_tuning_flags
[] =
249 { "none", AARCH64_EXTRA_TUNE_NONE
},
250 #include "aarch64-tuning-flags.def"
251 { "all", AARCH64_EXTRA_TUNE_ALL
},
252 { NULL
, AARCH64_EXTRA_TUNE_NONE
}
255 /* Tuning parameters. */
/* NOTE(review): per-CPU address-cost and register-move-cost tables.  The
   extraction dropped most initializer lines (each table shows only a few of
   its fields and no braces) -- restore from pristine sources.  Comments
   only are added here.  */
257 static const struct cpu_addrcost_table generic_addrcost_table
=
267 0, /* register_offset */
268 0, /* register_sextend */
269 0, /* register_zextend */
273 static const struct cpu_addrcost_table exynosm1_addrcost_table
=
283 1, /* register_offset */
284 1, /* register_sextend */
285 2, /* register_zextend */
289 static const struct cpu_addrcost_table xgene1_addrcost_table
=
299 0, /* register_offset */
300 1, /* register_sextend */
301 1, /* register_zextend */
305 static const struct cpu_addrcost_table thunderx2t99_addrcost_table
=
315 2, /* register_offset */
316 3, /* register_sextend */
317 3, /* register_zextend */
321 static const struct cpu_addrcost_table tsv110_addrcost_table
=
331 0, /* register_offset */
332 1, /* register_sextend */
333 1, /* register_zextend */
337 static const struct cpu_addrcost_table qdf24xx_addrcost_table
=
347 3, /* register_offset */
348 3, /* register_sextend */
349 3, /* register_zextend */
/* Register-move cost tables: GP<->GP, GP<->FP and FP<->FP move costs per
   CPU (field values missing from this copy).  */
353 static const struct cpu_regmove_cost generic_regmove_cost
=
356 /* Avoid the use of slow int<->fp moves for spilling by setting
357 their cost higher than memmov_cost. */
363 static const struct cpu_regmove_cost cortexa57_regmove_cost
=
366 /* Avoid the use of slow int<->fp moves for spilling by setting
367 their cost higher than memmov_cost. */
373 static const struct cpu_regmove_cost cortexa53_regmove_cost
=
376 /* Avoid the use of slow int<->fp moves for spilling by setting
377 their cost higher than memmov_cost. */
383 static const struct cpu_regmove_cost exynosm1_regmove_cost
=
386 /* Avoid the use of slow int<->fp moves for spilling by setting
387 their cost higher than memmov_cost (actual, 4 and 9). */
393 static const struct cpu_regmove_cost thunderx_regmove_cost
=
401 static const struct cpu_regmove_cost xgene1_regmove_cost
=
404 /* Avoid the use of slow int<->fp moves for spilling by setting
405 their cost higher than memmov_cost. */
411 static const struct cpu_regmove_cost qdf24xx_regmove_cost
=
414 /* Avoid the use of int<->fp moves for spilling. */
420 static const struct cpu_regmove_cost thunderx2t99_regmove_cost
=
423 /* Avoid the use of int<->fp moves for spilling. */
429 static const struct cpu_regmove_cost tsv110_regmove_cost
=
432 /* Avoid the use of slow int<->fp moves for spilling by setting
433 their cost higher than memmov_cost. */
/* NOTE(review): per-CPU vectoriser cost tables.  Field values are visible
   but the surrounding braces/terminators (e.g. original lines 441, 457-458)
   are missing in this copy; restore before compiling.  */
439 /* Generic costs for vector insn classes. */
440 static const struct cpu_vector_cost generic_vector_cost
=
442 1, /* scalar_int_stmt_cost */
443 1, /* scalar_fp_stmt_cost */
444 1, /* scalar_load_cost */
445 1, /* scalar_store_cost */
446 1, /* vec_int_stmt_cost */
447 1, /* vec_fp_stmt_cost */
448 2, /* vec_permute_cost */
449 1, /* vec_to_scalar_cost */
450 1, /* scalar_to_vec_cost */
451 1, /* vec_align_load_cost */
452 1, /* vec_unalign_load_cost */
453 1, /* vec_unalign_store_cost */
454 1, /* vec_store_cost */
455 3, /* cond_taken_branch_cost */
456 1 /* cond_not_taken_branch_cost */
459 /* QDF24XX costs for vector insn classes. */
460 static const struct cpu_vector_cost qdf24xx_vector_cost
=
462 1, /* scalar_int_stmt_cost */
463 1, /* scalar_fp_stmt_cost */
464 1, /* scalar_load_cost */
465 1, /* scalar_store_cost */
466 1, /* vec_int_stmt_cost */
467 3, /* vec_fp_stmt_cost */
468 2, /* vec_permute_cost */
469 1, /* vec_to_scalar_cost */
470 1, /* scalar_to_vec_cost */
471 1, /* vec_align_load_cost */
472 1, /* vec_unalign_load_cost */
473 1, /* vec_unalign_store_cost */
474 1, /* vec_store_cost */
475 3, /* cond_taken_branch_cost */
476 1 /* cond_not_taken_branch_cost */
479 /* ThunderX costs for vector insn classes. */
480 static const struct cpu_vector_cost thunderx_vector_cost
=
482 1, /* scalar_int_stmt_cost */
483 1, /* scalar_fp_stmt_cost */
484 3, /* scalar_load_cost */
485 1, /* scalar_store_cost */
486 4, /* vec_int_stmt_cost */
487 1, /* vec_fp_stmt_cost */
488 4, /* vec_permute_cost */
489 2, /* vec_to_scalar_cost */
490 2, /* scalar_to_vec_cost */
491 3, /* vec_align_load_cost */
492 5, /* vec_unalign_load_cost */
493 5, /* vec_unalign_store_cost */
494 1, /* vec_store_cost */
495 3, /* cond_taken_branch_cost */
496 3 /* cond_not_taken_branch_cost */
/* TSV110 costs for vector insn classes.  */
499 static const struct cpu_vector_cost tsv110_vector_cost
=
501 1, /* scalar_int_stmt_cost */
502 1, /* scalar_fp_stmt_cost */
503 5, /* scalar_load_cost */
504 1, /* scalar_store_cost */
505 2, /* vec_int_stmt_cost */
506 2, /* vec_fp_stmt_cost */
507 2, /* vec_permute_cost */
508 3, /* vec_to_scalar_cost */
509 2, /* scalar_to_vec_cost */
510 5, /* vec_align_load_cost */
511 5, /* vec_unalign_load_cost */
512 1, /* vec_unalign_store_cost */
513 1, /* vec_store_cost */
514 1, /* cond_taken_branch_cost */
515 1 /* cond_not_taken_branch_cost */
518 /* Generic costs for vector insn classes. */
519 static const struct cpu_vector_cost cortexa57_vector_cost
=
521 1, /* scalar_int_stmt_cost */
522 1, /* scalar_fp_stmt_cost */
523 4, /* scalar_load_cost */
524 1, /* scalar_store_cost */
525 2, /* vec_int_stmt_cost */
526 2, /* vec_fp_stmt_cost */
527 3, /* vec_permute_cost */
528 8, /* vec_to_scalar_cost */
529 8, /* scalar_to_vec_cost */
530 4, /* vec_align_load_cost */
531 4, /* vec_unalign_load_cost */
532 1, /* vec_unalign_store_cost */
533 1, /* vec_store_cost */
534 1, /* cond_taken_branch_cost */
535 1 /* cond_not_taken_branch_cost */
/* Exynos M1 costs for vector insn classes.  */
538 static const struct cpu_vector_cost exynosm1_vector_cost
=
540 1, /* scalar_int_stmt_cost */
541 1, /* scalar_fp_stmt_cost */
542 5, /* scalar_load_cost */
543 1, /* scalar_store_cost */
544 3, /* vec_int_stmt_cost */
545 3, /* vec_fp_stmt_cost */
546 3, /* vec_permute_cost */
547 3, /* vec_to_scalar_cost */
548 3, /* scalar_to_vec_cost */
549 5, /* vec_align_load_cost */
550 5, /* vec_unalign_load_cost */
551 1, /* vec_unalign_store_cost */
552 1, /* vec_store_cost */
553 1, /* cond_taken_branch_cost */
554 1 /* cond_not_taken_branch_cost */
557 /* Generic costs for vector insn classes. */
558 static const struct cpu_vector_cost xgene1_vector_cost
=
560 1, /* scalar_int_stmt_cost */
561 1, /* scalar_fp_stmt_cost */
562 5, /* scalar_load_cost */
563 1, /* scalar_store_cost */
564 2, /* vec_int_stmt_cost */
565 2, /* vec_fp_stmt_cost */
566 2, /* vec_permute_cost */
567 4, /* vec_to_scalar_cost */
568 4, /* scalar_to_vec_cost */
569 10, /* vec_align_load_cost */
570 10, /* vec_unalign_load_cost */
571 2, /* vec_unalign_store_cost */
572 2, /* vec_store_cost */
573 2, /* cond_taken_branch_cost */
574 1 /* cond_not_taken_branch_cost */
577 /* Costs for vector insn classes for Vulcan. */
578 static const struct cpu_vector_cost thunderx2t99_vector_cost
=
580 1, /* scalar_int_stmt_cost */
581 6, /* scalar_fp_stmt_cost */
582 4, /* scalar_load_cost */
583 1, /* scalar_store_cost */
584 5, /* vec_int_stmt_cost */
585 6, /* vec_fp_stmt_cost */
586 3, /* vec_permute_cost */
587 6, /* vec_to_scalar_cost */
588 5, /* scalar_to_vec_cost */
589 8, /* vec_align_load_cost */
590 8, /* vec_unalign_load_cost */
591 4, /* vec_unalign_store_cost */
592 4, /* vec_store_cost */
593 2, /* cond_taken_branch_cost */
594 1 /* cond_not_taken_branch_cost */
/* NOTE(review): branch-cost and approximation-mode tables; braces are
   missing from this mangled copy.  Comments only are added.  */
597 /* Generic costs for branch instructions. */
598 static const struct cpu_branch_cost generic_branch_cost
=
600 1, /* Predictable. */
601 3 /* Unpredictable. */
604 /* Generic approximation modes. */
605 static const cpu_approx_modes generic_approx_modes
=
607 AARCH64_APPROX_NONE
, /* division */
608 AARCH64_APPROX_NONE
, /* sqrt */
609 AARCH64_APPROX_NONE
/* recip_sqrt */
612 /* Approximation modes for Exynos M1. */
613 static const cpu_approx_modes exynosm1_approx_modes
=
615 AARCH64_APPROX_NONE
, /* division */
616 AARCH64_APPROX_ALL
, /* sqrt */
617 AARCH64_APPROX_ALL
/* recip_sqrt */
620 /* Approximation modes for X-Gene 1. */
621 static const cpu_approx_modes xgene1_approx_modes
=
623 AARCH64_APPROX_NONE
, /* division */
624 AARCH64_APPROX_NONE
, /* sqrt */
625 AARCH64_APPROX_ALL
/* recip_sqrt */
/* NOTE(review): per-CPU software-prefetch tuning tables.  Each table is
   missing its braces and at least one leading field (the embedded original
   numbering skips e.g. 630-631); -1 appears to mean "use the default /
   unknown" -- confirm against pristine sources.  */
628 /* Generic prefetch settings (which disable prefetch). */
629 static const cpu_prefetch_tune generic_prefetch_tune
=
632 -1, /* l1_cache_size */
633 -1, /* l1_cache_line_size */
634 -1, /* l2_cache_size */
635 true, /* prefetch_dynamic_strides */
636 -1, /* minimum_stride */
637 -1 /* default_opt_level */
640 static const cpu_prefetch_tune exynosm1_prefetch_tune
=
643 -1, /* l1_cache_size */
644 64, /* l1_cache_line_size */
645 -1, /* l2_cache_size */
646 true, /* prefetch_dynamic_strides */
647 -1, /* minimum_stride */
648 -1 /* default_opt_level */
651 static const cpu_prefetch_tune qdf24xx_prefetch_tune
=
654 32, /* l1_cache_size */
655 64, /* l1_cache_line_size */
656 512, /* l2_cache_size */
657 false, /* prefetch_dynamic_strides */
658 2048, /* minimum_stride */
659 3 /* default_opt_level */
662 static const cpu_prefetch_tune thunderxt88_prefetch_tune
=
665 32, /* l1_cache_size */
666 128, /* l1_cache_line_size */
667 16*1024, /* l2_cache_size */
668 true, /* prefetch_dynamic_strides */
669 -1, /* minimum_stride */
670 3 /* default_opt_level */
673 static const cpu_prefetch_tune thunderx_prefetch_tune
=
676 32, /* l1_cache_size */
677 128, /* l1_cache_line_size */
678 -1, /* l2_cache_size */
679 true, /* prefetch_dynamic_strides */
680 -1, /* minimum_stride */
681 -1 /* default_opt_level */
684 static const cpu_prefetch_tune thunderx2t99_prefetch_tune
=
687 32, /* l1_cache_size */
688 64, /* l1_cache_line_size */
689 256, /* l2_cache_size */
690 true, /* prefetch_dynamic_strides */
691 -1, /* minimum_stride */
692 -1 /* default_opt_level */
695 static const cpu_prefetch_tune tsv110_prefetch_tune
=
698 64, /* l1_cache_size */
699 64, /* l1_cache_line_size */
700 512, /* l2_cache_size */
701 true, /* prefetch_dynamic_strides */
702 -1, /* minimum_stride */
703 -1 /* default_opt_level */
706 static const cpu_prefetch_tune xgene1_prefetch_tune
=
709 32, /* l1_cache_size */
710 64, /* l1_cache_line_size */
711 256, /* l2_cache_size */
712 true, /* prefetch_dynamic_strides */
713 -1, /* minimum_stride */
714 -1 /* default_opt_level */
/* NOTE(review): per-CPU tune_params tables combining the cost/prefetch
   structures above with alignment, reassociation-width and fusion settings.
   This copy is mangled: braces and several fields (e.g. memmov_cost /
   issue_rate for most tables -- the embedded numbering skips lines such as
   726-727) are missing and must be restored from pristine sources.  */
717 static const struct tune_params generic_tunings
=
719 &cortexa57_extra_costs
,
720 &generic_addrcost_table
,
721 &generic_regmove_cost
,
722 &generic_vector_cost
,
723 &generic_branch_cost
,
724 &generic_approx_modes
,
725 SVE_NOT_IMPLEMENTED
, /* sve_width */
728 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
729 "16:12", /* function_align. */
730 "4", /* jump_align. */
731 "8", /* loop_align. */
732 2, /* int_reassoc_width. */
733 4, /* fp_reassoc_width. */
734 1, /* vec_reassoc_width. */
735 2, /* min_div_recip_mul_sf. */
736 2, /* min_div_recip_mul_df. */
737 0, /* max_case_values. */
738 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
739 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
740 &generic_prefetch_tune
/* Cortex-A35 tuning.  */
743 static const struct tune_params cortexa35_tunings
=
745 &cortexa53_extra_costs
,
746 &generic_addrcost_table
,
747 &cortexa53_regmove_cost
,
748 &generic_vector_cost
,
749 &generic_branch_cost
,
750 &generic_approx_modes
,
751 SVE_NOT_IMPLEMENTED
, /* sve_width */
754 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
755 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
756 "16", /* function_align. */
757 "4", /* jump_align. */
758 "8", /* loop_align. */
759 2, /* int_reassoc_width. */
760 4, /* fp_reassoc_width. */
761 1, /* vec_reassoc_width. */
762 2, /* min_div_recip_mul_sf. */
763 2, /* min_div_recip_mul_df. */
764 0, /* max_case_values. */
765 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
766 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
767 &generic_prefetch_tune
/* Cortex-A53 tuning.  */
770 static const struct tune_params cortexa53_tunings
=
772 &cortexa53_extra_costs
,
773 &generic_addrcost_table
,
774 &cortexa53_regmove_cost
,
775 &generic_vector_cost
,
776 &generic_branch_cost
,
777 &generic_approx_modes
,
778 SVE_NOT_IMPLEMENTED
, /* sve_width */
781 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
782 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
783 "16", /* function_align. */
784 "4", /* jump_align. */
785 "8", /* loop_align. */
786 2, /* int_reassoc_width. */
787 4, /* fp_reassoc_width. */
788 1, /* vec_reassoc_width. */
789 2, /* min_div_recip_mul_sf. */
790 2, /* min_div_recip_mul_df. */
791 0, /* max_case_values. */
792 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
793 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
794 &generic_prefetch_tune
/* Cortex-A57 tuning.  */
797 static const struct tune_params cortexa57_tunings
=
799 &cortexa57_extra_costs
,
800 &generic_addrcost_table
,
801 &cortexa57_regmove_cost
,
802 &cortexa57_vector_cost
,
803 &generic_branch_cost
,
804 &generic_approx_modes
,
805 SVE_NOT_IMPLEMENTED
, /* sve_width */
808 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
809 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
810 "16", /* function_align. */
811 "4", /* jump_align. */
812 "8", /* loop_align. */
813 2, /* int_reassoc_width. */
814 4, /* fp_reassoc_width. */
815 1, /* vec_reassoc_width. */
816 2, /* min_div_recip_mul_sf. */
817 2, /* min_div_recip_mul_df. */
818 0, /* max_case_values. */
819 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
820 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
), /* tune_flags. */
821 &generic_prefetch_tune
/* Cortex-A72 tuning.  */
824 static const struct tune_params cortexa72_tunings
=
826 &cortexa57_extra_costs
,
827 &generic_addrcost_table
,
828 &cortexa57_regmove_cost
,
829 &cortexa57_vector_cost
,
830 &generic_branch_cost
,
831 &generic_approx_modes
,
832 SVE_NOT_IMPLEMENTED
, /* sve_width */
835 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
836 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
837 "16", /* function_align. */
838 "4", /* jump_align. */
839 "8", /* loop_align. */
840 2, /* int_reassoc_width. */
841 4, /* fp_reassoc_width. */
842 1, /* vec_reassoc_width. */
843 2, /* min_div_recip_mul_sf. */
844 2, /* min_div_recip_mul_df. */
845 0, /* max_case_values. */
846 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
847 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
848 &generic_prefetch_tune
/* Cortex-A73 tuning.  */
851 static const struct tune_params cortexa73_tunings
=
853 &cortexa57_extra_costs
,
854 &generic_addrcost_table
,
855 &cortexa57_regmove_cost
,
856 &cortexa57_vector_cost
,
857 &generic_branch_cost
,
858 &generic_approx_modes
,
859 SVE_NOT_IMPLEMENTED
, /* sve_width */
860 4, /* memmov_cost. */
862 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
863 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
864 "16", /* function_align. */
865 "4", /* jump_align. */
866 "8", /* loop_align. */
867 2, /* int_reassoc_width. */
868 4, /* fp_reassoc_width. */
869 1, /* vec_reassoc_width. */
870 2, /* min_div_recip_mul_sf. */
871 2, /* min_div_recip_mul_df. */
872 0, /* max_case_values. */
873 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
874 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
875 &generic_prefetch_tune
/* Exynos M1 tuning.  */
880 static const struct tune_params exynosm1_tunings
=
882 &exynosm1_extra_costs
,
883 &exynosm1_addrcost_table
,
884 &exynosm1_regmove_cost
,
885 &exynosm1_vector_cost
,
886 &generic_branch_cost
,
887 &exynosm1_approx_modes
,
888 SVE_NOT_IMPLEMENTED
, /* sve_width */
891 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
892 "4", /* function_align. */
893 "4", /* jump_align. */
894 "4", /* loop_align. */
895 2, /* int_reassoc_width. */
896 4, /* fp_reassoc_width. */
897 1, /* vec_reassoc_width. */
898 2, /* min_div_recip_mul_sf. */
899 2, /* min_div_recip_mul_df. */
900 48, /* max_case_values. */
901 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
902 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
903 &exynosm1_prefetch_tune
/* ThunderX T88 tuning.  */
906 static const struct tune_params thunderxt88_tunings
=
908 &thunderx_extra_costs
,
909 &generic_addrcost_table
,
910 &thunderx_regmove_cost
,
911 &thunderx_vector_cost
,
912 &generic_branch_cost
,
913 &generic_approx_modes
,
914 SVE_NOT_IMPLEMENTED
, /* sve_width */
917 AARCH64_FUSE_CMP_BRANCH
, /* fusible_ops */
918 "8", /* function_align. */
919 "8", /* jump_align. */
920 "8", /* loop_align. */
921 2, /* int_reassoc_width. */
922 4, /* fp_reassoc_width. */
923 1, /* vec_reassoc_width. */
924 2, /* min_div_recip_mul_sf. */
925 2, /* min_div_recip_mul_df. */
926 0, /* max_case_values. */
927 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
928 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
), /* tune_flags. */
929 &thunderxt88_prefetch_tune
/* ThunderX tuning.  */
932 static const struct tune_params thunderx_tunings
=
934 &thunderx_extra_costs
,
935 &generic_addrcost_table
,
936 &thunderx_regmove_cost
,
937 &thunderx_vector_cost
,
938 &generic_branch_cost
,
939 &generic_approx_modes
,
940 SVE_NOT_IMPLEMENTED
, /* sve_width */
943 AARCH64_FUSE_CMP_BRANCH
, /* fusible_ops */
944 "8", /* function_align. */
945 "8", /* jump_align. */
946 "8", /* loop_align. */
947 2, /* int_reassoc_width. */
948 4, /* fp_reassoc_width. */
949 1, /* vec_reassoc_width. */
950 2, /* min_div_recip_mul_sf. */
951 2, /* min_div_recip_mul_df. */
952 0, /* max_case_values. */
953 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
954 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
955 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
), /* tune_flags. */
956 &thunderx_prefetch_tune
/* TSV110 tuning.  NOTE(review): the extra-costs entry (original line 961)
   is missing from this copy.  */
959 static const struct tune_params tsv110_tunings
=
962 &tsv110_addrcost_table
,
963 &tsv110_regmove_cost
,
965 &generic_branch_cost
,
966 &generic_approx_modes
,
967 SVE_NOT_IMPLEMENTED
, /* sve_width */
970 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_CMP_BRANCH
971 | AARCH64_FUSE_ALU_BRANCH
), /* fusible_ops */
972 "16", /* function_align. */
973 "4", /* jump_align. */
974 "8", /* loop_align. */
975 2, /* int_reassoc_width. */
976 4, /* fp_reassoc_width. */
977 1, /* vec_reassoc_width. */
978 2, /* min_div_recip_mul_sf. */
979 2, /* min_div_recip_mul_df. */
980 0, /* max_case_values. */
981 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
982 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
983 &tsv110_prefetch_tune
/* X-Gene 1 tuning.  NOTE(review): the extra-costs entry (original line 988)
   is missing from this copy.  */
986 static const struct tune_params xgene1_tunings
=
989 &xgene1_addrcost_table
,
990 &xgene1_regmove_cost
,
992 &generic_branch_cost
,
993 &xgene1_approx_modes
,
994 SVE_NOT_IMPLEMENTED
, /* sve_width */
997 AARCH64_FUSE_NOTHING
, /* fusible_ops */
998 "16", /* function_align. */
999 "16", /* jump_align. */
1000 "16", /* loop_align. */
1001 2, /* int_reassoc_width. */
1002 4, /* fp_reassoc_width. */
1003 1, /* vec_reassoc_width. */
1004 2, /* min_div_recip_mul_sf. */
1005 2, /* min_div_recip_mul_df. */
1006 17, /* max_case_values. */
1007 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
1008 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
), /* tune_flags. */
1009 &xgene1_prefetch_tune
/* eMAG tuning (shares most tables with X-Gene 1).  */
1012 static const struct tune_params emag_tunings
=
1014 &xgene1_extra_costs
,
1015 &xgene1_addrcost_table
,
1016 &xgene1_regmove_cost
,
1017 &xgene1_vector_cost
,
1018 &generic_branch_cost
,
1019 &xgene1_approx_modes
,
1020 SVE_NOT_IMPLEMENTED
,
1021 6, /* memmov_cost */
1023 AARCH64_FUSE_NOTHING
, /* fusible_ops */
1024 "16", /* function_align. */
1025 "16", /* jump_align. */
1026 "16", /* loop_align. */
1027 2, /* int_reassoc_width. */
1028 4, /* fp_reassoc_width. */
1029 1, /* vec_reassoc_width. */
1030 2, /* min_div_recip_mul_sf. */
1031 2, /* min_div_recip_mul_df. */
1032 17, /* max_case_values. */
1033 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
1034 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
), /* tune_flags. */
1035 &xgene1_prefetch_tune
/* QDF24xx (Falkor) tuning.  */
1038 static const struct tune_params qdf24xx_tunings
=
1040 &qdf24xx_extra_costs
,
1041 &qdf24xx_addrcost_table
,
1042 &qdf24xx_regmove_cost
,
1043 &qdf24xx_vector_cost
,
1044 &generic_branch_cost
,
1045 &generic_approx_modes
,
1046 SVE_NOT_IMPLEMENTED
, /* sve_width */
1047 4, /* memmov_cost */
1049 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
1050 | AARCH64_FUSE_MOVK_MOVK
), /* fuseable_ops */
1051 "16", /* function_align. */
1052 "8", /* jump_align. */
1053 "16", /* loop_align. */
1054 2, /* int_reassoc_width. */
1055 4, /* fp_reassoc_width. */
1056 1, /* vec_reassoc_width. */
1057 2, /* min_div_recip_mul_sf. */
1058 2, /* min_div_recip_mul_df. */
1059 0, /* max_case_values. */
1060 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1061 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS
, /* tune_flags. */
1062 &qdf24xx_prefetch_tune
1065 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1067 static const struct tune_params saphira_tunings
=
1069 &generic_extra_costs
,
1070 &generic_addrcost_table
,
1071 &generic_regmove_cost
,
1072 &generic_vector_cost
,
1073 &generic_branch_cost
,
1074 &generic_approx_modes
,
1075 SVE_NOT_IMPLEMENTED
, /* sve_width */
1076 4, /* memmov_cost */
1078 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
1079 | AARCH64_FUSE_MOVK_MOVK
), /* fuseable_ops */
1080 "16", /* function_align. */
1081 "8", /* jump_align. */
1082 "16", /* loop_align. */
1083 2, /* int_reassoc_width. */
1084 4, /* fp_reassoc_width. */
1085 1, /* vec_reassoc_width. */
1086 2, /* min_div_recip_mul_sf. */
1087 2, /* min_div_recip_mul_df. */
1088 0, /* max_case_values. */
1089 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1090 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1091 &generic_prefetch_tune
/* ThunderX2 T99 tuning.  */
1094 static const struct tune_params thunderx2t99_tunings
=
1096 &thunderx2t99_extra_costs
,
1097 &thunderx2t99_addrcost_table
,
1098 &thunderx2t99_regmove_cost
,
1099 &thunderx2t99_vector_cost
,
1100 &generic_branch_cost
,
1101 &generic_approx_modes
,
1102 SVE_NOT_IMPLEMENTED
, /* sve_width */
1103 4, /* memmov_cost. */
1104 4, /* issue_rate. */
1105 (AARCH64_FUSE_CMP_BRANCH
| AARCH64_FUSE_AES_AESMC
1106 | AARCH64_FUSE_ALU_BRANCH
), /* fusible_ops */
1107 "16", /* function_align. */
1108 "8", /* jump_align. */
1109 "16", /* loop_align. */
1110 3, /* int_reassoc_width. */
1111 2, /* fp_reassoc_width. */
1112 2, /* vec_reassoc_width. */
1113 2, /* min_div_recip_mul_sf. */
1114 2, /* min_div_recip_mul_df. */
1115 0, /* max_case_values. */
1116 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1117 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1118 &thunderx2t99_prefetch_tune
/* Neoverse N1 tuning.  */
1121 static const struct tune_params neoversen1_tunings
=
1123 &cortexa57_extra_costs
,
1124 &generic_addrcost_table
,
1125 &generic_regmove_cost
,
1126 &cortexa57_vector_cost
,
1127 &generic_branch_cost
,
1128 &generic_approx_modes
,
1129 SVE_NOT_IMPLEMENTED
, /* sve_width */
1130 4, /* memmov_cost */
1132 AARCH64_FUSE_AES_AESMC
, /* fusible_ops */
1133 "32:16", /* function_align. */
1134 "32:16", /* jump_align. */
1135 "32:16", /* loop_align. */
1136 2, /* int_reassoc_width. */
1137 4, /* fp_reassoc_width. */
1138 2, /* vec_reassoc_width. */
1139 2, /* min_div_recip_mul_sf. */
1140 2, /* min_div_recip_mul_df. */
1141 0, /* max_case_values. */
1142 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1143 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1144 &generic_prefetch_tune
1147 /* Support for fine-grained override of the tuning structures. */
/* NOTE(review): mangled -- the struct's braces, its name member and the
   table braces (original lines 1149-1150, 1152-1153, 1160, 1164) are
   missing.  Maps -moverride= keys to their parser callbacks.  */
1148 struct aarch64_tuning_override_function
/* Callback that parses the override value and updates the tune_params.  */
1151 void (*parse_override
)(const char*, struct tune_params
*);
1154 static void aarch64_parse_fuse_string (const char*, struct tune_params
*);
1155 static void aarch64_parse_tune_string (const char*, struct tune_params
*);
1156 static void aarch64_parse_sve_width_string (const char*, struct tune_params
*);
1158 static const struct aarch64_tuning_override_function
1159 aarch64_tuning_override_functions
[] =
1161 { "fuse", aarch64_parse_fuse_string
},
1162 { "tune", aarch64_parse_tune_string
},
1163 { "sve_width", aarch64_parse_sve_width_string
},
1167 /* A processor implementing AArch64. */
/* NOTE(review): the struct header/braces (original lines 1168-1169, 1177)
   are missing; these members describe one -mcpu/-march entry.  */
1170 const char *const name
;
1171 enum aarch64_processor ident
;
1172 enum aarch64_processor sched_core
;
1173 enum aarch64_arch arch
;
1174 unsigned architecture_version
;
1175 const uint64_t flags
;
1176 const struct tune_params
*const tune
;
1179 /* Architectures implementing AArch64. */
/* Entries are expanded from aarch64-arches.def; NULL-terminated.  */
1180 static const struct processor all_architectures
[] =
1182 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1183 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1184 #include "aarch64-arches.def"
1185 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
1188 /* Processor cores implementing AArch64. */
/* Entries expanded from aarch64-cores.def, plus an explicit "generic"
   entry scheduled as cortexa53; NULL-terminated.  */
1189 static const struct processor all_cores
[] =
1191 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1192 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1193 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1194 FLAGS, &COSTS##_tunings},
1195 #include "aarch64-cores.def"
1196 {"generic", generic
, cortexa53
, AARCH64_ARCH_8A
, 8,
1197 AARCH64_FL_FOR_ARCH8
, &generic_tunings
},
1198 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
1202 /* Target specification. These are populated by the -march, -mtune, -mcpu
1203 handling code or by target attributes. */
1204 static const struct processor
*selected_arch
;
1205 static const struct processor
*selected_cpu
;
1206 static const struct processor
*selected_tune
;
/* Which pointer-authentication key return addresses are signed with
   (key A by default).  */
1208 enum aarch64_key_type aarch64_ra_sign_key
= AARCH64_KEY_A
;
1210 /* The current tuning set. */
1211 struct tune_params aarch64_tune_params
= generic_tunings
;
1213 /* Table of machine attributes. */
/* NOTE(review): the array braces (original lines 1215, 1220) are missing
   from this mangled copy.  */
1214 static const struct attribute_spec aarch64_attribute_table
[] =
1216 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1217 affects_type_identity, handler, exclude } */
1218 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL
, NULL
},
1219 { NULL
, 0, 0, false, false, false, false, NULL
, NULL
}
/* ISA flags of the selected CPU, or 0 before one is chosen.  */
1222 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1224 /* An ISA extension in the co-processor and main instruction set space. */
1225 struct aarch64_option_extension
/* Extension name as written in -march/-mcpu strings (e.g. "sve").  */
1227 const char *const name
;
/* Feature bits turned on by "+name" and turned off by "+noname".  */
1228 const unsigned long flags_on
;
1229 const unsigned long flags_off
;
1232 typedef enum aarch64_cond_code
/* Condition codes in AArch64 encoding order; each even/odd pair are
   inverses of each other, which AARCH64_INVERSE_CONDITION_CODE exploits.  */
1234 AARCH64_EQ
= 0, AARCH64_NE
, AARCH64_CS
, AARCH64_CC
, AARCH64_MI
, AARCH64_PL
,
1235 AARCH64_VS
, AARCH64_VC
, AARCH64_HI
, AARCH64_LS
, AARCH64_GE
, AARCH64_LT
,
1236 AARCH64_GT
, AARCH64_LE
, AARCH64_AL
, AARCH64_NV
/* Invert a condition code by flipping the low bit of its encoding.  */
1240 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1242 struct aarch64_branch_protect_type
1244 /* The type's name that the user passes to the branch-protection option
1247 /* Function to handle the protection type and set global variables.
1248 First argument is the string token corresponding with this type and the
1249 second argument is the next token in the option string.
1251 * AARCH64_PARSE_OK: Handling was successful.
1252 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1253 should print an error.
1254 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1256 enum aarch64_parse_opt_result (*handler
)(char*, char*);
1257 /* A list of types that can follow this type in the option string. */
1258 const aarch64_branch_protect_type
* subtypes
;
1259 unsigned int num_subtypes
;
/* Handler for -mbranch-protection=none: disable both return-address
   signing and BTI.  "none" accepts no subtypes, so any trailing token
   REST is an error.  */
1262 static enum aarch64_parse_opt_result
1263 aarch64_handle_no_branch_protection (char* str
, char* rest
)
1265 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NONE
;
1266 aarch64_enable_bti
= 0;
1269 error ("unexpected %<%s%> after %<%s%>", rest
, str
);
1270 return AARCH64_PARSE_INVALID_FEATURE
;
1272 return AARCH64_PARSE_OK
;
/* Handler for -mbranch-protection=standard: pac-ret (A key, non-leaf
   functions) plus BTI.  "standard" accepts no subtypes, so any trailing
   token REST is an error.  */
1275 static enum aarch64_parse_opt_result
1276 aarch64_handle_standard_branch_protection (char* str
, char* rest
)
1278 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NON_LEAF
;
1279 aarch64_ra_sign_key
= AARCH64_KEY_A
;
1280 aarch64_enable_bti
= 1;
1283 error ("unexpected %<%s%> after %<%s%>", rest
, str
);
1284 return AARCH64_PARSE_INVALID_FEATURE
;
1286 return AARCH64_PARSE_OK
;
/* Handler for the "pac-ret" branch-protection type: sign return addresses
   of non-leaf functions with the A key.  Subtypes ("leaf", "b-key") are
   dispatched separately via aarch64_pac_ret_subtypes.  */
1289 static enum aarch64_parse_opt_result
1290 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED
,
1291 char* rest ATTRIBUTE_UNUSED
)
1293 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NON_LEAF
;
1294 aarch64_ra_sign_key
= AARCH64_KEY_A
;
1295 return AARCH64_PARSE_OK
;
/* Handler for the "pac-ret+leaf" subtype: extend return-address signing
   to leaf functions as well.  */
1298 static enum aarch64_parse_opt_result
1299 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED
,
1300 char* rest ATTRIBUTE_UNUSED
)
1302 aarch64_ra_sign_scope
= AARCH64_FUNCTION_ALL
;
1303 return AARCH64_PARSE_OK
;
/* Handler for the "pac-ret+b-key" subtype: sign return addresses with
   the B key instead of the default A key.  */
1306 static enum aarch64_parse_opt_result
1307 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED
,
1308 char* rest ATTRIBUTE_UNUSED
)
1310 aarch64_ra_sign_key
= AARCH64_KEY_B
;
1311 return AARCH64_PARSE_OK
;
/* Handler for the "bti" branch-protection type: enable Branch Target
   Identification landing pads.  */
1314 static enum aarch64_parse_opt_result
1315 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED
,
1316 char* rest ATTRIBUTE_UNUSED
)
1318 aarch64_enable_bti
= 1;
1319 return AARCH64_PARSE_OK
;
/* Subtypes that may follow "pac-ret" in a -mbranch-protection string;
   NULL-terminated.  */
1322 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes
[] = {
1323 { "leaf", aarch64_handle_pac_ret_leaf
, NULL
, 0 },
1324 { "b-key", aarch64_handle_pac_ret_b_key
, NULL
, 0 },
1325 { NULL
, NULL
, NULL
, 0 }
/* Top-level -mbranch-protection types and their handlers; only "pac-ret"
   has subtypes.  NULL-terminated.  */
1328 static const struct aarch64_branch_protect_type aarch64_branch_protect_types
[] = {
1329 { "none", aarch64_handle_no_branch_protection
, NULL
, 0 },
1330 { "standard", aarch64_handle_standard_branch_protection
, NULL
, 0 },
1331 { "pac-ret", aarch64_handle_pac_ret_protection
, aarch64_pac_ret_subtypes
,
1332 ARRAY_SIZE (aarch64_pac_ret_subtypes
) },
1333 { "bti", aarch64_handle_bti_protection
, NULL
, 0 },
1334 { NULL
, NULL
, NULL
, 0 }
1337 /* The condition codes of the processor, and the inverse function. */
/* Indexed by aarch64_cond_code; used when printing condition suffixes.  */
1338 static const char * const aarch64_condition_codes
[] =
1340 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1341 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1344 /* The preferred condition codes for SVE conditions. */
/* Same indexing as aarch64_condition_codes, but using the SVE aliases
   that describe predicate results (none/any/first/last, etc.).  */
1345 static const char *const aarch64_sve_condition_codes
[] =
1347 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1348 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1351 /* Return the assembly token for svpattern value VALUE. */
/* NOTE(review): the return type, switch head and gcc_unreachable tail of
   this function were dropped by the extraction; only the case expansion
   is visible here.  */
1354 svpattern_token (enum aarch64_svpattern pattern
)
1358 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1359 AARCH64_FOR_SVPATTERN (CASE
)
1361 case AARCH64_NUM_SVPATTERNS
:
1367 /* Generate code to enable conditional branches in functions over 1 MiB. */
/* Emits an inverted short-range branch over an unconditional "b" to the
   real target: OPERANDS[POS_LABEL] is temporarily replaced with a fresh
   local label (prefix DEST) so BRANCH_FORMAT can branch around the far
   jump, then restored.
   NOTE(review): the declaration of the local "buffer" array (original
   line 1374) was dropped by the extraction.  */
1369 aarch64_gen_far_branch (rtx
* operands
, int pos_label
, const char * dest
,
1370 const char * branch_format
)
1372 rtx_code_label
* tmp_label
= gen_label_rtx ();
1373 char label_buf
[256];
1375 ASM_GENERATE_INTERNAL_LABEL (label_buf
, dest
,
1376 CODE_LABEL_NUMBER (tmp_label
));
1377 const char *label_ptr
= targetm
.strip_name_encoding (label_buf
);
/* Swap in the skip label, emit the inverted conditional branch...  */
1378 rtx dest_label
= operands
[pos_label
];
1379 operands
[pos_label
] = tmp_label
;
1381 snprintf (buffer
, sizeof (buffer
), "%s%s", branch_format
, label_ptr
);
1382 output_asm_insn (buffer
, operands
);
/* ...then the far unconditional branch followed by the skip label.  */
1384 snprintf (buffer
, sizeof (buffer
), "b\t%%l%d\n%s:", pos_label
, label_ptr
);
1385 operands
[pos_label
] = dest_label
;
1386 output_asm_insn (buffer
, operands
);
/* Report that MODE cannot be used: tailor the message to whether FP or
   vector types are involved and to whether FP/SIMD was disabled by
   -mgeneral-regs-only or by the +nofp feature modifier.  */
1391 aarch64_err_no_fpadvsimd (machine_mode mode
)
1393 if (TARGET_GENERAL_REGS_ONLY
)
1394 if (FLOAT_MODE_P (mode
))
1395 error ("%qs is incompatible with the use of floating-point types",
1396 "-mgeneral-regs-only");
1398 error ("%qs is incompatible with the use of vector types",
1399 "-mgeneral-regs-only");
1401 if (FLOAT_MODE_P (mode
))
1402 error ("%qs feature modifier is incompatible with the use of"
1403 " floating-point types", "+nofp");
1405 error ("%qs feature modifier is incompatible with the use of"
1406 " vector types", "+nofp");
1409 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1410 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1411 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1412 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1413 and GENERAL_REGS is lower than the memory cost (in this case the best class
1414 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its
1415 cost results in bad allocations with many redundant int<->FP moves which
1416 are expensive on various cores.
1417 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1418 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1419 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1420 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1421 The result of this is that it is no longer inefficient to have a higher
1422 memory move cost than the register move cost.
1426 aarch64_ira_change_pseudo_allocno_class (int regno
, reg_class_t allocno_class
,
1427 reg_class_t best_class
)
/* A class contains POINTER_AND_FP_REGS iff it contains both GENERAL_REGS
   and FP_REGS; anything narrower is kept as-is.  */
1431 if (!reg_class_subset_p (GENERAL_REGS
, allocno_class
)
1432 || !reg_class_subset_p (FP_REGS
, allocno_class
))
1433 return allocno_class
;
1435 if (!reg_class_subset_p (GENERAL_REGS
, best_class
)
1436 || !reg_class_subset_p (FP_REGS
, best_class
))
/* Both classes were POINTER_AND_FP_REGS-like: decide from the mode.  */
1439 mode
= PSEUDO_REGNO_MODE (regno
);
1440 return FLOAT_MODE_P (mode
) || VECTOR_MODE_P (mode
) ? FP_REGS
: GENERAL_REGS
;
/* Return the tuning-dependent minimum number of divisions that makes a
   reciprocal-multiply sequence worthwhile; single-precision (4-byte
   units) and double-precision use separate thresholds.  */
1444 aarch64_min_divisions_for_recip_mul (machine_mode mode
)
1446 if (GET_MODE_UNIT_SIZE (mode
) == 4)
1447 return aarch64_tune_params
.min_div_recip_mul_sf
;
1448 return aarch64_tune_params
.min_div_recip_mul_df
;
1451 /* Return the reassociation width of treeop OPC with mode MODE. */
/* NOTE(review): the fall-through "return 1" at the end of this function
   was dropped by the extraction.  */
1453 aarch64_reassociation_width (unsigned opc
, machine_mode mode
)
1455 if (VECTOR_MODE_P (mode
))
1456 return aarch64_tune_params
.vec_reassoc_width
;
1457 if (INTEGRAL_MODE_P (mode
))
1458 return aarch64_tune_params
.int_reassoc_width
;
1459 /* Avoid reassociating floating point addition so we emit more FMAs. */
1460 if (FLOAT_MODE_P (mode
) && opc
!= PLUS_EXPR
)
1461 return aarch64_tune_params
.fp_reassoc_width
;
1465 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
/* GP regs map to DWARF 0-30, SP to 31, V regs to 64+, SVE P regs to 48+,
   and VG to its dedicated DWARF number.  */
1467 aarch64_dbx_register_number (unsigned regno
)
1469 if (GP_REGNUM_P (regno
))
1470 return AARCH64_DWARF_R0
+ regno
- R0_REGNUM
;
1471 else if (regno
== SP_REGNUM
)
1472 return AARCH64_DWARF_SP
;
1473 else if (FP_REGNUM_P (regno
))
1474 return AARCH64_DWARF_V0
+ regno
- V0_REGNUM
;
1475 else if (PR_REGNUM_P (regno
))
1476 return AARCH64_DWARF_P0
+ regno
- P0_REGNUM
;
1477 else if (regno
== VG_REGNUM
)
1478 return AARCH64_DWARF_VG
;
1480 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1481 equivalent DWARF register. */
1482 return DWARF_FRAME_REGISTERS
;
1485 /* Return true if MODE is any of the Advanced SIMD structure modes. */
/* OI/CI/XI are the 2-, 3- and 4-vector (ld2/ld3/ld4) structure modes;
   the visible condition is also guarded by TARGET_SIMD in the full
   source.  */
1487 aarch64_advsimd_struct_mode_p (machine_mode mode
)
1490 && (mode
== OImode
|| mode
== CImode
|| mode
== XImode
));
1493 /* Return true if MODE is an SVE predicate mode. */
/* One predicate mode per element width: VNx16BI (bytes) down to VNx2BI
   (doublewords); guarded by TARGET_SVE in the full source.  */
1495 aarch64_sve_pred_mode_p (machine_mode mode
)
1498 && (mode
== VNx16BImode
1499 || mode
== VNx8BImode
1500 || mode
== VNx4BImode
1501 || mode
== VNx2BImode
));
1504 /* Three mutually-exclusive flags describing a vector or predicate type. */
1505 const unsigned int VEC_ADVSIMD
= 1;
1506 const unsigned int VEC_SVE_DATA
= 2;
1507 const unsigned int VEC_SVE_PRED
= 4;
1508 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1509 a structure of 2, 3 or 4 vectors. */
1510 const unsigned int VEC_STRUCT
= 8;
1511 /* Useful combinations of the above. */
1512 const unsigned int VEC_ANY_SVE
= VEC_SVE_DATA
| VEC_SVE_PRED
;
1513 const unsigned int VEC_ANY_DATA
= VEC_ADVSIMD
| VEC_SVE_DATA
;
1515 /* Return a set of flags describing the vector properties of mode MODE.
1516 Ignore modes that are not supported by the current target. */
/* NOTE(review): the switch head and the long runs of VNx*/V* case labels
   between the comments below were dropped by the extraction; only the
   grouping comments and returns survive.  */
1518 aarch64_classify_vector_mode (machine_mode mode
)
1520 if (aarch64_advsimd_struct_mode_p (mode
))
1521 return VEC_ADVSIMD
| VEC_STRUCT
;
1523 if (aarch64_sve_pred_mode_p (mode
))
1524 return VEC_SVE_PRED
;
1526 /* Make the decision based on the mode's enum value rather than its
1527 properties, so that we keep the correct classification regardless
1528 of -msve-vector-bits. */
1531 /* Single SVE vectors. */
1539 return TARGET_SVE
? VEC_SVE_DATA
: 0;
1541 /* x2 SVE vectors. */
1549 /* x3 SVE vectors. */
1557 /* x4 SVE vectors. */
1565 return TARGET_SVE
? VEC_SVE_DATA
| VEC_STRUCT
: 0;
1567 /* 64-bit Advanced SIMD vectors. */
1571 /* ...E_V1DImode doesn't exist. */
1575 /* 128-bit Advanced SIMD vectors. */
1583 return TARGET_SIMD
? VEC_ADVSIMD
: 0;
1590 /* Return true if MODE is any of the data vector modes, including
/* (i.e. Advanced SIMD or SVE data, but not SVE predicates).  */
1593 aarch64_vector_data_mode_p (machine_mode mode
)
1595 return aarch64_classify_vector_mode (mode
) & VEC_ANY_DATA
;
1598 /* Return true if MODE is an SVE data vector mode; either a single vector
1599 or a structure of vectors. */
1601 aarch64_sve_data_mode_p (machine_mode mode
)
1603 return aarch64_classify_vector_mode (mode
) & VEC_SVE_DATA
;
1606 /* Implement target hook TARGET_ARRAY_MODE. */
/* For an array of 2-4 single SVE data vectors, return the wide vector
   mode with NELEMS times as many units; otherwise decline (empty
   opt_machine_mode) so the generic handling applies.  */
1607 static opt_machine_mode
1608 aarch64_array_mode (machine_mode mode
, unsigned HOST_WIDE_INT nelems
)
1610 if (aarch64_classify_vector_mode (mode
) == VEC_SVE_DATA
1611 && IN_RANGE (nelems
, 2, 4))
1612 return mode_for_vector (GET_MODE_INNER (mode
),
1613 GET_MODE_NUNITS (mode
) * nelems
);
1615 return opt_machine_mode ();
1618 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
/* True for arrays of 2-4 Advanced SIMD D- or Q-register vectors (the
   ld2/st2 .. ld4/st4 structure forms); guarded by TARGET_SIMD in the
   full source.  */
1620 aarch64_array_mode_supported_p (machine_mode mode
,
1621 unsigned HOST_WIDE_INT nelems
)
1624 && (AARCH64_VALID_SIMD_QREG_MODE (mode
)
1625 || AARCH64_VALID_SIMD_DREG_MODE (mode
))
1626 && (nelems
>= 2 && nelems
<= 4))
1632 /* Return the SVE predicate mode to use for elements that have
1633 ELEM_NBYTES bytes, if such a mode exists. */
/* NOTE(review): the TARGET_SVE guard and the returned mode names
   (VNx16BI/VNx8BI/VNx4BI/VNx2BI for 1/2/4/8-byte elements) were dropped
   by the extraction.  */
1636 aarch64_sve_pred_mode (unsigned int elem_nbytes
)
1640 if (elem_nbytes
== 1)
1642 if (elem_nbytes
== 2)
1644 if (elem_nbytes
== 4)
1646 if (elem_nbytes
== 8)
1649 return opt_machine_mode ();
1652 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
/* For a full SVE vector, pick the predicate mode matching the element
   size; everything else falls back to the generic choice.  */
1654 static opt_machine_mode
1655 aarch64_get_mask_mode (poly_uint64 nunits
, poly_uint64 nbytes
)
1657 if (TARGET_SVE
&& known_eq (nbytes
, BYTES_PER_SVE_VECTOR
))
1659 unsigned int elem_nbytes
= vector_element_size (nbytes
, nunits
);
1660 machine_mode pred_mode
;
1661 if (aarch64_sve_pred_mode (elem_nbytes
).exists (&pred_mode
))
1665 return default_get_mask_mode (nunits
, nbytes
);
1668 /* Return the integer element mode associated with SVE mode MODE. */
/* Derive the element width in bits from the vector width and unit count,
   then require the integer mode of that exact size.  */
1670 static scalar_int_mode
1671 aarch64_sve_element_int_mode (machine_mode mode
)
1673 unsigned int elt_bits
= vector_element_size (BITS_PER_SVE_VECTOR
,
1674 GET_MODE_NUNITS (mode
));
1675 return int_mode_for_size (elt_bits
, 0).require ();
1678 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1679 prefer to use the first arithmetic operand as the else value if
1680 the else value doesn't matter, since that exactly matches the SVE
1681 destructive merging form. For ternary operations we could either
1682 pick the first operand and use FMAD-like instructions or the last
1683 operand and use FMLA-like instructions; the latter seems more
1687 aarch64_preferred_else_value (unsigned, tree
, unsigned int nops
, tree
*ops
)
1689 return nops
== 3 ? ops
[2] : ops
[0];
1692 /* Implement TARGET_HARD_REGNO_NREGS. */
/* Number of hard registers needed to hold MODE in REGNO's class: SVE data
   modes occupy whole SVE vectors in FP regs; other FP-reg modes are
   measured against UNITS_PER_VREG, everything else against
   UNITS_PER_WORD.  */
1695 aarch64_hard_regno_nregs (unsigned regno
, machine_mode mode
)
1697 /* ??? Logically we should only need to provide a value when
1698 HARD_REGNO_MODE_OK says that the combination is valid,
1699 but at the moment we need to handle all modes. Just ignore
1700 any runtime parts for registers that can't store them. */
1701 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
1702 switch (aarch64_regno_regclass (regno
))
1707 if (aarch64_sve_data_mode_p (mode
))
1708 return exact_div (GET_MODE_SIZE (mode
),
1709 BYTES_PER_SVE_VECTOR
).to_constant ();
1710 return CEIL (lowest_size
, UNITS_PER_VREG
);
1716 return CEIL (lowest_size
, UNITS_PER_WORD
);
1721 /* Implement TARGET_HARD_REGNO_MODE_OK. */
/* Validity of MODE in hard register REGNO: CC modes only in CC_REGNUM,
   DImode only in VG, predicate modes only in P regs, pointer modes in
   SP/FP/AP, <=8-byte (or even-regno 16-byte) values in GP regs, and
   vectors/structures in FP regs.  */
1724 aarch64_hard_regno_mode_ok (unsigned regno
, machine_mode mode
)
1726 if (GET_MODE_CLASS (mode
) == MODE_CC
)
1727 return regno
== CC_REGNUM
;
1729 if (regno
== VG_REGNUM
)
1730 /* This must have the same size as _Unwind_Word. */
1731 return mode
== DImode
;
1733 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1734 if (vec_flags
& VEC_SVE_PRED
)
1735 return PR_REGNUM_P (regno
);
1737 if (PR_REGNUM_P (regno
))
1740 if (regno
== SP_REGNUM
)
1741 /* The purpose of comparing with ptr_mode is to support the
1742 global register variable associated with the stack pointer
1743 register via the syntax of asm ("wsp") in ILP32. */
1744 return mode
== Pmode
|| mode
== ptr_mode
;
1746 if (regno
== FRAME_POINTER_REGNUM
|| regno
== ARG_POINTER_REGNUM
)
1747 return mode
== Pmode
;
1749 if (GP_REGNUM_P (regno
))
1751 if (known_le (GET_MODE_SIZE (mode
), 8))
/* 16-byte values need an even/odd GP register pair.  */
1753 else if (known_le (GET_MODE_SIZE (mode
), 16))
1754 return (regno
& 1) == 0;
1756 else if (FP_REGNUM_P (regno
))
1758 if (vec_flags
& VEC_STRUCT
)
1759 return end_hard_regno (mode
, regno
) - 1 <= V31_REGNUM
;
1761 return !VECTOR_MODE_P (mode
) || vec_flags
!= 0;
1767 /* Return true if this is a definition of a vectorized simd function. */
/* A function uses the vector PCS (simd ABI) iff its type carries the
   "aarch64_vector_pcs" attribute.  */
1770 aarch64_simd_decl_p (tree fndecl
)
1776 fntype
= TREE_TYPE (fndecl
);
1780 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1781 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype
)) != NULL
)
1787 /* Return the mode a register save/restore should use. DImode for integer
1788 registers, DFmode for FP registers in non-SIMD functions (they only save
1789 the bottom half of a 128 bit register), or TFmode for FP registers in
/* ...SIMD (vector PCS) functions, which must preserve all 128 bits.  */
1793 aarch64_reg_save_mode (tree fndecl
, unsigned regno
)
1795 return GP_REGNUM_P (regno
)
1797 : (aarch64_simd_decl_p (fndecl
) ? E_TFmode
: E_DFmode
);
1800 /* Return true if the instruction is a call to a SIMD function, false
1801 if it is not a SIMD function or if we do not know anything about
/* Extract the callee SYMBOL_REF from the call rtx; indirect calls and
   symbols without a decl are conservatively treated as non-SIMD.  */
1805 aarch64_simd_call_p (rtx_insn
*insn
)
1811 gcc_assert (CALL_P (insn
));
1812 call
= get_call_rtx_from (insn
);
1813 symbol
= XEXP (XEXP (call
, 0), 0);
1814 if (GET_CODE (symbol
) != SYMBOL_REF
)
1816 fndecl
= SYMBOL_REF_DECL (symbol
);
1820 return aarch64_simd_decl_p (fndecl
);
1823 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1824 a function that uses the SIMD ABI, take advantage of the extra
1825 call-preserved registers that the ABI provides. */
/* Clears from RETURN_SET the FP/SIMD registers that the vector PCS
   guarantees the callee preserves.  */
1828 aarch64_remove_extra_call_preserved_regs (rtx_insn
*insn
,
1829 HARD_REG_SET
*return_set
)
1831 if (aarch64_simd_call_p (insn
))
1833 for (int regno
= 0; regno
< FIRST_PSEUDO_REGISTER
; regno
++)
1834 if (FP_SIMD_SAVED_REGNUM_P (regno
))
1835 CLEAR_HARD_REG_BIT (*return_set
, regno
);
1839 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1840 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1841 clobbers the top 64 bits when restoring the bottom 64 bits. */
/* For SIMD-ABI calls the preserved portion is 128 bits rather than 64,
   so only wider-than-16-byte values are part-clobbered there.  */
1844 aarch64_hard_regno_call_part_clobbered (rtx_insn
*insn
, unsigned int regno
,
1847 bool simd_p
= insn
&& CALL_P (insn
) && aarch64_simd_call_p (insn
);
1848 return FP_REGNUM_P (regno
)
1849 && maybe_gt (GET_MODE_SIZE (mode
), simd_p
? 16 : 8);
1852 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
/* A non-SIMD call clobbers at least as much as a SIMD call, so prefer
   the non-SIMD one of the pair.  */
1855 aarch64_return_call_with_max_clobbers (rtx_insn
*call_1
, rtx_insn
*call_2
)
1857 gcc_assert (CALL_P (call_1
) && CALL_P (call_2
));
1859 if (!aarch64_simd_call_p (call_1
) || aarch64_simd_call_p (call_2
))
1865 /* Implement REGMODE_NATURAL_SIZE. */
1867 aarch64_regmode_natural_size (machine_mode mode
)
1869 /* The natural size for SVE data modes is one SVE data vector,
1870 and similarly for predicates. We can't independently modify
1871 anything smaller than that. */
1872 /* ??? For now, only do this for variable-width SVE registers.
1873 Doing it for constant-sized registers breaks lower-subreg.c. */
1874 /* ??? And once that's fixed, we should probably have similar
1875 code for Advanced SIMD. */
1876 if (!aarch64_sve_vg
.is_constant ())
1878 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1879 if (vec_flags
& VEC_SVE_PRED
)
1880 return BYTES_PER_SVE_PRED
;
1881 if (vec_flags
& VEC_SVE_DATA
)
1882 return BYTES_PER_SVE_VECTOR
;
1884 return UNITS_PER_WORD
;
1887 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1889 aarch64_hard_regno_caller_save_mode (unsigned regno
, unsigned,
1892 /* The predicate mode determines which bits are significant and
1893 which are "don't care". Decreasing the number of lanes would
1894 lose data while increasing the number of lanes would make bits
1895 unnecessarily significant. */
1896 if (PR_REGNUM_P (regno
))
1898 if (known_ge (GET_MODE_SIZE (mode
), 4))
1904 /* Return true if I's bits are consecutive ones from the MSB. */
/* -I of such a value is a power of two, so exact_log2 succeeds
   (returns != -1) exactly for sign-extended masks like 0xFFFF0000...  */
1906 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i
)
1908 return exact_log2 (-i
) != HOST_WIDE_INT_M1
;
1911 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1912 that strcpy from constants will be faster. */
/* Skipped at -Os since the extra alignment costs space.  */
1914 static HOST_WIDE_INT
1915 aarch64_constant_alignment (const_tree exp
, HOST_WIDE_INT align
)
1917 if (TREE_CODE (exp
) == STRING_CST
&& !optimize_size
)
1918 return MAX (align
, BITS_PER_WORD
);
1922 /* Return true if calls to DECL should be treated as
1923 long-calls (ie called via a register). */
/* AArch64 currently never forces long calls; always false.  */
1925 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED
)
1930 /* Return true if calls to symbol-ref SYM should be treated as
1931 long-calls (ie called via a register). */
1933 aarch64_is_long_call_p (rtx sym
)
1935 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym
));
1938 /* Return true if calls to symbol-ref SYM should not go through
/* ...the PLT: the decl carries the "noplt" attribute (or -fno-plt is in
   force) and the symbol does not bind locally.  */
1942 aarch64_is_noplt_call_p (rtx sym
)
1944 const_tree decl
= SYMBOL_REF_DECL (sym
);
1949 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl
)))
1950 && !targetm
.binds_local_p (decl
))
1956 /* Return true if the offsets to a zero/sign-extract operation
1957 represent an expression that matches an extend operation. The
1958 operands represent the parameters from
1960 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
/* I.e. EXTRACT_IMM encodes a field width (multiple of 8 above 8 bits)
   plus a shift amount 0-4 in its low 3 bits, and MULT_IMM must be the
   matching power of two for that shift.  */
1962 aarch64_is_extend_from_extract (scalar_int_mode mode
, rtx mult_imm
,
1965 HOST_WIDE_INT mult_val
, extract_val
;
1967 if (! CONST_INT_P (mult_imm
) || ! CONST_INT_P (extract_imm
))
1970 mult_val
= INTVAL (mult_imm
);
1971 extract_val
= INTVAL (extract_imm
);
1974 && extract_val
< GET_MODE_BITSIZE (mode
)
1975 && exact_log2 (extract_val
& ~7) > 0
1976 && (extract_val
& 7) <= 4
1977 && mult_val
== (1 << (extract_val
& 7)))
1983 /* Emit an insn that's a simple single-set. Both the operands must be
1984 known to be valid. */
1985 inline static rtx_insn
*
1986 emit_set_insn (rtx x
, rtx y
)
1988 return emit_insn (gen_rtx_SET (x
, y
));
1991 /* X and Y are two things to compare using CODE. Emit the compare insn and
1992 return the rtx for register 0 in the proper mode. */
/* SELECT_CC_MODE picks the CC mode implied by CODE and the operand
   shapes; the comparison result lands in the fixed CC register.  */
1994 aarch64_gen_compare_reg (RTX_CODE code
, rtx x
, rtx y
)
1996 machine_mode mode
= SELECT_CC_MODE (code
, x
, y
);
1997 rtx cc_reg
= gen_rtx_REG (mode
, CC_REGNUM
);
1999 emit_set_insn (cc_reg
, gen_rtx_COMPARE (mode
, x
, y
));
2003 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
/* A constant Y is masked to Y_MODE directly; a register Y is compared
   via a swapped-operand CC mode (CC_SWPmode) with an explicit
   zero_extend, since the hardware compare works on at least SImode.  */
2006 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code
, rtx x
, rtx y
,
2007 machine_mode y_mode
)
2009 if (y_mode
== E_QImode
|| y_mode
== E_HImode
)
2011 if (CONST_INT_P (y
))
2012 y
= GEN_INT (INTVAL (y
) & GET_MODE_MASK (y_mode
));
2016 machine_mode cc_mode
;
2018 t
= gen_rtx_ZERO_EXTEND (SImode
, y
);
2019 t
= gen_rtx_COMPARE (CC_SWPmode
, t
, x
);
2020 cc_mode
= CC_SWPmode
;
2021 cc_reg
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
2022 emit_set_insn (cc_reg
, t
);
/* Fall back to the plain compare for wider modes.  */
2027 return aarch64_gen_compare_reg (code
, x
, y
);
2030 /* Build the SYMBOL_REF for __tls_get_addr. */
/* Cached (GC-rooted) so the libfunc is created only once.  */
2032 static GTY(()) rtx tls_get_addr_libfunc
;
2035 aarch64_tls_get_addr (void)
2037 if (!tls_get_addr_libfunc
)
2038 tls_get_addr_libfunc
= init_one_libfunc ("__tls_get_addr");
2039 return tls_get_addr_libfunc
;
2042 /* Return the TLS model to use for ADDR. */
/* Handles both a bare SYMBOL_REF and a CONST-wrapped symbol+offset; any
   other rtx yields TLS_MODEL_NONE.  */
2044 static enum tls_model
2045 tls_symbolic_operand_type (rtx addr
)
2047 enum tls_model tls_kind
= TLS_MODEL_NONE
;
2048 if (GET_CODE (addr
) == CONST
)
2051 rtx sym
= strip_offset (addr
, &addend
);
2052 if (GET_CODE (sym
) == SYMBOL_REF
)
2053 tls_kind
= SYMBOL_REF_TLS_MODEL (sym
);
2055 else if (GET_CODE (addr
) == SYMBOL_REF
)
2056 tls_kind
= SYMBOL_REF_TLS_MODEL (addr
);
2061 /* We'll allow lo_sum's in addresses in our legitimate addresses
2062 so that combine would take care of combining addresses where
2063 necessary, but for generation purposes, we'll generate the address
2066 tmp = hi (symbol_ref); adrp x1, foo
2067 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2071 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2072 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2076 Load TLS symbol, depending on TLS mechanism and TLS access model.
2078 Global Dynamic - Traditional TLS:
2079 adrp tmp, :tlsgd:imm
2080 add dest, tmp, #:tlsgd_lo12:imm
2083 Global Dynamic - TLS Descriptors:
2084 adrp dest, :tlsdesc:imm
2085 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2086 add dest, dest, #:tlsdesc_lo12:imm
2093 adrp tmp, :gottprel:imm
2094 ldr dest, [tmp, #:gottprel_lo12:imm]
2099 add t0, tp, #:tprel_hi12:imm, lsl #12
2100 add t0, t0, #:tprel_lo12_nc:imm
/* Emit the instruction sequence that materializes symbolic constant IMM
   into DEST, according to the symbol classification TYPE.
   NOTE(review): several structural lines of this large switch (case
   braces, some emit calls) were dropped by the extraction; the visible
   lines are kept byte-for-byte below.  */
2104 aarch64_load_symref_appropriately (rtx dest
, rtx imm
,
2105 enum aarch64_symbol_type type
)
2109 case SYMBOL_SMALL_ABSOLUTE
:
2111 /* In ILP32, the mode of dest can be either SImode or DImode. */
2113 machine_mode mode
= GET_MODE (dest
);
2115 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
2117 if (can_create_pseudo_p ())
2118 tmp_reg
= gen_reg_rtx (mode
);
2120 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
2121 emit_insn (gen_add_losym (dest
, tmp_reg
, imm
));
2125 case SYMBOL_TINY_ABSOLUTE
:
2126 emit_insn (gen_rtx_SET (dest
, imm
));
2129 case SYMBOL_SMALL_GOT_28K
:
2131 machine_mode mode
= GET_MODE (dest
);
2132 rtx gp_rtx
= pic_offset_table_rtx
;
2136 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2137 here before rtl expand. Tree IVOPT will generate rtl pattern to
2138 decide rtx costs, in which case pic_offset_table_rtx is not
2139 initialized. For that case no need to generate the first adrp
2140 instruction as the final cost for global variable access is
2144 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
2145 using the page base as GOT base, the first page may be wasted,
2146 in the worst scenario, there is only 28K space for GOT).
2148 The generated instruction sequence for accessing global variable
2151 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2153 Only one instruction needed. But we must initialize
2154 pic_offset_table_rtx properly. We generate initialize insn for
2155 every global access, and allow CSE to remove all redundant.
2157 The final instruction sequences will look like the following
2158 for multiple global variable accesses.
2160 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2162 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2163 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2164 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2167 rtx s
= gen_rtx_SYMBOL_REF (Pmode
, "_GLOBAL_OFFSET_TABLE_");
2168 crtl
->uses_pic_offset_table
= 1;
2169 emit_move_insn (gp_rtx
, gen_rtx_HIGH (Pmode
, s
));
2171 if (mode
!= GET_MODE (gp_rtx
))
2172 gp_rtx
= gen_lowpart (mode
, gp_rtx
);
2176 if (mode
== ptr_mode
)
2179 insn
= gen_ldr_got_small_28k_di (dest
, gp_rtx
, imm
);
2181 insn
= gen_ldr_got_small_28k_si (dest
, gp_rtx
, imm
);
2183 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
2187 gcc_assert (mode
== Pmode
);
2189 insn
= gen_ldr_got_small_28k_sidi (dest
, gp_rtx
, imm
);
2190 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
2193 /* The operand is expected to be MEM. Whenever the related insn
2194 pattern changed, above code which calculate mem should be
2196 gcc_assert (GET_CODE (mem
) == MEM
);
2197 MEM_READONLY_P (mem
) = 1;
2198 MEM_NOTRAP_P (mem
) = 1;
2203 case SYMBOL_SMALL_GOT_4G
:
2205 /* In ILP32, the mode of dest can be either SImode or DImode,
2206 while the got entry is always of SImode size. The mode of
2207 dest depends on how dest is used: if dest is assigned to a
2208 pointer (e.g. in the memory), it has SImode; it may have
2209 DImode if dest is dereferenced to access the memory.
2210 This is why we have to handle three different ldr_got_small
2211 patterns here (two patterns for ILP32). */
2216 machine_mode mode
= GET_MODE (dest
);
2218 if (can_create_pseudo_p ())
2219 tmp_reg
= gen_reg_rtx (mode
);
2221 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
2222 if (mode
== ptr_mode
)
2225 insn
= gen_ldr_got_small_di (dest
, tmp_reg
, imm
);
2227 insn
= gen_ldr_got_small_si (dest
, tmp_reg
, imm
);
2229 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
2233 gcc_assert (mode
== Pmode
);
2235 insn
= gen_ldr_got_small_sidi (dest
, tmp_reg
, imm
);
2236 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
2239 gcc_assert (GET_CODE (mem
) == MEM
);
2240 MEM_READONLY_P (mem
) = 1;
2241 MEM_NOTRAP_P (mem
) = 1;
2246 case SYMBOL_SMALL_TLSGD
:
2249 machine_mode mode
= GET_MODE (dest
);
2250 rtx result
= gen_rtx_REG (mode
, R0_REGNUM
);
2254 aarch64_emit_call_insn (gen_tlsgd_small_si (result
, imm
));
2256 aarch64_emit_call_insn (gen_tlsgd_small_di (result
, imm
));
2257 insns
= get_insns ();
2260 RTL_CONST_CALL_P (insns
) = 1;
2261 emit_libcall_block (insns
, dest
, result
, imm
);
2265 case SYMBOL_SMALL_TLSDESC
:
2267 machine_mode mode
= GET_MODE (dest
);
2268 rtx x0
= gen_rtx_REG (mode
, R0_REGNUM
);
2271 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
2273 /* In ILP32, the got entry is always of SImode size. Unlike
2274 small GOT, the dest is fixed at reg 0. */
2276 emit_insn (gen_tlsdesc_small_si (imm
));
2278 emit_insn (gen_tlsdesc_small_di (imm
));
2279 tp
= aarch64_load_tp (NULL
);
2282 tp
= gen_lowpart (mode
, tp
);
2284 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, x0
)));
2286 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2290 case SYMBOL_SMALL_TLSIE
:
2292 /* In ILP32, the mode of dest can be either SImode or DImode,
2293 while the got entry is always of SImode size. The mode of
2294 dest depends on how dest is used: if dest is assigned to a
2295 pointer (e.g. in the memory), it has SImode; it may have
2296 DImode if dest is dereferenced to access the memory.
2297 This is why we have to handle three different tlsie_small
2298 patterns here (two patterns for ILP32). */
2299 machine_mode mode
= GET_MODE (dest
);
2300 rtx tmp_reg
= gen_reg_rtx (mode
);
2301 rtx tp
= aarch64_load_tp (NULL
);
2303 if (mode
== ptr_mode
)
2306 emit_insn (gen_tlsie_small_di (tmp_reg
, imm
));
2309 emit_insn (gen_tlsie_small_si (tmp_reg
, imm
));
2310 tp
= gen_lowpart (mode
, tp
);
2315 gcc_assert (mode
== Pmode
);
2316 emit_insn (gen_tlsie_small_sidi (tmp_reg
, imm
));
2319 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, tmp_reg
)));
2321 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2325 case SYMBOL_TLSLE12
:
2326 case SYMBOL_TLSLE24
:
2327 case SYMBOL_TLSLE32
:
2328 case SYMBOL_TLSLE48
:
2330 machine_mode mode
= GET_MODE (dest
);
2331 rtx tp
= aarch64_load_tp (NULL
);
2334 tp
= gen_lowpart (mode
, tp
);
2338 case SYMBOL_TLSLE12
:
2339 emit_insn ((mode
== DImode
? gen_tlsle12_di
: gen_tlsle12_si
)
2342 case SYMBOL_TLSLE24
:
2343 emit_insn ((mode
== DImode
? gen_tlsle24_di
: gen_tlsle24_si
)
2346 case SYMBOL_TLSLE32
:
2347 emit_insn ((mode
== DImode
? gen_tlsle32_di
: gen_tlsle32_si
)
2349 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
2352 case SYMBOL_TLSLE48
:
2353 emit_insn ((mode
== DImode
? gen_tlsle48_di
: gen_tlsle48_si
)
2355 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
2363 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2367 case SYMBOL_TINY_GOT
:
2368 emit_insn (gen_ldr_got_tiny (dest
, imm
));
2371 case SYMBOL_TINY_TLSIE
:
2373 machine_mode mode
= GET_MODE (dest
);
2374 rtx tp
= aarch64_load_tp (NULL
);
2376 if (mode
== ptr_mode
)
2379 emit_insn (gen_tlsie_tiny_di (dest
, imm
, tp
));
2382 tp
= gen_lowpart (mode
, tp
);
2383 emit_insn (gen_tlsie_tiny_si (dest
, imm
, tp
));
2388 gcc_assert (mode
== Pmode
);
2389 emit_insn (gen_tlsie_tiny_sidi (dest
, imm
, tp
));
2393 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2402 /* Emit a move from SRC to DEST. Assume that the move expanders can
2403 handle all moves if !can_create_pseudo_p (). The distinction is
2404 important because, unlike emit_move_insn, the move expanders know
2405 how to force Pmode objects into the constant pool even when the
2406 constant pool address is not itself legitimate. */
2408 aarch64_emit_move (rtx dest
, rtx src
)
2410 return (can_create_pseudo_p ()
2411 ? emit_move_insn (dest
, src
)
2412 : emit_move_insn_1 (dest
, src
));
2415 /* Apply UNOPTAB to OP and store the result in DEST. */
/* expand_unop may return a register other than DEST, in which case the
   result is copied over (the visible move is conditional in the full
   source).  */
2418 aarch64_emit_unop (rtx dest
, optab unoptab
, rtx op
)
2420 rtx tmp
= expand_unop (GET_MODE (dest
), unoptab
, op
, dest
, 0);
2422 emit_move_insn (dest
, tmp
);
2425 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
/* Same pattern as aarch64_emit_unop: copy back only when the expander
   chose a different result register.  */
2428 aarch64_emit_binop (rtx dest
, optab binoptab
, rtx op0
, rtx op1
)
2430 rtx tmp
= expand_binop (GET_MODE (dest
), binoptab
, op0
, op1
, dest
, 0,
2433 emit_move_insn (dest
, tmp
)
;
2436 /* Split a 128-bit move operation into two 64-bit move operations,
2437 taking care to handle partial overlap of register to register
2438 copies. Special cases are needed when moving between GP regs and
2439 FP regs. SRC can be a register, constant or memory; DST a register
2440 or memory. If either operand is memory it must not have any side
2443 aarch64_split_128bit_move (rtx dst
, rtx src
)
2448 machine_mode mode
= GET_MODE (dst
);
2450 gcc_assert (mode
== TImode
|| mode
== TFmode
);
2451 gcc_assert (!(side_effects_p (src
) || side_effects_p (dst
)));
2452 gcc_assert (mode
== GET_MODE (src
) || GET_MODE (src
) == VOIDmode
);
2454 if (REG_P (dst
) && REG_P (src
))
2456 int src_regno
= REGNO (src
);
2457 int dst_regno
= REGNO (dst
);
2459 /* Handle FP <-> GP regs. */
/* GP -> FP: insert each 64-bit half into the vector register.  */
2460 if (FP_REGNUM_P (dst_regno
) && GP_REGNUM_P (src_regno
))
2462 src_lo
= gen_lowpart (word_mode
, src
);
2463 src_hi
= gen_highpart (word_mode
, src
);
2465 emit_insn (gen_aarch64_movlow_di (mode
, dst
, src_lo
));
2466 emit_insn (gen_aarch64_movhigh_di (mode
, dst
, src_hi
));
/* FP -> GP: extract each 64-bit half from the vector register.  */
2469 else if (GP_REGNUM_P (dst_regno
) && FP_REGNUM_P (src_regno
))
2471 dst_lo
= gen_lowpart (word_mode
, dst
);
2472 dst_hi
= gen_highpart (word_mode
, dst
);
2474 emit_insn (gen_aarch64_movdi_low (mode
, dst_lo
, src
));
2475 emit_insn (gen_aarch64_movdi_high (mode
, dst_hi
, src
));
/* Generic case: split into two word_mode moves, ordering them so an
   overlapping register pair is not clobbered before it is read.  */
2480 dst_lo
= gen_lowpart (word_mode
, dst
);
2481 dst_hi
= gen_highpart (word_mode
, dst
);
2482 src_lo
= gen_lowpart (word_mode
, src
);
2483 src_hi
= gen_highpart_mode (word_mode
, mode
, src
);
2485 /* At most one pairing may overlap. */
2486 if (reg_overlap_mentioned_p (dst_lo
, src_hi
))
2488 aarch64_emit_move (dst_hi
, src_hi
);
2489 aarch64_emit_move (dst_lo
, src_lo
);
2493 aarch64_emit_move (dst_lo
, src_lo
);
2494 aarch64_emit_move (dst_hi
, src_hi
);
2499 aarch64_split_128bit_move_p (rtx dst
, rtx src
)
2501 return (! REG_P (src
)
2502 || ! (FP_REGNUM_P (REGNO (dst
)) && FP_REGNUM_P (REGNO (src
))));
2505 /* Split a complex SIMD combine. */
2508 aarch64_split_simd_combine (rtx dst
, rtx src1
, rtx src2
)
2510 machine_mode src_mode
= GET_MODE (src1
);
2511 machine_mode dst_mode
= GET_MODE (dst
);
2513 gcc_assert (VECTOR_MODE_P (dst_mode
));
2514 gcc_assert (register_operand (dst
, dst_mode
)
2515 && register_operand (src1
, src_mode
)
2516 && register_operand (src2
, src_mode
));
2518 emit_insn (gen_aarch64_simd_combine (src_mode
, dst
, src1
, src2
));
2522 /* Split a complex SIMD move. */
2525 aarch64_split_simd_move (rtx dst
, rtx src
)
2527 machine_mode src_mode
= GET_MODE (src
);
2528 machine_mode dst_mode
= GET_MODE (dst
);
2530 gcc_assert (VECTOR_MODE_P (dst_mode
));
2532 if (REG_P (dst
) && REG_P (src
))
2534 gcc_assert (VECTOR_MODE_P (src_mode
));
2535 emit_insn (gen_aarch64_split_simd_mov (src_mode
, dst
, src
));
2540 aarch64_zero_extend_const_eq (machine_mode xmode
, rtx x
,
2541 machine_mode ymode
, rtx y
)
2543 rtx r
= simplify_const_unary_operation (ZERO_EXTEND
, xmode
, y
, ymode
);
2544 gcc_assert (r
!= NULL
);
2545 return rtx_equal_p (x
, r
);
2550 aarch64_force_temporary (machine_mode mode
, rtx x
, rtx value
)
2552 if (can_create_pseudo_p ())
2553 return force_reg (mode
, value
);
2557 aarch64_emit_move (x
, value
);
2562 /* Return true if predicate value X is a constant in which every element
2563 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2564 value, i.e. as a predicate in which all bits are significant. */
2567 aarch64_get_sve_pred_bits (rtx_vector_builder
&builder
, rtx x
)
2569 if (GET_CODE (x
) != CONST_VECTOR
)
2572 unsigned int factor
= vector_element_size (GET_MODE_NUNITS (VNx16BImode
),
2573 GET_MODE_NUNITS (GET_MODE (x
)));
2574 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (x
) * factor
;
2575 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (x
);
2576 builder
.new_vector (VNx16BImode
, npatterns
, nelts_per_pattern
);
2578 unsigned int nelts
= const_vector_encoded_nelts (x
);
2579 for (unsigned int i
= 0; i
< nelts
; ++i
)
2581 rtx elt
= CONST_VECTOR_ENCODED_ELT (x
, i
);
2582 if (!CONST_INT_P (elt
))
2585 builder
.quick_push (elt
);
2586 for (unsigned int j
= 1; j
< factor
; ++j
)
2587 builder
.quick_push (const0_rtx
);
2589 builder
.finalize ();
2593 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2594 widest predicate element size it can have (that is, the largest size
2595 for which each element would still be 0 or 1). */
2598 aarch64_widest_sve_pred_elt_size (rtx_vector_builder
&builder
)
2600 /* Start with the most optimistic assumption: that we only need
2601 one bit per pattern. This is what we will use if only the first
2602 bit in each pattern is ever set. */
2603 unsigned int mask
= GET_MODE_SIZE (DImode
);
2604 mask
|= builder
.npatterns ();
2606 /* Look for set bits. */
2607 unsigned int nelts
= builder
.encoded_nelts ();
2608 for (unsigned int i
= 1; i
< nelts
; ++i
)
2609 if (INTVAL (builder
.elt (i
)) != 0)
2615 return mask
& -mask
;
2618 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
2619 that the constant would have with predicate element size ELT_SIZE
2620 (ignoring the upper bits in each element) and return:
2622 * -1 if all bits are set
2623 * N if the predicate has N leading set bits followed by all clear bits
2624 * 0 if the predicate does not have any of these forms. */
2627 aarch64_partial_ptrue_length (rtx_vector_builder
&builder
,
2628 unsigned int elt_size
)
2630 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
2631 followed by set bits. */
2632 if (builder
.nelts_per_pattern () == 3)
2635 /* Skip over leading set bits. */
2636 unsigned int nelts
= builder
.encoded_nelts ();
2638 for (; i
< nelts
; i
+= elt_size
)
2639 if (INTVAL (builder
.elt (i
)) == 0)
2641 unsigned int vl
= i
/ elt_size
;
2643 /* Check for the all-true case. */
2647 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
2648 repeating pattern of set bits followed by clear bits. */
2649 if (builder
.nelts_per_pattern () != 2)
2652 /* We have a "foreground" value and a duplicated "background" value.
2653 If the background might repeat and the last set bit belongs to it,
2654 we might have set bits followed by clear bits followed by set bits. */
2655 if (i
> builder
.npatterns () && maybe_ne (nelts
, builder
.full_nelts ()))
2658 /* Make sure that the rest are all clear. */
2659 for (; i
< nelts
; i
+= elt_size
)
2660 if (INTVAL (builder
.elt (i
)) != 0)
2666 /* See if there is an svpattern that encodes an SVE predicate of mode
2667 PRED_MODE in which the first VL bits are set and the rest are clear.
2668 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
2669 A VL of -1 indicates an all-true vector. */
2672 aarch64_svpattern_for_vl (machine_mode pred_mode
, int vl
)
2675 return AARCH64_SV_ALL
;
2677 if (maybe_gt (vl
, GET_MODE_NUNITS (pred_mode
)))
2678 return AARCH64_NUM_SVPATTERNS
;
2680 if (vl
>= 1 && vl
<= 8)
2681 return aarch64_svpattern (AARCH64_SV_VL1
+ (vl
- 1));
2683 if (vl
>= 16 && vl
<= 256 && pow2p_hwi (vl
))
2684 return aarch64_svpattern (AARCH64_SV_VL16
+ (exact_log2 (vl
) - 4));
2687 if (GET_MODE_NUNITS (pred_mode
).is_constant (&max_vl
))
2689 if (vl
== (max_vl
/ 3) * 3)
2690 return AARCH64_SV_MUL3
;
2691 /* These would only trigger for non-power-of-2 lengths. */
2692 if (vl
== (max_vl
& -4))
2693 return AARCH64_SV_MUL4
;
2694 if (vl
== (1 << floor_log2 (max_vl
)))
2695 return AARCH64_SV_POW2
;
2697 return AARCH64_SV_ALL
;
2699 return AARCH64_NUM_SVPATTERNS
;
2702 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
2703 bits has the lowest bit set and the upper bits clear. This is the
2704 VNx16BImode equivalent of a PTRUE for controlling elements of
2705 ELT_SIZE bytes. However, because the constant is VNx16BImode,
2706 all bits are significant, even the upper zeros. */
2709 aarch64_ptrue_all (unsigned int elt_size
)
2711 rtx_vector_builder
builder (VNx16BImode
, elt_size
, 1);
2712 builder
.quick_push (const1_rtx
);
2713 for (unsigned int i
= 1; i
< elt_size
; ++i
)
2714 builder
.quick_push (const0_rtx
);
2715 return builder
.build ();
2718 /* Return an all-true predicate register of mode MODE. */
2721 aarch64_ptrue_reg (machine_mode mode
)
2723 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
);
2724 return force_reg (mode
, CONSTM1_RTX (mode
));
2727 /* Return an all-false predicate register of mode MODE. */
2730 aarch64_pfalse_reg (machine_mode mode
)
2732 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
);
2733 return force_reg (mode
, CONST0_RTX (mode
));
2736 /* Return true if we can move VALUE into a register using a single
2737 CNT[BHWD] instruction. */
2740 aarch64_sve_cnt_immediate_p (poly_int64 value
)
2742 HOST_WIDE_INT factor
= value
.coeffs
[0];
2743 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2744 return (value
.coeffs
[1] == factor
2745 && IN_RANGE (factor
, 2, 16 * 16)
2746 && (factor
& 1) == 0
2747 && factor
<= 16 * (factor
& -factor
));
2750 /* Likewise for rtx X. */
2753 aarch64_sve_cnt_immediate_p (rtx x
)
2756 return poly_int_rtx_p (x
, &value
) && aarch64_sve_cnt_immediate_p (value
);
2759 /* Return the asm string for an instruction with a CNT-like vector size
2760 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2761 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2762 first part of the operands template (the part that comes before the
2763 vector size itself). FACTOR is the number of quadwords.
2764 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2765 If it is zero, we can use any element size. */
2768 aarch64_output_sve_cnt_immediate (const char *prefix
, const char *operands
,
2769 unsigned int factor
,
2770 unsigned int nelts_per_vq
)
2772 static char buffer
[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2774 if (nelts_per_vq
== 0)
2775 /* There is some overlap in the ranges of the four CNT instructions.
2776 Here we always use the smallest possible element size, so that the
2777 multiplier is 1 whereever possible. */
2778 nelts_per_vq
= factor
& -factor
;
2779 int shift
= std::min (exact_log2 (nelts_per_vq
), 4);
2780 gcc_assert (IN_RANGE (shift
, 1, 4));
2781 char suffix
= "dwhb"[shift
- 1];
2784 unsigned int written
;
2786 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s",
2787 prefix
, suffix
, operands
);
2789 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s, all, mul #%d",
2790 prefix
, suffix
, operands
, factor
);
2791 gcc_assert (written
< sizeof (buffer
));
2795 /* Return the asm string for an instruction with a CNT-like vector size
2796 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2797 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2798 first part of the operands template (the part that comes before the
2799 vector size itself). X is the value of the vector size operand,
2800 as a polynomial integer rtx. */
2803 aarch64_output_sve_cnt_immediate (const char *prefix
, const char *operands
,
2806 poly_int64 value
= rtx_to_poly_int64 (x
);
2807 gcc_assert (aarch64_sve_cnt_immediate_p (value
));
2808 return aarch64_output_sve_cnt_immediate (prefix
, operands
,
2809 value
.coeffs
[1], 0);
2812 /* Return true if we can add VALUE to a register using a single ADDVL
2813 or ADDPL instruction. */
2816 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value
)
2818 HOST_WIDE_INT factor
= value
.coeffs
[0];
2819 if (factor
== 0 || value
.coeffs
[1] != factor
)
2821 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2822 and a value of 16 is one vector width. */
2823 return (((factor
& 15) == 0 && IN_RANGE (factor
, -32 * 16, 31 * 16))
2824 || ((factor
& 1) == 0 && IN_RANGE (factor
, -32 * 2, 31 * 2)));
2827 /* Likewise for rtx X. */
2830 aarch64_sve_addvl_addpl_immediate_p (rtx x
)
2833 return (poly_int_rtx_p (x
, &value
)
2834 && aarch64_sve_addvl_addpl_immediate_p (value
));
2837 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2838 and storing the result in operand 0. */
2841 aarch64_output_sve_addvl_addpl (rtx dest
, rtx base
, rtx offset
)
2843 static char buffer
[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2844 poly_int64 offset_value
= rtx_to_poly_int64 (offset
);
2845 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value
));
2847 /* Use INC or DEC if possible. */
2848 if (rtx_equal_p (dest
, base
) && GP_REGNUM_P (REGNO (dest
)))
2850 if (aarch64_sve_cnt_immediate_p (offset_value
))
2851 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2852 offset_value
.coeffs
[1], 0);
2853 if (aarch64_sve_cnt_immediate_p (-offset_value
))
2854 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2855 -offset_value
.coeffs
[1], 0);
2858 int factor
= offset_value
.coeffs
[1];
2859 if ((factor
& 15) == 0)
2860 snprintf (buffer
, sizeof (buffer
), "addvl\t%%x0, %%x1, #%d", factor
/ 16);
2862 snprintf (buffer
, sizeof (buffer
), "addpl\t%%x0, %%x1, #%d", factor
/ 2);
2866 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2867 instruction. If it is, store the number of elements in each vector
2868 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2869 factor in *FACTOR_OUT (if nonnull). */
2872 aarch64_sve_inc_dec_immediate_p (rtx x
, int *factor_out
,
2873 unsigned int *nelts_per_vq_out
)
2878 if (!const_vec_duplicate_p (x
, &elt
)
2879 || !poly_int_rtx_p (elt
, &value
))
2882 unsigned int nelts_per_vq
= 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x
));
2883 if (nelts_per_vq
!= 8 && nelts_per_vq
!= 4 && nelts_per_vq
!= 2)
2884 /* There's no vector INCB. */
2887 HOST_WIDE_INT factor
= value
.coeffs
[0];
2888 if (value
.coeffs
[1] != factor
)
2891 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2892 if ((factor
% nelts_per_vq
) != 0
2893 || !IN_RANGE (abs (factor
), nelts_per_vq
, 16 * nelts_per_vq
))
2897 *factor_out
= factor
;
2898 if (nelts_per_vq_out
)
2899 *nelts_per_vq_out
= nelts_per_vq
;
2903 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2907 aarch64_sve_inc_dec_immediate_p (rtx x
)
2909 return aarch64_sve_inc_dec_immediate_p (x
, NULL
, NULL
);
2912 /* Return the asm template for an SVE vector INC or DEC instruction.
2913 OPERANDS gives the operands before the vector count and X is the
2914 value of the vector count operand itself. */
2917 aarch64_output_sve_inc_dec_immediate (const char *operands
, rtx x
)
2920 unsigned int nelts_per_vq
;
2921 if (!aarch64_sve_inc_dec_immediate_p (x
, &factor
, &nelts_per_vq
))
2924 return aarch64_output_sve_cnt_immediate ("dec", operands
, -factor
,
2927 return aarch64_output_sve_cnt_immediate ("inc", operands
, factor
,
2932 aarch64_internal_mov_immediate (rtx dest
, rtx imm
, bool generate
,
2933 scalar_int_mode mode
)
2936 unsigned HOST_WIDE_INT val
, val2
, mask
;
2937 int one_match
, zero_match
;
2942 if (aarch64_move_imm (val
, mode
))
2945 emit_insn (gen_rtx_SET (dest
, imm
));
2949 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2950 (with XXXX non-zero). In that case check to see if the move can be done in
2952 val2
= val
& 0xffffffff;
2954 && aarch64_move_imm (val2
, SImode
)
2955 && (((val
>> 32) & 0xffff) == 0 || (val
>> 48) == 0))
2958 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
2960 /* Check if we have to emit a second instruction by checking to see
2961 if any of the upper 32 bits of the original DI mode value is set. */
2965 i
= (val
>> 48) ? 48 : 32;
2968 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
2969 GEN_INT ((val
>> i
) & 0xffff)));
2974 if ((val
>> 32) == 0 || mode
== SImode
)
2978 emit_insn (gen_rtx_SET (dest
, GEN_INT (val
& 0xffff)));
2980 emit_insn (gen_insv_immsi (dest
, GEN_INT (16),
2981 GEN_INT ((val
>> 16) & 0xffff)));
2983 emit_insn (gen_insv_immdi (dest
, GEN_INT (16),
2984 GEN_INT ((val
>> 16) & 0xffff)));
2989 /* Remaining cases are all for DImode. */
2992 zero_match
= ((val
& mask
) == 0) + ((val
& (mask
<< 16)) == 0) +
2993 ((val
& (mask
<< 32)) == 0) + ((val
& (mask
<< 48)) == 0);
2994 one_match
= ((~val
& mask
) == 0) + ((~val
& (mask
<< 16)) == 0) +
2995 ((~val
& (mask
<< 32)) == 0) + ((~val
& (mask
<< 48)) == 0);
2997 if (zero_match
!= 2 && one_match
!= 2)
2999 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3000 For a 64-bit bitmask try whether changing 16 bits to all ones or
3001 zeroes creates a valid bitmask. To check any repeated bitmask,
3002 try using 16 bits from the other 32-bit half of val. */
3004 for (i
= 0; i
< 64; i
+= 16, mask
<<= 16)
3007 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
3010 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
3012 val2
= val2
& ~mask
;
3013 val2
= val2
| (((val2
>> 32) | (val2
<< 32)) & mask
);
3014 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
3021 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
3022 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
3023 GEN_INT ((val
>> i
) & 0xffff)));
3029 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3030 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3031 otherwise skip zero bits. */
3035 val2
= one_match
> zero_match
? ~val
: val
;
3036 i
= (val2
& mask
) != 0 ? 0 : (val2
& (mask
<< 16)) != 0 ? 16 : 32;
3039 emit_insn (gen_rtx_SET (dest
, GEN_INT (one_match
> zero_match
3040 ? (val
| ~(mask
<< i
))
3041 : (val
& (mask
<< i
)))));
3042 for (i
+= 16; i
< 64; i
+= 16)
3044 if ((val2
& (mask
<< i
)) == 0)
3047 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
3048 GEN_INT ((val
>> i
) & 0xffff)));
3055 /* Return whether imm is a 128-bit immediate which is simple enough to
3058 aarch64_mov128_immediate (rtx imm
)
3060 if (GET_CODE (imm
) == CONST_INT
)
3063 gcc_assert (CONST_WIDE_INT_NUNITS (imm
) == 2);
3065 rtx lo
= GEN_INT (CONST_WIDE_INT_ELT (imm
, 0));
3066 rtx hi
= GEN_INT (CONST_WIDE_INT_ELT (imm
, 1));
3068 return aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, DImode
)
3069 + aarch64_internal_mov_immediate (NULL_RTX
, hi
, false, DImode
) <= 4;
3073 /* Return the number of temporary registers that aarch64_add_offset_1
3074 would need to add OFFSET to a register. */
3077 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset
)
3079 return abs_hwi (offset
) < 0x1000000 ? 0 : 1;
3082 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3083 a non-polynomial OFFSET. MODE is the mode of the addition.
3084 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3085 be set and CFA adjustments added to the generated instructions.
3087 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3088 temporary if register allocation is already complete. This temporary
3089 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3090 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3091 the immediate again.
3093 Since this function may be used to adjust the stack pointer, we must
3094 ensure that it cannot cause transient stack deallocation (for example
3095 by first incrementing SP and then decrementing when adjusting by a
3096 large immediate). */
3099 aarch64_add_offset_1 (scalar_int_mode mode
, rtx dest
,
3100 rtx src
, HOST_WIDE_INT offset
, rtx temp1
,
3101 bool frame_related_p
, bool emit_move_imm
)
3103 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
3104 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
3106 HOST_WIDE_INT moffset
= abs_hwi (offset
);
3111 if (!rtx_equal_p (dest
, src
))
3113 insn
= emit_insn (gen_rtx_SET (dest
, src
));
3114 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3119 /* Single instruction adjustment. */
3120 if (aarch64_uimm12_shift (moffset
))
3122 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (offset
)));
3123 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3127 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3130 a) the offset cannot be loaded by a 16-bit move or
3131 b) there is no spare register into which we can move it. */
3132 if (moffset
< 0x1000000
3133 && ((!temp1
&& !can_create_pseudo_p ())
3134 || !aarch64_move_imm (moffset
, mode
)))
3136 HOST_WIDE_INT low_off
= moffset
& 0xfff;
3138 low_off
= offset
< 0 ? -low_off
: low_off
;
3139 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (low_off
)));
3140 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3141 insn
= emit_insn (gen_add2_insn (dest
, GEN_INT (offset
- low_off
)));
3142 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3146 /* Emit a move immediate if required and an addition/subtraction. */
3149 gcc_assert (temp1
!= NULL_RTX
|| can_create_pseudo_p ());
3150 temp1
= aarch64_force_temporary (mode
, temp1
, GEN_INT (moffset
));
3152 insn
= emit_insn (offset
< 0
3153 ? gen_sub3_insn (dest
, src
, temp1
)
3154 : gen_add3_insn (dest
, src
, temp1
));
3155 if (frame_related_p
)
3157 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3158 rtx adj
= plus_constant (mode
, src
, offset
);
3159 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, gen_rtx_SET (dest
, adj
));
3163 /* Return the number of temporary registers that aarch64_add_offset
3164 would need to move OFFSET into a register or add OFFSET to a register;
3165 ADD_P is true if we want the latter rather than the former. */
3168 aarch64_offset_temporaries (bool add_p
, poly_int64 offset
)
3170 /* This follows the same structure as aarch64_add_offset. */
3171 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
3174 unsigned int count
= 0;
3175 HOST_WIDE_INT factor
= offset
.coeffs
[1];
3176 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
3177 poly_int64
poly_offset (factor
, factor
);
3178 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
3179 /* Need one register for the ADDVL/ADDPL result. */
3181 else if (factor
!= 0)
3183 factor
= abs (factor
);
3184 if (factor
> 16 * (factor
& -factor
))
3185 /* Need one register for the CNT result and one for the multiplication
3186 factor. If necessary, the second temporary can be reused for the
3187 constant part of the offset. */
3189 /* Need one register for the CNT result (which might then
3193 return count
+ aarch64_add_offset_1_temporaries (constant
);
3196 /* If X can be represented as a poly_int64, return the number
3197 of temporaries that are required to add it to a register.
3198 Return -1 otherwise. */
3201 aarch64_add_offset_temporaries (rtx x
)
3204 if (!poly_int_rtx_p (x
, &offset
))
3206 return aarch64_offset_temporaries (true, offset
);
3209 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3210 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3211 be set and CFA adjustments added to the generated instructions.
3213 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3214 temporary if register allocation is already complete. This temporary
3215 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3216 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3217 false to avoid emitting the immediate again.
3219 TEMP2, if nonnull, is a second temporary register that doesn't
3220 overlap either DEST or REG.
3222 Since this function may be used to adjust the stack pointer, we must
3223 ensure that it cannot cause transient stack deallocation (for example
3224 by first incrementing SP and then decrementing when adjusting by a
3225 large immediate). */
3228 aarch64_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
3229 poly_int64 offset
, rtx temp1
, rtx temp2
,
3230 bool frame_related_p
, bool emit_move_imm
= true)
3232 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
3233 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
3234 gcc_assert (temp1
== NULL_RTX
3236 || !reg_overlap_mentioned_p (temp1
, dest
));
3237 gcc_assert (temp2
== NULL_RTX
|| !reg_overlap_mentioned_p (dest
, temp2
));
3239 /* Try using ADDVL or ADDPL to add the whole value. */
3240 if (src
!= const0_rtx
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
3242 rtx offset_rtx
= gen_int_mode (offset
, mode
);
3243 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
3244 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3248 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3249 SVE vector register, over and above the minimum size of 128 bits.
3250 This is equivalent to half the value returned by CNTD with a
3251 vector shape of ALL. */
3252 HOST_WIDE_INT factor
= offset
.coeffs
[1];
3253 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
3255 /* Try using ADDVL or ADDPL to add the VG-based part. */
3256 poly_int64
poly_offset (factor
, factor
);
3257 if (src
!= const0_rtx
3258 && aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
3260 rtx offset_rtx
= gen_int_mode (poly_offset
, mode
);
3261 if (frame_related_p
)
3263 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
3264 RTX_FRAME_RELATED_P (insn
) = true;
3269 rtx addr
= gen_rtx_PLUS (mode
, src
, offset_rtx
);
3270 src
= aarch64_force_temporary (mode
, temp1
, addr
);
3275 /* Otherwise use a CNT-based sequence. */
3276 else if (factor
!= 0)
3278 /* Use a subtraction if we have a negative factor. */
3279 rtx_code code
= PLUS
;
3286 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3287 into the multiplication. */
3291 /* Use a right shift by 1. */
3295 HOST_WIDE_INT low_bit
= factor
& -factor
;
3296 if (factor
<= 16 * low_bit
)
3298 if (factor
> 16 * 8)
3300 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3301 the value with the minimum multiplier and shift it into
3303 int extra_shift
= exact_log2 (low_bit
);
3304 shift
+= extra_shift
;
3305 factor
>>= extra_shift
;
3307 val
= gen_int_mode (poly_int64 (factor
* 2, factor
* 2), mode
);
3311 /* Use CNTD, then multiply it by FACTOR. */
3312 val
= gen_int_mode (poly_int64 (2, 2), mode
);
3313 val
= aarch64_force_temporary (mode
, temp1
, val
);
3315 /* Go back to using a negative multiplication factor if we have
3316 no register from which to subtract. */
3317 if (code
== MINUS
&& src
== const0_rtx
)
3322 rtx coeff1
= gen_int_mode (factor
, mode
);
3323 coeff1
= aarch64_force_temporary (mode
, temp2
, coeff1
);
3324 val
= gen_rtx_MULT (mode
, val
, coeff1
);
3329 /* Multiply by 1 << SHIFT. */
3330 val
= aarch64_force_temporary (mode
, temp1
, val
);
3331 val
= gen_rtx_ASHIFT (mode
, val
, GEN_INT (shift
));
3333 else if (shift
== -1)
3336 val
= aarch64_force_temporary (mode
, temp1
, val
);
3337 val
= gen_rtx_ASHIFTRT (mode
, val
, const1_rtx
);
3340 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3341 if (src
!= const0_rtx
)
3343 val
= aarch64_force_temporary (mode
, temp1
, val
);
3344 val
= gen_rtx_fmt_ee (code
, mode
, src
, val
);
3346 else if (code
== MINUS
)
3348 val
= aarch64_force_temporary (mode
, temp1
, val
);
3349 val
= gen_rtx_NEG (mode
, val
);
3352 if (constant
== 0 || frame_related_p
)
3354 rtx_insn
*insn
= emit_insn (gen_rtx_SET (dest
, val
));
3355 if (frame_related_p
)
3357 RTX_FRAME_RELATED_P (insn
) = true;
3358 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
3359 gen_rtx_SET (dest
, plus_constant (Pmode
, src
,
3368 src
= aarch64_force_temporary (mode
, temp1
, val
);
3373 emit_move_imm
= true;
3376 aarch64_add_offset_1 (mode
, dest
, src
, constant
, temp1
,
3377 frame_related_p
, emit_move_imm
);
3380 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3381 than a poly_int64. */
3384 aarch64_split_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
3385 rtx offset_rtx
, rtx temp1
, rtx temp2
)
3387 aarch64_add_offset (mode
, dest
, src
, rtx_to_poly_int64 (offset_rtx
),
3388 temp1
, temp2
, false);
3391 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3392 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3393 if TEMP1 already contains abs (DELTA). */
3396 aarch64_add_sp (rtx temp1
, rtx temp2
, poly_int64 delta
, bool emit_move_imm
)
3398 aarch64_add_offset (Pmode
, stack_pointer_rtx
, stack_pointer_rtx
, delta
,
3399 temp1
, temp2
, true, emit_move_imm
);
3402 /* Subtract DELTA from the stack pointer, marking the instructions
3403 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3407 aarch64_sub_sp (rtx temp1
, rtx temp2
, poly_int64 delta
, bool frame_related_p
,
3408 bool emit_move_imm
= true)
3410 aarch64_add_offset (Pmode
, stack_pointer_rtx
, stack_pointer_rtx
, -delta
,
3411 temp1
, temp2
, frame_related_p
, emit_move_imm
);
3414 /* Set DEST to (vec_series BASE STEP). */
3417 aarch64_expand_vec_series (rtx dest
, rtx base
, rtx step
)
3419 machine_mode mode
= GET_MODE (dest
);
3420 scalar_mode inner
= GET_MODE_INNER (mode
);
3422 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3423 if (!aarch64_sve_index_immediate_p (base
))
3424 base
= force_reg (inner
, base
);
3425 if (!aarch64_sve_index_immediate_p (step
))
3426 step
= force_reg (inner
, step
);
3428 emit_set_insn (dest
, gen_rtx_VEC_SERIES (mode
, base
, step
));
3431 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
3432 register of mode MODE. Use TARGET for the result if it's nonnull
3435 The two vector modes must have the same element mode. The behavior
3436 is to duplicate architectural lane N of SRC into architectural lanes
3437 N + I * STEP of the result. On big-endian targets, architectural
3438 lane 0 of an Advanced SIMD vector is the last element of the vector
3439 in memory layout, so for big-endian targets this operation has the
3440 effect of reversing SRC before duplicating it. Callers need to
3441 account for this. */
3444 aarch64_expand_sve_dupq (rtx target
, machine_mode mode
, rtx src
)
3446 machine_mode src_mode
= GET_MODE (src
);
3447 gcc_assert (GET_MODE_INNER (mode
) == GET_MODE_INNER (src_mode
));
3448 insn_code icode
= (BYTES_BIG_ENDIAN
3449 ? code_for_aarch64_vec_duplicate_vq_be (mode
)
3450 : code_for_aarch64_vec_duplicate_vq_le (mode
));
3453 expand_operand ops
[3];
3454 create_output_operand (&ops
[i
++], target
, mode
);
3455 create_output_operand (&ops
[i
++], src
, src_mode
);
3456 if (BYTES_BIG_ENDIAN
)
3458 /* Create a PARALLEL describing the reversal of SRC. */
3459 unsigned int nelts_per_vq
= 128 / GET_MODE_UNIT_BITSIZE (mode
);
3460 rtx sel
= aarch64_gen_stepped_int_parallel (nelts_per_vq
,
3461 nelts_per_vq
- 1, -1);
3462 create_fixed_operand (&ops
[i
++], sel
);
3464 expand_insn (icode
, i
, ops
);
3465 return ops
[0].value
;
3468 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
3469 the memory image into DEST. Return true on success. */
3472 aarch64_expand_sve_ld1rq (rtx dest
, rtx src
)
3474 src
= force_const_mem (GET_MODE (src
), src
);
3478 /* Make sure that the address is legitimate. */
3479 if (!aarch64_sve_ld1rq_operand_p (src
))
3481 rtx addr
= force_reg (Pmode
, XEXP (src
, 0));
3482 src
= replace_equiv_address (src
, addr
);
3485 machine_mode mode
= GET_MODE (dest
);
3486 unsigned int elem_bytes
= GET_MODE_UNIT_SIZE (mode
);
3487 machine_mode pred_mode
= aarch64_sve_pred_mode (elem_bytes
).require ();
3488 rtx ptrue
= aarch64_ptrue_reg (pred_mode
);
3489 emit_insn (gen_aarch64_sve_ld1rq (mode
, dest
, src
, ptrue
));
3493 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
3494 SVE data mode and isn't a legitimate constant. Use TARGET for the
3495 result if convenient.
3497 The returned register can have whatever mode seems most natural
3498 given the contents of SRC. */
3501 aarch64_expand_sve_const_vector (rtx target
, rtx src
)
3503 machine_mode mode
= GET_MODE (src
);
3504 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (src
);
3505 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (src
);
3506 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
3507 unsigned int elt_bits
= GET_MODE_BITSIZE (elt_mode
);
3508 unsigned int encoded_bits
= npatterns
* nelts_per_pattern
* elt_bits
;
3510 if (nelts_per_pattern
== 1 && encoded_bits
== 128)
3512 /* The constant is a duplicated quadword but can't be narrowed
3513 beyond a quadword. Get the memory image of the first quadword
3514 as a 128-bit vector and try using LD1RQ to load it from memory.
3516 The effect for both endiannesses is to load memory lane N into
3517 architectural lanes N + I * STEP of the result. On big-endian
3518 targets, the layout of the 128-bit vector in an Advanced SIMD
3519 register would be different from its layout in an SVE register,
3520 but this 128-bit vector is a memory value only. */
3521 machine_mode vq_mode
= aarch64_vq_mode (elt_mode
).require ();
3522 rtx vq_value
= simplify_gen_subreg (vq_mode
, src
, mode
, 0);
3523 if (vq_value
&& aarch64_expand_sve_ld1rq (target
, vq_value
))
3527 if (nelts_per_pattern
== 1 && encoded_bits
< 128)
3529 /* The vector is a repeating sequence of 64 bits or fewer.
3530 See if we can load them using an Advanced SIMD move and then
3531 duplicate it to fill a vector. This is better than using a GPR
3532 move because it keeps everything in the same register file. */
3533 machine_mode vq_mode
= aarch64_vq_mode (elt_mode
).require ();
3534 rtx_vector_builder
builder (vq_mode
, npatterns
, 1);
3535 for (unsigned int i
= 0; i
< npatterns
; ++i
)
3537 /* We want memory lane N to go into architectural lane N,
3538 so reverse for big-endian targets. The DUP .Q pattern
3539 has a compensating reverse built-in. */
3540 unsigned int srci
= BYTES_BIG_ENDIAN
? npatterns
- i
- 1 : i
;
3541 builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, srci
));
3543 rtx vq_src
= builder
.build ();
3544 if (aarch64_simd_valid_immediate (vq_src
, NULL
))
3546 vq_src
= force_reg (vq_mode
, vq_src
);
3547 return aarch64_expand_sve_dupq (target
, mode
, vq_src
);
3550 /* Get an integer representation of the repeating part of Advanced
3551 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
3552 which for big-endian targets is lane-swapped wrt a normal
3553 Advanced SIMD vector. This means that for both endiannesses,
3554 memory lane N of SVE vector SRC corresponds to architectural
3555 lane N of a register holding VQ_SRC. This in turn means that
3556 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
3557 as a single 128-bit value) and thus that memory lane 0 of SRC is
3558 in the lsb of the integer. Duplicating the integer therefore
3559 ensures that memory lane N of SRC goes into architectural lane
3560 N + I * INDEX of the SVE register. */
3561 scalar_mode int_mode
= int_mode_for_size (encoded_bits
, 0).require ();
3562 rtx elt_value
= simplify_gen_subreg (int_mode
, vq_src
, vq_mode
, 0);
3565 /* Pretend that we had a vector of INT_MODE to start with. */
3566 elt_mode
= int_mode
;
3567 mode
= aarch64_full_sve_mode (int_mode
).require ();
3569 /* If the integer can be moved into a general register by a
3570 single instruction, do that and duplicate the result. */
3571 if (CONST_INT_P (elt_value
)
3572 && aarch64_move_imm (INTVAL (elt_value
), elt_mode
))
3574 elt_value
= force_reg (elt_mode
, elt_value
);
3575 return expand_vector_broadcast (mode
, elt_value
);
3578 else if (npatterns
== 1)
3579 /* We're duplicating a single value, but can't do better than
3580 force it to memory and load from there. This handles things
3581 like symbolic constants. */
3582 elt_value
= CONST_VECTOR_ENCODED_ELT (src
, 0);
3586 /* Load the element from memory if we can, otherwise move it into
3587 a register and use a DUP. */
3588 rtx op
= force_const_mem (elt_mode
, elt_value
);
3590 op
= force_reg (elt_mode
, elt_value
);
3591 return expand_vector_broadcast (mode
, op
);
3595 /* Try using INDEX. */
3597 if (const_vec_series_p (src
, &base
, &step
))
3599 aarch64_expand_vec_series (target
, base
, step
);
3603 /* From here on, it's better to force the whole constant to memory
3605 if (GET_MODE_NUNITS (mode
).is_constant ())
3608 /* Expand each pattern individually. */
3609 gcc_assert (npatterns
> 1);
3610 rtx_vector_builder builder
;
3611 auto_vec
<rtx
, 16> vectors (npatterns
);
3612 for (unsigned int i
= 0; i
< npatterns
; ++i
)
3614 builder
.new_vector (mode
, 1, nelts_per_pattern
);
3615 for (unsigned int j
= 0; j
< nelts_per_pattern
; ++j
)
3616 builder
.quick_push (CONST_VECTOR_ELT (src
, i
+ j
* npatterns
));
3617 vectors
.quick_push (force_reg (mode
, builder
.build ()));
3620 /* Use permutes to interleave the separate vectors. */
3621 while (npatterns
> 1)
3624 for (unsigned int i
= 0; i
< npatterns
; ++i
)
3626 rtx tmp
= (npatterns
== 1 ? target
: gen_reg_rtx (mode
));
3627 rtvec v
= gen_rtvec (2, vectors
[i
], vectors
[i
+ npatterns
]);
3628 emit_set_insn (tmp
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
3632 gcc_assert (vectors
[0] == target
);
3636 /* Use WHILE to set predicate register DEST so that the first VL bits
3637 are set and the rest are clear. */
3640 aarch64_sve_move_pred_via_while (rtx dest
, unsigned int vl
)
3642 rtx limit
= force_reg (DImode
, gen_int_mode (vl
, DImode
));
3643 emit_insn (gen_while_ult (DImode
, GET_MODE (dest
),
3644 dest
, const0_rtx
, limit
));
3647 /* Set DEST to immediate IMM. */
3650 aarch64_expand_mov_immediate (rtx dest
, rtx imm
)
3652 machine_mode mode
= GET_MODE (dest
);
3654 /* Check on what type of symbol it is. */
3655 scalar_int_mode int_mode
;
3656 if ((GET_CODE (imm
) == SYMBOL_REF
3657 || GET_CODE (imm
) == LABEL_REF
3658 || GET_CODE (imm
) == CONST
3659 || GET_CODE (imm
) == CONST_POLY_INT
)
3660 && is_a
<scalar_int_mode
> (mode
, &int_mode
))
3664 HOST_WIDE_INT const_offset
;
3665 enum aarch64_symbol_type sty
;
3667 /* If we have (const (plus symbol offset)), separate out the offset
3668 before we start classifying the symbol. */
3669 rtx base
= strip_offset (imm
, &offset
);
3671 /* We must always add an offset involving VL separately, rather than
3672 folding it into the relocation. */
3673 if (!offset
.is_constant (&const_offset
))
3675 if (base
== const0_rtx
&& aarch64_sve_cnt_immediate_p (offset
))
3676 emit_insn (gen_rtx_SET (dest
, imm
));
3679 /* Do arithmetic on 32-bit values if the result is smaller
3681 if (partial_subreg_p (int_mode
, SImode
))
3683 /* It is invalid to do symbol calculations in modes
3684 narrower than SImode. */
3685 gcc_assert (base
== const0_rtx
);
3686 dest
= gen_lowpart (SImode
, dest
);
3689 if (base
!= const0_rtx
)
3691 base
= aarch64_force_temporary (int_mode
, dest
, base
);
3692 aarch64_add_offset (int_mode
, dest
, base
, offset
,
3693 NULL_RTX
, NULL_RTX
, false);
3696 aarch64_add_offset (int_mode
, dest
, base
, offset
,
3697 dest
, NULL_RTX
, false);
3702 sty
= aarch64_classify_symbol (base
, const_offset
);
3705 case SYMBOL_FORCE_TO_MEM
:
3706 if (const_offset
!= 0
3707 && targetm
.cannot_force_const_mem (int_mode
, imm
))
3709 gcc_assert (can_create_pseudo_p ());
3710 base
= aarch64_force_temporary (int_mode
, dest
, base
);
3711 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
3712 NULL_RTX
, NULL_RTX
, false);
3716 mem
= force_const_mem (ptr_mode
, imm
);
3719 /* If we aren't generating PC relative literals, then
3720 we need to expand the literal pool access carefully.
3721 This is something that needs to be done in a number
3722 of places, so could well live as a separate function. */
3723 if (!aarch64_pcrelative_literal_loads
)
3725 gcc_assert (can_create_pseudo_p ());
3726 base
= gen_reg_rtx (ptr_mode
);
3727 aarch64_expand_mov_immediate (base
, XEXP (mem
, 0));
3728 if (ptr_mode
!= Pmode
)
3729 base
= convert_memory_address (Pmode
, base
);
3730 mem
= gen_rtx_MEM (ptr_mode
, base
);
3733 if (int_mode
!= ptr_mode
)
3734 mem
= gen_rtx_ZERO_EXTEND (int_mode
, mem
);
3736 emit_insn (gen_rtx_SET (dest
, mem
));
3740 case SYMBOL_SMALL_TLSGD
:
3741 case SYMBOL_SMALL_TLSDESC
:
3742 case SYMBOL_SMALL_TLSIE
:
3743 case SYMBOL_SMALL_GOT_28K
:
3744 case SYMBOL_SMALL_GOT_4G
:
3745 case SYMBOL_TINY_GOT
:
3746 case SYMBOL_TINY_TLSIE
:
3747 if (const_offset
!= 0)
3749 gcc_assert(can_create_pseudo_p ());
3750 base
= aarch64_force_temporary (int_mode
, dest
, base
);
3751 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
3752 NULL_RTX
, NULL_RTX
, false);
3757 case SYMBOL_SMALL_ABSOLUTE
:
3758 case SYMBOL_TINY_ABSOLUTE
:
3759 case SYMBOL_TLSLE12
:
3760 case SYMBOL_TLSLE24
:
3761 case SYMBOL_TLSLE32
:
3762 case SYMBOL_TLSLE48
:
3763 aarch64_load_symref_appropriately (dest
, imm
, sty
);
3771 if (!CONST_INT_P (imm
))
3773 if (GET_CODE (imm
) == HIGH
3774 || aarch64_simd_valid_immediate (imm
, NULL
))
3776 emit_insn (gen_rtx_SET (dest
, imm
));
3780 rtx_vector_builder builder
;
3781 if (GET_MODE_CLASS (GET_MODE (imm
)) == MODE_VECTOR_BOOL
3782 && aarch64_get_sve_pred_bits (builder
, imm
))
3784 unsigned int elt_size
= aarch64_widest_sve_pred_elt_size (builder
);
3785 int vl
= aarch64_partial_ptrue_length (builder
, elt_size
);
3788 aarch64_sve_move_pred_via_while (dest
, vl
);
3793 if (GET_CODE (imm
) == CONST_VECTOR
&& aarch64_sve_data_mode_p (mode
))
3794 if (rtx res
= aarch64_expand_sve_const_vector (dest
, imm
))
3797 emit_insn (gen_aarch64_sve_reinterpret (mode
, dest
, res
));
3801 rtx mem
= force_const_mem (mode
, imm
);
3803 emit_move_insn (dest
, mem
);
3807 aarch64_internal_mov_immediate (dest
, imm
, true,
3808 as_a
<scalar_int_mode
> (mode
));
3811 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3812 that is known to contain PTRUE. */
3815 aarch64_emit_sve_pred_move (rtx dest
, rtx pred
, rtx src
)
3817 expand_operand ops
[3];
3818 machine_mode mode
= GET_MODE (dest
);
3819 create_output_operand (&ops
[0], dest
, mode
);
3820 create_input_operand (&ops
[1], pred
, GET_MODE(pred
));
3821 create_input_operand (&ops
[2], src
, mode
);
3822 temporary_volatile_ok
v (true);
3823 expand_insn (code_for_aarch64_pred_mov (mode
), 3, ops
);
3826 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3827 operand is in memory. In this case we need to use the predicated LD1
3828 and ST1 instead of LDR and STR, both for correctness on big-endian
3829 targets and because LD1 and ST1 support a wider range of addressing modes.
3830 PRED_MODE is the mode of the predicate.
3832 See the comment at the head of aarch64-sve.md for details about the
3833 big-endian handling. */
3836 aarch64_expand_sve_mem_move (rtx dest
, rtx src
, machine_mode pred_mode
)
3838 machine_mode mode
= GET_MODE (dest
);
3839 rtx ptrue
= aarch64_ptrue_reg (pred_mode
);
3840 if (!register_operand (src
, mode
)
3841 && !register_operand (dest
, mode
))
3843 rtx tmp
= gen_reg_rtx (mode
);
3845 aarch64_emit_sve_pred_move (tmp
, ptrue
, src
);
3847 emit_move_insn (tmp
, src
);
3850 aarch64_emit_sve_pred_move (dest
, ptrue
, src
);
3853 /* Called only on big-endian targets. See whether an SVE vector move
3854 from SRC to DEST is effectively a REV[BHW] instruction, because at
3855 least one operand is a subreg of an SVE vector that has wider or
3856 narrower elements. Return true and emit the instruction if so.
3860 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3862 represents a VIEW_CONVERT between the following vectors, viewed
3865 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3866 R1: { [0], [1], [2], [3], ... }
3868 The high part of lane X in R2 should therefore correspond to lane X*2
3869 of R1, but the register representations are:
3872 R2: ...... [1].high [1].low [0].high [0].low
3873 R1: ...... [3] [2] [1] [0]
3875 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3876 We therefore need a reverse operation to swap the high and low values
3879 This is purely an optimization. Without it we would spill the
3880 subreg operand to the stack in one mode and reload it in the
3881 other mode, which has the same effect as the REV. */
3884 aarch64_maybe_expand_sve_subreg_move (rtx dest
, rtx src
)
3886 gcc_assert (BYTES_BIG_ENDIAN
);
3887 if (GET_CODE (dest
) == SUBREG
)
3888 dest
= SUBREG_REG (dest
);
3889 if (GET_CODE (src
) == SUBREG
)
3890 src
= SUBREG_REG (src
);
3892 /* The optimization handles two single SVE REGs with different element
3896 || aarch64_classify_vector_mode (GET_MODE (dest
)) != VEC_SVE_DATA
3897 || aarch64_classify_vector_mode (GET_MODE (src
)) != VEC_SVE_DATA
3898 || (GET_MODE_UNIT_SIZE (GET_MODE (dest
))
3899 == GET_MODE_UNIT_SIZE (GET_MODE (src
))))
3902 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3903 rtx ptrue
= aarch64_ptrue_reg (VNx16BImode
);
3904 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (dest
), gen_rtvec (2, ptrue
, src
),
3906 emit_insn (gen_rtx_SET (dest
, unspec
));
3910 /* Return a copy of X with mode MODE, without changing its other
3911 attributes. Unlike gen_lowpart, this doesn't care whether the
3912 mode change is valid. */
3915 aarch64_replace_reg_mode (rtx x
, machine_mode mode
)
3917 if (GET_MODE (x
) == mode
)
3920 x
= shallow_copy_rtx (x
);
3921 set_mode_and_regno (x
, mode
, REGNO (x
));
3925 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3929 aarch64_split_sve_subreg_move (rtx dest
, rtx ptrue
, rtx src
)
3931 /* Decide which REV operation we need. The mode with narrower elements
3932 determines the mode of the operands and the mode with the wider
3933 elements determines the reverse width. */
3934 machine_mode mode_with_wider_elts
= GET_MODE (dest
);
3935 machine_mode mode_with_narrower_elts
= GET_MODE (src
);
3936 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts
)
3937 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts
))
3938 std::swap (mode_with_wider_elts
, mode_with_narrower_elts
);
3940 unsigned int wider_bytes
= GET_MODE_UNIT_SIZE (mode_with_wider_elts
);
3941 unsigned int unspec
;
3942 if (wider_bytes
== 8)
3943 unspec
= UNSPEC_REV64
;
3944 else if (wider_bytes
== 4)
3945 unspec
= UNSPEC_REV32
;
3946 else if (wider_bytes
== 2)
3947 unspec
= UNSPEC_REV16
;
3950 machine_mode pred_mode
= aarch64_sve_pred_mode (wider_bytes
).require ();
3954 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3955 UNSPEC_MERGE_PTRUE))
3957 with the appropriate modes. */
3958 ptrue
= gen_lowpart (pred_mode
, ptrue
);
3959 dest
= aarch64_replace_reg_mode (dest
, mode_with_narrower_elts
);
3960 src
= aarch64_replace_reg_mode (src
, mode_with_narrower_elts
);
3961 src
= gen_rtx_UNSPEC (mode_with_narrower_elts
, gen_rtvec (1, src
), unspec
);
3962 src
= gen_rtx_UNSPEC (mode_with_narrower_elts
, gen_rtvec (2, ptrue
, src
),
3963 UNSPEC_MERGE_PTRUE
);
3964 emit_insn (gen_rtx_SET (dest
, src
));
3968 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED
,
3969 tree exp ATTRIBUTE_UNUSED
)
3971 if (aarch64_simd_decl_p (cfun
->decl
) != aarch64_simd_decl_p (decl
))
3977 /* Implement TARGET_PASS_BY_REFERENCE. */
3980 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED
,
3983 bool named ATTRIBUTE_UNUSED
)
3986 machine_mode dummymode
;
3989 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3990 if (mode
== BLKmode
&& type
)
3991 size
= int_size_in_bytes (type
);
3993 /* No frontends can create types with variable-sized modes, so we
3994 shouldn't be asked to pass or return them. */
3995 size
= GET_MODE_SIZE (mode
).to_constant ();
3997 /* Aggregates are passed by reference based on their size. */
3998 if (type
&& AGGREGATE_TYPE_P (type
))
4000 size
= int_size_in_bytes (type
);
4003 /* Variable sized arguments are always returned by reference. */
4007 /* Can this be a candidate to be passed in fp/simd register(s)? */
4008 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
4013 /* Arguments which are variable sized or larger than 2 registers are
4014 passed by reference unless they are a homogenous floating point
4016 return size
> 2 * UNITS_PER_WORD
;
4019 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4021 aarch64_return_in_msb (const_tree valtype
)
4023 machine_mode dummy_mode
;
4026 /* Never happens in little-endian mode. */
4027 if (!BYTES_BIG_ENDIAN
)
4030 /* Only composite types smaller than or equal to 16 bytes can
4031 be potentially returned in registers. */
4032 if (!aarch64_composite_type_p (valtype
, TYPE_MODE (valtype
))
4033 || int_size_in_bytes (valtype
) <= 0
4034 || int_size_in_bytes (valtype
) > 16)
4037 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4038 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4039 is always passed/returned in the least significant bits of fp/simd
4041 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype
), valtype
,
4042 &dummy_mode
, &dummy_int
, NULL
))
4048 /* Implement TARGET_FUNCTION_VALUE.
4049 Define how to find the value returned by a function. */
4052 aarch64_function_value (const_tree type
, const_tree func
,
4053 bool outgoing ATTRIBUTE_UNUSED
)
4058 machine_mode ag_mode
;
4060 mode
= TYPE_MODE (type
);
4061 if (INTEGRAL_TYPE_P (type
))
4062 mode
= promote_function_mode (type
, mode
, &unsignedp
, func
, 1);
4064 if (aarch64_return_in_msb (type
))
4066 HOST_WIDE_INT size
= int_size_in_bytes (type
);
4068 if (size
% UNITS_PER_WORD
!= 0)
4070 size
+= UNITS_PER_WORD
- size
% UNITS_PER_WORD
;
4071 mode
= int_mode_for_size (size
* BITS_PER_UNIT
, 0).require ();
4075 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
4076 &ag_mode
, &count
, NULL
))
4078 if (!aarch64_composite_type_p (type
, mode
))
4080 gcc_assert (count
== 1 && mode
== ag_mode
);
4081 return gen_rtx_REG (mode
, V0_REGNUM
);
4088 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (count
));
4089 for (i
= 0; i
< count
; i
++)
4091 rtx tmp
= gen_rtx_REG (ag_mode
, V0_REGNUM
+ i
);
4092 rtx offset
= gen_int_mode (i
* GET_MODE_SIZE (ag_mode
), Pmode
);
4093 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
4094 XVECEXP (par
, 0, i
) = tmp
;
4100 return gen_rtx_REG (mode
, R0_REGNUM
);
4103 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
4104 Return true if REGNO is the number of a hard register in which the values
4105 of called function may come back. */
4108 aarch64_function_value_regno_p (const unsigned int regno
)
4110 /* Maximum of 16 bytes can be returned in the general registers. Examples
4111 of 16-byte return values are: 128-bit integers and 16-byte small
4112 structures (excluding homogeneous floating-point aggregates). */
4113 if (regno
== R0_REGNUM
|| regno
== R1_REGNUM
)
4116 /* Up to four fp/simd registers can return a function value, e.g. a
4117 homogeneous floating-point aggregate having four members. */
4118 if (regno
>= V0_REGNUM
&& regno
< V0_REGNUM
+ HA_MAX_NUM_FLDS
)
4119 return TARGET_FLOAT
;
4124 /* Implement TARGET_RETURN_IN_MEMORY.
4126 If the type T of the result of a function is such that
4128 would require that arg be passed as a value in a register (or set of
4129 registers) according to the parameter passing rules, then the result
4130 is returned in the same registers as would be used for such an
4134 aarch64_return_in_memory (const_tree type
, const_tree fndecl ATTRIBUTE_UNUSED
)
4137 machine_mode ag_mode
;
4140 if (!AGGREGATE_TYPE_P (type
)
4141 && TREE_CODE (type
) != COMPLEX_TYPE
4142 && TREE_CODE (type
) != VECTOR_TYPE
)
4143 /* Simple scalar types always returned in registers. */
4146 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
),
4153 /* Types larger than 2 registers returned in memory. */
4154 size
= int_size_in_bytes (type
);
4155 return (size
< 0 || size
> 2 * UNITS_PER_WORD
);
4159 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v
, machine_mode mode
,
4160 const_tree type
, int *nregs
)
4162 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
4163 return aarch64_vfp_is_call_or_return_candidate (mode
,
4165 &pcum
->aapcs_vfp_rmode
,
4170 /* Given MODE and TYPE of a function argument, return the alignment in
4171 bits. The idea is to suppress any stronger alignment requested by
4172 the user and opt for the natural alignment (specified in AAPCS64 \S
4173 4.1). ABI_BREAK is set to true if the alignment was incorrectly
4174 calculated in versions of GCC prior to GCC-9. This is a helper
4175 function for local use only. */
4178 aarch64_function_arg_alignment (machine_mode mode
, const_tree type
,
4183 return GET_MODE_ALIGNMENT (mode
);
4185 if (integer_zerop (TYPE_SIZE (type
)))
4188 gcc_assert (TYPE_MODE (type
) == mode
);
4190 if (!AGGREGATE_TYPE_P (type
))
4191 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type
));
4193 if (TREE_CODE (type
) == ARRAY_TYPE
)
4194 return TYPE_ALIGN (TREE_TYPE (type
));
4196 unsigned int alignment
= 0;
4197 unsigned int bitfield_alignment
= 0;
4198 for (tree field
= TYPE_FIELDS (type
); field
; field
= DECL_CHAIN (field
))
4199 if (TREE_CODE (field
) == FIELD_DECL
)
4201 alignment
= std::max (alignment
, DECL_ALIGN (field
));
4202 if (DECL_BIT_FIELD_TYPE (field
))
4204 = std::max (bitfield_alignment
,
4205 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field
)));
4208 if (bitfield_alignment
> alignment
)
4211 return bitfield_alignment
;
4217 /* Layout a function argument according to the AAPCS64 rules. The rule
4218 numbers refer to the rule numbers in the AAPCS64. */
4221 aarch64_layout_arg (cumulative_args_t pcum_v
, machine_mode mode
,
4223 bool named ATTRIBUTE_UNUSED
)
4225 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
4226 int ncrn
, nvrn
, nregs
;
4227 bool allocate_ncrn
, allocate_nvrn
;
4231 /* We need to do this once per argument. */
4232 if (pcum
->aapcs_arg_processed
)
4235 pcum
->aapcs_arg_processed
= true;
4237 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
4239 size
= int_size_in_bytes (type
);
4241 /* No frontends can create types with variable-sized modes, so we
4242 shouldn't be asked to pass or return them. */
4243 size
= GET_MODE_SIZE (mode
).to_constant ();
4244 size
= ROUND_UP (size
, UNITS_PER_WORD
);
4246 allocate_ncrn
= (type
) ? !(FLOAT_TYPE_P (type
)) : !FLOAT_MODE_P (mode
);
4247 allocate_nvrn
= aarch64_vfp_is_call_candidate (pcum_v
,
4252 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
4253 The following code thus handles passing by SIMD/FP registers first. */
4255 nvrn
= pcum
->aapcs_nvrn
;
4257 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
4258 and homogenous short-vector aggregates (HVA). */
4262 aarch64_err_no_fpadvsimd (mode
);
4264 if (nvrn
+ nregs
<= NUM_FP_ARG_REGS
)
4266 pcum
->aapcs_nextnvrn
= nvrn
+ nregs
;
4267 if (!aarch64_composite_type_p (type
, mode
))
4269 gcc_assert (nregs
== 1);
4270 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
4276 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
4277 for (i
= 0; i
< nregs
; i
++)
4279 rtx tmp
= gen_rtx_REG (pcum
->aapcs_vfp_rmode
,
4280 V0_REGNUM
+ nvrn
+ i
);
4281 rtx offset
= gen_int_mode
4282 (i
* GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
), Pmode
);
4283 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
4284 XVECEXP (par
, 0, i
) = tmp
;
4286 pcum
->aapcs_reg
= par
;
4292 /* C.3 NSRN is set to 8. */
4293 pcum
->aapcs_nextnvrn
= NUM_FP_ARG_REGS
;
4298 ncrn
= pcum
->aapcs_ncrn
;
4299 nregs
= size
/ UNITS_PER_WORD
;
4301 /* C6 - C9. though the sign and zero extension semantics are
4302 handled elsewhere. This is the case where the argument fits
4303 entirely general registers. */
4304 if (allocate_ncrn
&& (ncrn
+ nregs
<= NUM_ARG_REGS
))
4306 gcc_assert (nregs
== 0 || nregs
== 1 || nregs
== 2);
4308 /* C.8 if the argument has an alignment of 16 then the NGRN is
4309 rounded up to the next even number. */
4312 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
4313 comparison is there because for > 16 * BITS_PER_UNIT
4314 alignment nregs should be > 2 and therefore it should be
4315 passed by reference rather than value. */
4316 && (aarch64_function_arg_alignment (mode
, type
, &abi_break
)
4317 == 16 * BITS_PER_UNIT
))
4319 if (abi_break
&& warn_psabi
&& currently_expanding_gimple_stmt
)
4320 inform (input_location
, "parameter passing for argument of type "
4321 "%qT changed in GCC 9.1", type
);
4323 gcc_assert (ncrn
+ nregs
<= NUM_ARG_REGS
);
4326 /* NREGS can be 0 when e.g. an empty structure is to be passed.
4327 A reg is still generated for it, but the caller should be smart
4328 enough not to use it. */
4329 if (nregs
== 0 || nregs
== 1 || GET_MODE_CLASS (mode
) == MODE_INT
)
4330 pcum
->aapcs_reg
= gen_rtx_REG (mode
, R0_REGNUM
+ ncrn
);
4336 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
4337 for (i
= 0; i
< nregs
; i
++)
4339 rtx tmp
= gen_rtx_REG (word_mode
, R0_REGNUM
+ ncrn
+ i
);
4340 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
4341 GEN_INT (i
* UNITS_PER_WORD
));
4342 XVECEXP (par
, 0, i
) = tmp
;
4344 pcum
->aapcs_reg
= par
;
4347 pcum
->aapcs_nextncrn
= ncrn
+ nregs
;
4352 pcum
->aapcs_nextncrn
= NUM_ARG_REGS
;
4354 /* The argument is passed on stack; record the needed number of words for
4355 this argument and align the total size if necessary. */
4357 pcum
->aapcs_stack_words
= size
/ UNITS_PER_WORD
;
4359 if (aarch64_function_arg_alignment (mode
, type
, &abi_break
)
4360 == 16 * BITS_PER_UNIT
)
4362 int new_size
= ROUND_UP (pcum
->aapcs_stack_size
, 16 / UNITS_PER_WORD
);
4363 if (pcum
->aapcs_stack_size
!= new_size
)
4365 if (abi_break
&& warn_psabi
&& currently_expanding_gimple_stmt
)
4366 inform (input_location
, "parameter passing for argument of type "
4367 "%qT changed in GCC 9.1", type
);
4368 pcum
->aapcs_stack_size
= new_size
;
4374 /* Implement TARGET_FUNCTION_ARG. */
4377 aarch64_function_arg (cumulative_args_t pcum_v
, machine_mode mode
,
4378 const_tree type
, bool named
)
4380 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
4381 gcc_assert (pcum
->pcs_variant
== ARM_PCS_AAPCS64
);
4383 if (mode
== VOIDmode
)
4386 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
4387 return pcum
->aapcs_reg
;
4391 aarch64_init_cumulative_args (CUMULATIVE_ARGS
*pcum
,
4392 const_tree fntype ATTRIBUTE_UNUSED
,
4393 rtx libname ATTRIBUTE_UNUSED
,
4394 const_tree fndecl ATTRIBUTE_UNUSED
,
4395 unsigned n_named ATTRIBUTE_UNUSED
)
4397 pcum
->aapcs_ncrn
= 0;
4398 pcum
->aapcs_nvrn
= 0;
4399 pcum
->aapcs_nextncrn
= 0;
4400 pcum
->aapcs_nextnvrn
= 0;
4401 pcum
->pcs_variant
= ARM_PCS_AAPCS64
;
4402 pcum
->aapcs_reg
= NULL_RTX
;
4403 pcum
->aapcs_arg_processed
= false;
4404 pcum
->aapcs_stack_words
= 0;
4405 pcum
->aapcs_stack_size
= 0;
4408 && fndecl
&& TREE_PUBLIC (fndecl
)
4409 && fntype
&& fntype
!= error_mark_node
)
4411 const_tree type
= TREE_TYPE (fntype
);
4412 machine_mode mode ATTRIBUTE_UNUSED
; /* To pass pointer as argument. */
4413 int nregs ATTRIBUTE_UNUSED
; /* Likewise. */
4414 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
), type
,
4415 &mode
, &nregs
, NULL
))
4416 aarch64_err_no_fpadvsimd (TYPE_MODE (type
));
4422 aarch64_function_arg_advance (cumulative_args_t pcum_v
,
4427 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
4428 if (pcum
->pcs_variant
== ARM_PCS_AAPCS64
)
4430 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
4431 gcc_assert ((pcum
->aapcs_reg
!= NULL_RTX
)
4432 != (pcum
->aapcs_stack_words
!= 0));
4433 pcum
->aapcs_arg_processed
= false;
4434 pcum
->aapcs_ncrn
= pcum
->aapcs_nextncrn
;
4435 pcum
->aapcs_nvrn
= pcum
->aapcs_nextnvrn
;
4436 pcum
->aapcs_stack_size
+= pcum
->aapcs_stack_words
;
4437 pcum
->aapcs_stack_words
= 0;
4438 pcum
->aapcs_reg
= NULL_RTX
;
4443 aarch64_function_arg_regno_p (unsigned regno
)
4445 return ((GP_REGNUM_P (regno
) && regno
< R0_REGNUM
+ NUM_ARG_REGS
)
4446 || (FP_REGNUM_P (regno
) && regno
< V0_REGNUM
+ NUM_FP_ARG_REGS
));
4449 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4450 PARM_BOUNDARY bits of alignment, but will be given anything up
4451 to STACK_BOUNDARY bits if the type requires it. This makes sure
4452 that both before and after the layout of each argument, the Next
4453 Stacked Argument Address (NSAA) will have a minimum alignment of
4457 aarch64_function_arg_boundary (machine_mode mode
, const_tree type
)
4460 unsigned int alignment
= aarch64_function_arg_alignment (mode
, type
,
4462 if (abi_break
& warn_psabi
)
4463 inform (input_location
, "parameter passing for argument of type "
4464 "%qT changed in GCC 9.1", type
);
4466 return MIN (MAX (alignment
, PARM_BOUNDARY
), STACK_BOUNDARY
);
4469 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4471 static fixed_size_mode
4472 aarch64_get_reg_raw_mode (int regno
)
4474 if (TARGET_SVE
&& FP_REGNUM_P (regno
))
4475 /* Don't use the SVE part of the register for __builtin_apply and
4476 __builtin_return. The SVE registers aren't used by the normal PCS,
4477 so using them there would be a waste of time. The PCS extensions
4478 for SVE types are fundamentally incompatible with the
4479 __builtin_return/__builtin_apply interface. */
4480 return as_a
<fixed_size_mode
> (V16QImode
);
4481 return default_get_reg_raw_mode (regno
);
4484 /* Implement TARGET_FUNCTION_ARG_PADDING.
4486 Small aggregate types are placed in the lowest memory address.
4488 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4490 static pad_direction
4491 aarch64_function_arg_padding (machine_mode mode
, const_tree type
)
4493 /* On little-endian targets, the least significant byte of every stack
4494 argument is passed at the lowest byte address of the stack slot. */
4495 if (!BYTES_BIG_ENDIAN
)
4498 /* Otherwise, integral, floating-point and pointer types are padded downward:
4499 the least significant byte of a stack argument is passed at the highest
4500 byte address of the stack slot. */
4502 ? (INTEGRAL_TYPE_P (type
) || SCALAR_FLOAT_TYPE_P (type
)
4503 || POINTER_TYPE_P (type
))
4504 : (SCALAR_INT_MODE_P (mode
) || SCALAR_FLOAT_MODE_P (mode
)))
4505 return PAD_DOWNWARD
;
4507 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4511 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4513 It specifies padding for the last (may also be the only)
4514 element of a block move between registers and memory. If
4515 assuming the block is in the memory, padding upward means that
4516 the last element is padded after its highest significant byte,
4517 while in downward padding, the last element is padded at the
4518 its least significant byte side.
4520 Small aggregates and small complex types are always padded
4523 We don't need to worry about homogeneous floating-point or
4524 short-vector aggregates; their move is not affected by the
4525 padding direction determined here. Regardless of endianness,
4526 each element of such an aggregate is put in the least
4527 significant bits of a fp/simd register.
4529 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4530 register has useful data, and return the opposite if the most
4531 significant byte does. */
4534 aarch64_pad_reg_upward (machine_mode mode
, const_tree type
,
4535 bool first ATTRIBUTE_UNUSED
)
4538 /* Small composite types are always padded upward. */
4539 if (BYTES_BIG_ENDIAN
&& aarch64_composite_type_p (type
, mode
))
4543 size
= int_size_in_bytes (type
);
4545 /* No frontends can create types with variable-sized modes, so we
4546 shouldn't be asked to pass or return them. */
4547 size
= GET_MODE_SIZE (mode
).to_constant ();
4548 if (size
< 2 * UNITS_PER_WORD
)
4552 /* Otherwise, use the default padding. */
4553 return !BYTES_BIG_ENDIAN
;
4556 static scalar_int_mode
4557 aarch64_libgcc_cmp_return_mode (void)
4562 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4564 /* We use the 12-bit shifted immediate arithmetic instructions so values
4565 must be multiple of (1 << 12), i.e. 4096. */
4566 #define ARITH_FACTOR 4096
4568 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4569 #error Cannot use simple address calculation for stack probing
4572 /* The pair of scratch registers used for stack probing. */
4573 #define PROBE_STACK_FIRST_REG R9_REGNUM
4574 #define PROBE_STACK_SECOND_REG R10_REGNUM
4576 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4577 inclusive. These are offsets from the current stack pointer. */
4580 aarch64_emit_probe_stack_range (HOST_WIDE_INT first
, poly_int64 poly_size
)
4583 if (!poly_size
.is_constant (&size
))
4585 sorry ("stack probes for SVE frames");
4589 rtx reg1
= gen_rtx_REG (Pmode
, PROBE_STACK_FIRST_REG
);
4591 /* See the same assertion on PROBE_INTERVAL above. */
4592 gcc_assert ((first
% ARITH_FACTOR
) == 0);
4594 /* See if we have a constant small number of probes to generate. If so,
4595 that's the easy case. */
4596 if (size
<= PROBE_INTERVAL
)
4598 const HOST_WIDE_INT base
= ROUND_UP (size
, ARITH_FACTOR
);
4600 emit_set_insn (reg1
,
4601 plus_constant (Pmode
,
4602 stack_pointer_rtx
, -(first
+ base
)));
4603 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- size
));
4606 /* The run-time loop is made up of 8 insns in the generic case while the
4607 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
4608 else if (size
<= 4 * PROBE_INTERVAL
)
4610 HOST_WIDE_INT i
, rem
;
4612 emit_set_insn (reg1
,
4613 plus_constant (Pmode
,
4615 -(first
+ PROBE_INTERVAL
)));
4616 emit_stack_probe (reg1
);
4618 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4619 it exceeds SIZE. If only two probes are needed, this will not
4620 generate any code. Then probe at FIRST + SIZE. */
4621 for (i
= 2 * PROBE_INTERVAL
; i
< size
; i
+= PROBE_INTERVAL
)
4623 emit_set_insn (reg1
,
4624 plus_constant (Pmode
, reg1
, -PROBE_INTERVAL
));
4625 emit_stack_probe (reg1
);
4628 rem
= size
- (i
- PROBE_INTERVAL
);
4631 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
4633 emit_set_insn (reg1
, plus_constant (Pmode
, reg1
, -base
));
4634 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- rem
));
4637 emit_stack_probe (plus_constant (Pmode
, reg1
, -rem
));
4640 /* Otherwise, do the same as above, but in a loop. Note that we must be
4641 extra careful with variables wrapping around because we might be at
4642 the very top (or the very bottom) of the address space and we have
4643 to be able to handle this case properly; in particular, we use an
4644 equality test for the loop condition. */
4647 rtx reg2
= gen_rtx_REG (Pmode
, PROBE_STACK_SECOND_REG
);
4649 /* Step 1: round SIZE to the previous multiple of the interval. */
4651 HOST_WIDE_INT rounded_size
= size
& -PROBE_INTERVAL
;
4654 /* Step 2: compute initial and final value of the loop counter. */
4656 /* TEST_ADDR = SP + FIRST. */
4657 emit_set_insn (reg1
,
4658 plus_constant (Pmode
, stack_pointer_rtx
, -first
));
4660 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
4661 HOST_WIDE_INT adjustment
= - (first
+ rounded_size
);
4662 if (! aarch64_uimm12_shift (adjustment
))
4664 aarch64_internal_mov_immediate (reg2
, GEN_INT (adjustment
),
4666 emit_set_insn (reg2
, gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, reg2
));
4669 emit_set_insn (reg2
,
4670 plus_constant (Pmode
, stack_pointer_rtx
, adjustment
));
4676 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
4679 while (TEST_ADDR != LAST_ADDR)
4681 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
4682 until it is equal to ROUNDED_SIZE. */
4684 emit_insn (gen_probe_stack_range (reg1
, reg1
, reg2
));
4687 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
4688 that SIZE is equal to ROUNDED_SIZE. */
4690 if (size
!= rounded_size
)
4692 HOST_WIDE_INT rem
= size
- rounded_size
;
4696 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
4698 emit_set_insn (reg2
, plus_constant (Pmode
, reg2
, -base
));
4699 emit_stack_probe (plus_constant (Pmode
, reg2
, base
- rem
));
4702 emit_stack_probe (plus_constant (Pmode
, reg2
, -rem
));
4706 /* Make sure nothing is scheduled before we are done. */
4707 emit_insn (gen_blockage ());
4710 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
4711 absolute addresses. */
4714 aarch64_output_probe_stack_range (rtx reg1
, rtx reg2
)
4716 static int labelno
= 0;
4720 ASM_GENERATE_INTERNAL_LABEL (loop_lab
, "LPSRL", labelno
++);
4723 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_lab
);
4725 HOST_WIDE_INT stack_clash_probe_interval
4726 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
);
4728 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
4730 HOST_WIDE_INT interval
;
4731 if (flag_stack_clash_protection
)
4732 interval
= stack_clash_probe_interval
;
4734 interval
= PROBE_INTERVAL
;
4736 gcc_assert (aarch64_uimm12_shift (interval
));
4737 xops
[1] = GEN_INT (interval
);
4739 output_asm_insn ("sub\t%0, %0, %1", xops
);
4741 /* If doing stack clash protection then we probe up by the ABI specified
4742 amount. We do this because we're dropping full pages at a time in the
4743 loop. But if we're doing non-stack clash probing, probe at SP 0. */
4744 if (flag_stack_clash_protection
)
4745 xops
[1] = GEN_INT (STACK_CLASH_CALLER_GUARD
);
4747 xops
[1] = CONST0_RTX (GET_MODE (xops
[1]));
4749 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
4750 by this amount for each iteration. */
4751 output_asm_insn ("str\txzr, [%0, %1]", xops
);
4753 /* Test if TEST_ADDR == LAST_ADDR. */
4755 output_asm_insn ("cmp\t%0, %1", xops
);
4758 fputs ("\tb.ne\t", asm_out_file
);
4759 assemble_name_raw (asm_out_file
, loop_lab
);
4760 fputc ('\n', asm_out_file
);
4765 /* Emit the probe loop for doing stack clash probes and stack adjustments for
4766 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
4767 of GUARD_SIZE. When a probe is emitted it is done at most
4768 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
4769 at most MIN_PROBE_THRESHOLD. By the end of this function
4770 BASE = BASE - ADJUSTMENT. */
4773 aarch64_output_probe_sve_stack_clash (rtx base
, rtx adjustment
,
4774 rtx min_probe_threshold
, rtx guard_size
)
4776 /* This function is not allowed to use any instruction generation function
4777 like gen_ and friends. If you do you'll likely ICE during CFG validation,
4778 so instead emit the code you want using output_asm_insn. */
4779 gcc_assert (flag_stack_clash_protection
);
4780 gcc_assert (CONST_INT_P (min_probe_threshold
) && CONST_INT_P (guard_size
));
4781 gcc_assert (INTVAL (guard_size
) > INTVAL (min_probe_threshold
));
4783 /* The minimum required allocation before the residual requires probing. */
4784 HOST_WIDE_INT residual_probe_guard
= INTVAL (min_probe_threshold
);
4786 /* Clamp the value down to the nearest value that can be used with a cmp. */
4787 residual_probe_guard
= aarch64_clamp_to_uimm12_shift (residual_probe_guard
);
4788 rtx probe_offset_value_rtx
= gen_int_mode (residual_probe_guard
, Pmode
);
4790 gcc_assert (INTVAL (min_probe_threshold
) >= residual_probe_guard
);
4791 gcc_assert (aarch64_uimm12_shift (residual_probe_guard
));
4793 static int labelno
= 0;
4794 char loop_start_lab
[32];
4795 char loop_end_lab
[32];
4798 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab
, "SVLPSPL", labelno
);
4799 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab
, "SVLPEND", labelno
++);
4801 /* Emit loop start label. */
4802 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_start_lab
);
4804 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
4805 xops
[0] = adjustment
;
4806 xops
[1] = probe_offset_value_rtx
;
4807 output_asm_insn ("cmp\t%0, %1", xops
);
4809 /* Branch to end if not enough adjustment to probe. */
4810 fputs ("\tb.lt\t", asm_out_file
);
4811 assemble_name_raw (asm_out_file
, loop_end_lab
);
4812 fputc ('\n', asm_out_file
);
4814 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
4816 xops
[1] = probe_offset_value_rtx
;
4817 output_asm_insn ("sub\t%0, %0, %1", xops
);
4819 /* Probe at BASE. */
4820 xops
[1] = const0_rtx
;
4821 output_asm_insn ("str\txzr, [%0, %1]", xops
);
4823 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
4824 xops
[0] = adjustment
;
4825 xops
[1] = probe_offset_value_rtx
;
4826 output_asm_insn ("sub\t%0, %0, %1", xops
);
4828 /* Branch to start if still more bytes to allocate. */
4829 fputs ("\tb\t", asm_out_file
);
4830 assemble_name_raw (asm_out_file
, loop_start_lab
);
4831 fputc ('\n', asm_out_file
);
4833 /* No probe leave. */
4834 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_end_lab
);
4836 /* BASE = BASE - ADJUSTMENT. */
4838 xops
[1] = adjustment
;
4839 output_asm_insn ("sub\t%0, %0, %1", xops
);
4843 /* Determine whether a frame chain needs to be generated. */
4845 aarch64_needs_frame_chain (void)
4847 /* Force a frame chain for EH returns so the return address is at FP+8. */
4848 if (frame_pointer_needed
|| crtl
->calls_eh_return
)
4851 /* A leaf function cannot have calls or write LR. */
4852 bool is_leaf
= crtl
->is_leaf
&& !df_regs_ever_live_p (LR_REGNUM
);
4854 /* Don't use a frame chain in leaf functions if leaf frame pointers
4856 if (flag_omit_leaf_frame_pointer
&& is_leaf
)
4859 return aarch64_use_frame_pointer
;
4862 /* Mark the registers that need to be saved by the callee and calculate
4863 the size of the callee-saved registers area and frame record (both FP
4864 and LR may be omitted). */
4866 aarch64_layout_frame (void)
4868 HOST_WIDE_INT offset
= 0;
4869 int regno
, last_fp_reg
= INVALID_REGNUM
;
4870 bool simd_function
= aarch64_simd_decl_p (cfun
->decl
);
4872 cfun
->machine
->frame
.emit_frame_chain
= aarch64_needs_frame_chain ();
4874 /* Adjust the outgoing arguments size if required. Keep it in sync with what
4875 the mid-end is doing. */
4876 crtl
->outgoing_args_size
= STACK_DYNAMIC_OFFSET (cfun
);
4878 #define SLOT_NOT_REQUIRED (-2)
4879 #define SLOT_REQUIRED (-1)
4881 cfun
->machine
->frame
.wb_candidate1
= INVALID_REGNUM
;
4882 cfun
->machine
->frame
.wb_candidate2
= INVALID_REGNUM
;
4884 /* If this is a non-leaf simd function with calls we assume that
4885 at least one of those calls is to a non-simd function and thus
4886 we must save V8 to V23 in the prologue. */
4888 if (simd_function
&& !crtl
->is_leaf
)
4890 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
4891 if (FP_SIMD_SAVED_REGNUM_P (regno
))
4892 df_set_regs_ever_live (regno
, true);
4895 /* First mark all the registers that really need to be saved... */
4896 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
4897 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
4899 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
4900 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
4902 /* ... that includes the eh data registers (if needed)... */
4903 if (crtl
->calls_eh_return
)
4904 for (regno
= 0; EH_RETURN_DATA_REGNO (regno
) != INVALID_REGNUM
; regno
++)
4905 cfun
->machine
->frame
.reg_offset
[EH_RETURN_DATA_REGNO (regno
)]
4908 /* ... and any callee saved register that dataflow says is live. */
4909 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
4910 if (df_regs_ever_live_p (regno
)
4911 && (regno
== R30_REGNUM
4912 || !call_used_regs
[regno
]))
4913 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
4915 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
4916 if (df_regs_ever_live_p (regno
)
4917 && (!call_used_regs
[regno
]
4918 || (simd_function
&& FP_SIMD_SAVED_REGNUM_P (regno
))))
4920 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
4921 last_fp_reg
= regno
;
4924 if (cfun
->machine
->frame
.emit_frame_chain
)
4926 /* FP and LR are placed in the linkage record. */
4927 cfun
->machine
->frame
.reg_offset
[R29_REGNUM
] = 0;
4928 cfun
->machine
->frame
.wb_candidate1
= R29_REGNUM
;
4929 cfun
->machine
->frame
.reg_offset
[R30_REGNUM
] = UNITS_PER_WORD
;
4930 cfun
->machine
->frame
.wb_candidate2
= R30_REGNUM
;
4931 offset
= 2 * UNITS_PER_WORD
;
4934 /* With stack-clash, LR must be saved in non-leaf functions. */
4935 gcc_assert (crtl
->is_leaf
4936 || (cfun
->machine
->frame
.reg_offset
[R30_REGNUM
]
4937 != SLOT_NOT_REQUIRED
));
4939 /* Now assign stack slots for them. */
4940 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
4941 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
4943 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
4944 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
4945 cfun
->machine
->frame
.wb_candidate1
= regno
;
4946 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
)
4947 cfun
->machine
->frame
.wb_candidate2
= regno
;
4948 offset
+= UNITS_PER_WORD
;
4951 HOST_WIDE_INT max_int_offset
= offset
;
4952 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
4953 bool has_align_gap
= offset
!= max_int_offset
;
4955 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
4956 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
4958 /* If there is an alignment gap between integer and fp callee-saves,
4959 allocate the last fp register to it if possible. */
4960 if (regno
== last_fp_reg
4963 && (offset
& 8) == 0)
4965 cfun
->machine
->frame
.reg_offset
[regno
] = max_int_offset
;
4969 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
4970 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
4971 cfun
->machine
->frame
.wb_candidate1
= regno
;
4972 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
4973 && cfun
->machine
->frame
.wb_candidate1
>= V0_REGNUM
)
4974 cfun
->machine
->frame
.wb_candidate2
= regno
;
4975 offset
+= simd_function
? UNITS_PER_VREG
: UNITS_PER_WORD
;
4978 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
4980 cfun
->machine
->frame
.saved_regs_size
= offset
;
4982 HOST_WIDE_INT varargs_and_saved_regs_size
4983 = offset
+ cfun
->machine
->frame
.saved_varargs_size
;
4985 cfun
->machine
->frame
.hard_fp_offset
4986 = aligned_upper_bound (varargs_and_saved_regs_size
4987 + get_frame_size (),
4988 STACK_BOUNDARY
/ BITS_PER_UNIT
);
4990 /* Both these values are already aligned. */
4991 gcc_assert (multiple_p (crtl
->outgoing_args_size
,
4992 STACK_BOUNDARY
/ BITS_PER_UNIT
));
4993 cfun
->machine
->frame
.frame_size
4994 = (cfun
->machine
->frame
.hard_fp_offset
4995 + crtl
->outgoing_args_size
);
4997 cfun
->machine
->frame
.locals_offset
= cfun
->machine
->frame
.saved_varargs_size
;
4999 cfun
->machine
->frame
.initial_adjust
= 0;
5000 cfun
->machine
->frame
.final_adjust
= 0;
5001 cfun
->machine
->frame
.callee_adjust
= 0;
5002 cfun
->machine
->frame
.callee_offset
= 0;
5004 HOST_WIDE_INT max_push_offset
= 0;
5005 if (cfun
->machine
->frame
.wb_candidate2
!= INVALID_REGNUM
)
5006 max_push_offset
= 512;
5007 else if (cfun
->machine
->frame
.wb_candidate1
!= INVALID_REGNUM
)
5008 max_push_offset
= 256;
5010 HOST_WIDE_INT const_size
, const_fp_offset
;
5011 if (cfun
->machine
->frame
.frame_size
.is_constant (&const_size
)
5012 && const_size
< max_push_offset
5013 && known_eq (crtl
->outgoing_args_size
, 0))
5015 /* Simple, small frame with no outgoing arguments:
5016 stp reg1, reg2, [sp, -frame_size]!
5017 stp reg3, reg4, [sp, 16] */
5018 cfun
->machine
->frame
.callee_adjust
= const_size
;
5020 else if (known_lt (crtl
->outgoing_args_size
5021 + cfun
->machine
->frame
.saved_regs_size
, 512)
5022 && !(cfun
->calls_alloca
5023 && known_lt (cfun
->machine
->frame
.hard_fp_offset
,
5026 /* Frame with small outgoing arguments:
5027 sub sp, sp, frame_size
5028 stp reg1, reg2, [sp, outgoing_args_size]
5029 stp reg3, reg4, [sp, outgoing_args_size + 16] */
5030 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.frame_size
;
5031 cfun
->machine
->frame
.callee_offset
5032 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.hard_fp_offset
;
5034 else if (cfun
->machine
->frame
.hard_fp_offset
.is_constant (&const_fp_offset
)
5035 && const_fp_offset
< max_push_offset
)
5037 /* Frame with large outgoing arguments but a small local area:
5038 stp reg1, reg2, [sp, -hard_fp_offset]!
5039 stp reg3, reg4, [sp, 16]
5040 sub sp, sp, outgoing_args_size */
5041 cfun
->machine
->frame
.callee_adjust
= const_fp_offset
;
5042 cfun
->machine
->frame
.final_adjust
5043 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.callee_adjust
;
5047 /* Frame with large local area and outgoing arguments using frame pointer:
5048 sub sp, sp, hard_fp_offset
5049 stp x29, x30, [sp, 0]
5051 stp reg3, reg4, [sp, 16]
5052 sub sp, sp, outgoing_args_size */
5053 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.hard_fp_offset
;
5054 cfun
->machine
->frame
.final_adjust
5055 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.initial_adjust
;
5058 cfun
->machine
->frame
.laid_out
= true;
5061 /* Return true if the register REGNO is saved on entry to
5062 the current function. */
5065 aarch64_register_saved_on_entry (int regno
)
5067 return cfun
->machine
->frame
.reg_offset
[regno
] >= 0;
5070 /* Return the next register up from REGNO up to LIMIT for the callee
5074 aarch64_next_callee_save (unsigned regno
, unsigned limit
)
5076 while (regno
<= limit
&& !aarch64_register_saved_on_entry (regno
))
5081 /* Push the register number REGNO of mode MODE to the stack with write-back
5082 adjusting the stack by ADJUSTMENT. */
5085 aarch64_pushwb_single_reg (machine_mode mode
, unsigned regno
,
5086 HOST_WIDE_INT adjustment
)
5088 rtx base_rtx
= stack_pointer_rtx
;
5091 reg
= gen_rtx_REG (mode
, regno
);
5092 mem
= gen_rtx_PRE_MODIFY (Pmode
, base_rtx
,
5093 plus_constant (Pmode
, base_rtx
, -adjustment
));
5094 mem
= gen_frame_mem (mode
, mem
);
5096 insn
= emit_move_insn (mem
, reg
);
5097 RTX_FRAME_RELATED_P (insn
) = 1;
5100 /* Generate and return an instruction to store the pair of registers
5101 REG and REG2 of mode MODE to location BASE with write-back adjusting
5102 the stack location BASE by ADJUSTMENT. */
5105 aarch64_gen_storewb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
5106 HOST_WIDE_INT adjustment
)
5111 return gen_storewb_pairdi_di (base
, base
, reg
, reg2
,
5112 GEN_INT (-adjustment
),
5113 GEN_INT (UNITS_PER_WORD
- adjustment
));
5115 return gen_storewb_pairdf_di (base
, base
, reg
, reg2
,
5116 GEN_INT (-adjustment
),
5117 GEN_INT (UNITS_PER_WORD
- adjustment
));
5119 return gen_storewb_pairtf_di (base
, base
, reg
, reg2
,
5120 GEN_INT (-adjustment
),
5121 GEN_INT (UNITS_PER_VREG
- adjustment
));
5127 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
5128 stack pointer by ADJUSTMENT. */
5131 aarch64_push_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
)
5134 machine_mode mode
= aarch64_reg_save_mode (cfun
->decl
, regno1
);
5136 if (regno2
== INVALID_REGNUM
)
5137 return aarch64_pushwb_single_reg (mode
, regno1
, adjustment
);
5139 rtx reg1
= gen_rtx_REG (mode
, regno1
);
5140 rtx reg2
= gen_rtx_REG (mode
, regno2
);
5142 insn
= emit_insn (aarch64_gen_storewb_pair (mode
, stack_pointer_rtx
, reg1
,
5144 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 2)) = 1;
5145 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
5146 RTX_FRAME_RELATED_P (insn
) = 1;
5149 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
5150 adjusting it by ADJUSTMENT afterwards. */
5153 aarch64_gen_loadwb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
5154 HOST_WIDE_INT adjustment
)
5159 return gen_loadwb_pairdi_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
5160 GEN_INT (UNITS_PER_WORD
));
5162 return gen_loadwb_pairdf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
5163 GEN_INT (UNITS_PER_WORD
));
5165 return gen_loadwb_pairtf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
5166 GEN_INT (UNITS_PER_VREG
));
5172 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
5173 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
5177 aarch64_pop_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
,
5180 machine_mode mode
= aarch64_reg_save_mode (cfun
->decl
, regno1
);
5181 rtx reg1
= gen_rtx_REG (mode
, regno1
);
5183 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg1
, *cfi_ops
);
5185 if (regno2
== INVALID_REGNUM
)
5187 rtx mem
= plus_constant (Pmode
, stack_pointer_rtx
, adjustment
);
5188 mem
= gen_rtx_POST_MODIFY (Pmode
, stack_pointer_rtx
, mem
);
5189 emit_move_insn (reg1
, gen_frame_mem (mode
, mem
));
5193 rtx reg2
= gen_rtx_REG (mode
, regno2
);
5194 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
5195 emit_insn (aarch64_gen_loadwb_pair (mode
, stack_pointer_rtx
, reg1
,
5200 /* Generate and return a store pair instruction of mode MODE to store
5201 register REG1 to MEM1 and register REG2 to MEM2. */
5204 aarch64_gen_store_pair (machine_mode mode
, rtx mem1
, rtx reg1
, rtx mem2
,
5210 return gen_store_pair_dw_didi (mem1
, reg1
, mem2
, reg2
);
5213 return gen_store_pair_dw_dfdf (mem1
, reg1
, mem2
, reg2
);
5216 return gen_store_pair_dw_tftf (mem1
, reg1
, mem2
, reg2
);
5223 /* Generate and regurn a load pair isntruction of mode MODE to load register
5224 REG1 from MEM1 and register REG2 from MEM2. */
5227 aarch64_gen_load_pair (machine_mode mode
, rtx reg1
, rtx mem1
, rtx reg2
,
5233 return gen_load_pair_dw_didi (reg1
, mem1
, reg2
, mem2
);
5236 return gen_load_pair_dw_dfdf (reg1
, mem1
, reg2
, mem2
);
5239 return gen_load_pair_dw_tftf (reg1
, mem1
, reg2
, mem2
);
5246 /* Return TRUE if return address signing should be enabled for the current
5247 function, otherwise return FALSE. */
5250 aarch64_return_address_signing_enabled (void)
5252 /* This function should only be called after frame laid out. */
5253 gcc_assert (cfun
->machine
->frame
.laid_out
);
5255 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
5256 if its LR is pushed onto stack. */
5257 return (aarch64_ra_sign_scope
== AARCH64_FUNCTION_ALL
5258 || (aarch64_ra_sign_scope
== AARCH64_FUNCTION_NON_LEAF
5259 && cfun
->machine
->frame
.reg_offset
[LR_REGNUM
] >= 0));
5262 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
5264 aarch64_bti_enabled (void)
5266 return (aarch64_enable_bti
== 1);
5269 /* Emit code to save the callee-saved registers from register number START
5270 to LIMIT to the stack at the location starting at offset START_OFFSET,
5271 skipping any write-back candidates if SKIP_WB is true. */
5274 aarch64_save_callee_saves (machine_mode mode
, poly_int64 start_offset
,
5275 unsigned start
, unsigned limit
, bool skip_wb
)
5281 for (regno
= aarch64_next_callee_save (start
, limit
);
5283 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
5290 && (regno
== cfun
->machine
->frame
.wb_candidate1
5291 || regno
== cfun
->machine
->frame
.wb_candidate2
))
5294 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
5297 reg
= gen_rtx_REG (mode
, regno
);
5298 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
5299 mem
= gen_frame_mem (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
5302 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
5303 offset_diff
= cfun
->machine
->frame
.reg_offset
[regno2
]
5304 - cfun
->machine
->frame
.reg_offset
[regno
];
5307 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
5308 && known_eq (GET_MODE_SIZE (mode
), offset_diff
))
5310 rtx reg2
= gen_rtx_REG (mode
, regno2
);
5313 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
5314 mem2
= gen_frame_mem (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
5316 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
,
5319 /* The first part of a frame-related parallel insn is
5320 always assumed to be relevant to the frame
5321 calculations; subsequent parts, are only
5322 frame-related if explicitly marked. */
5323 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
5327 insn
= emit_move_insn (mem
, reg
);
5329 RTX_FRAME_RELATED_P (insn
) = 1;
5333 /* Emit code to restore the callee registers of mode MODE from register
5334 number START up to and including LIMIT. Restore from the stack offset
5335 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
5336 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
5339 aarch64_restore_callee_saves (machine_mode mode
,
5340 poly_int64 start_offset
, unsigned start
,
5341 unsigned limit
, bool skip_wb
, rtx
*cfi_ops
)
5343 rtx base_rtx
= stack_pointer_rtx
;
5348 for (regno
= aarch64_next_callee_save (start
, limit
);
5350 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
5352 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
5359 && (regno
== cfun
->machine
->frame
.wb_candidate1
5360 || regno
== cfun
->machine
->frame
.wb_candidate2
))
5363 reg
= gen_rtx_REG (mode
, regno
);
5364 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
5365 mem
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
5367 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
5368 offset_diff
= cfun
->machine
->frame
.reg_offset
[regno2
]
5369 - cfun
->machine
->frame
.reg_offset
[regno
];
5372 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
5373 && known_eq (GET_MODE_SIZE (mode
), offset_diff
))
5375 rtx reg2
= gen_rtx_REG (mode
, regno2
);
5378 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
5379 mem2
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
5380 emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
5382 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
5386 emit_move_insn (reg
, mem
);
5387 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg
, *cfi_ops
);
5391 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5395 offset_4bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
5397 HOST_WIDE_INT multiple
;
5398 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
5399 && IN_RANGE (multiple
, -8, 7));
5402 /* Return true if OFFSET is a unsigned 6-bit value multiplied by the size
5406 offset_6bit_unsigned_scaled_p (machine_mode mode
, poly_int64 offset
)
5408 HOST_WIDE_INT multiple
;
5409 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
5410 && IN_RANGE (multiple
, 0, 63));
5413 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5417 aarch64_offset_7bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
5419 HOST_WIDE_INT multiple
;
5420 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
5421 && IN_RANGE (multiple
, -64, 63));
5424 /* Return true if OFFSET is a signed 9-bit value. */
5427 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED
,
5430 HOST_WIDE_INT const_offset
;
5431 return (offset
.is_constant (&const_offset
)
5432 && IN_RANGE (const_offset
, -256, 255));
5435 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5439 offset_9bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
5441 HOST_WIDE_INT multiple
;
5442 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
5443 && IN_RANGE (multiple
, -256, 255));
5446 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5450 offset_12bit_unsigned_scaled_p (machine_mode mode
, poly_int64 offset
)
5452 HOST_WIDE_INT multiple
;
5453 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
5454 && IN_RANGE (multiple
, 0, 4095));
5457 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5460 aarch64_get_separate_components (void)
5462 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
5463 bitmap_clear (components
);
5465 /* The registers we need saved to the frame. */
5466 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
5467 if (aarch64_register_saved_on_entry (regno
))
5469 poly_int64 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
5470 if (!frame_pointer_needed
)
5471 offset
+= cfun
->machine
->frame
.frame_size
5472 - cfun
->machine
->frame
.hard_fp_offset
;
5473 /* Check that we can access the stack slot of the register with one
5474 direct load with no adjustments needed. */
5475 if (offset_12bit_unsigned_scaled_p (DImode
, offset
))
5476 bitmap_set_bit (components
, regno
);
5479 /* Don't mess with the hard frame pointer. */
5480 if (frame_pointer_needed
)
5481 bitmap_clear_bit (components
, HARD_FRAME_POINTER_REGNUM
);
5483 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
5484 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
5485 /* If registers have been chosen to be stored/restored with
5486 writeback don't interfere with them to avoid having to output explicit
5487 stack adjustment instructions. */
5488 if (reg2
!= INVALID_REGNUM
)
5489 bitmap_clear_bit (components
, reg2
);
5490 if (reg1
!= INVALID_REGNUM
)
5491 bitmap_clear_bit (components
, reg1
);
5493 bitmap_clear_bit (components
, LR_REGNUM
);
5494 bitmap_clear_bit (components
, SP_REGNUM
);
5499 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5502 aarch64_components_for_bb (basic_block bb
)
5504 bitmap in
= DF_LIVE_IN (bb
);
5505 bitmap gen
= &DF_LIVE_BB_INFO (bb
)->gen
;
5506 bitmap kill
= &DF_LIVE_BB_INFO (bb
)->kill
;
5507 bool simd_function
= aarch64_simd_decl_p (cfun
->decl
);
5509 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
5510 bitmap_clear (components
);
5512 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5513 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
5514 if ((!call_used_regs
[regno
]
5515 || (simd_function
&& FP_SIMD_SAVED_REGNUM_P (regno
)))
5516 && (bitmap_bit_p (in
, regno
)
5517 || bitmap_bit_p (gen
, regno
)
5518 || bitmap_bit_p (kill
, regno
)))
5520 unsigned regno2
, offset
, offset2
;
5521 bitmap_set_bit (components
, regno
);
5523 /* If there is a callee-save at an adjacent offset, add it too
5524 to increase the use of LDP/STP. */
5525 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
5526 regno2
= ((offset
& 8) == 0) ? regno
+ 1 : regno
- 1;
5528 if (regno2
<= LAST_SAVED_REGNUM
)
5530 offset2
= cfun
->machine
->frame
.reg_offset
[regno2
];
5531 if ((offset
& ~8) == (offset2
& ~8))
5532 bitmap_set_bit (components
, regno2
);
5539 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5540 Nothing to do for aarch64. */
5543 aarch64_disqualify_components (sbitmap
, edge
, sbitmap
, bool)
5547 /* Return the next set bit in BMP from START onwards. Return the total number
5548 of bits in BMP if no set bit is found at or after START. */
5551 aarch64_get_next_set_bit (sbitmap bmp
, unsigned int start
)
5553 unsigned int nbits
= SBITMAP_SIZE (bmp
);
5557 gcc_assert (start
< nbits
);
5558 for (unsigned int i
= start
; i
< nbits
; i
++)
5559 if (bitmap_bit_p (bmp
, i
))
5565 /* Do the work for aarch64_emit_prologue_components and
5566 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5567 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5568 for these components or the epilogue sequence. That is, it determines
5569 whether we should emit stores or loads and what kind of CFA notes to attach
5570 to the insns. Otherwise the logic for the two sequences is very
5574 aarch64_process_components (sbitmap components
, bool prologue_p
)
5576 rtx ptr_reg
= gen_rtx_REG (Pmode
, frame_pointer_needed
5577 ? HARD_FRAME_POINTER_REGNUM
5578 : STACK_POINTER_REGNUM
);
5580 unsigned last_regno
= SBITMAP_SIZE (components
);
5581 unsigned regno
= aarch64_get_next_set_bit (components
, R0_REGNUM
);
5582 rtx_insn
*insn
= NULL
;
5584 while (regno
!= last_regno
)
5586 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
5587 so DFmode for the vector registers is enough. For simd functions
5588 we want to save the low 128 bits. */
5589 machine_mode mode
= aarch64_reg_save_mode (cfun
->decl
, regno
);
5591 rtx reg
= gen_rtx_REG (mode
, regno
);
5592 poly_int64 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
5593 if (!frame_pointer_needed
)
5594 offset
+= cfun
->machine
->frame
.frame_size
5595 - cfun
->machine
->frame
.hard_fp_offset
;
5596 rtx addr
= plus_constant (Pmode
, ptr_reg
, offset
);
5597 rtx mem
= gen_frame_mem (mode
, addr
);
5599 rtx set
= prologue_p
? gen_rtx_SET (mem
, reg
) : gen_rtx_SET (reg
, mem
);
5600 unsigned regno2
= aarch64_get_next_set_bit (components
, regno
+ 1);
5601 /* No more registers to handle after REGNO.
5602 Emit a single save/restore and exit. */
5603 if (regno2
== last_regno
)
5605 insn
= emit_insn (set
);
5606 RTX_FRAME_RELATED_P (insn
) = 1;
5608 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
5610 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
5614 poly_int64 offset2
= cfun
->machine
->frame
.reg_offset
[regno2
];
5615 /* The next register is not of the same class or its offset is not
5616 mergeable with the current one into a pair. */
5617 if (!satisfies_constraint_Ump (mem
)
5618 || GP_REGNUM_P (regno
) != GP_REGNUM_P (regno2
)
5619 || (aarch64_simd_decl_p (cfun
->decl
) && FP_REGNUM_P (regno
))
5620 || maybe_ne ((offset2
- cfun
->machine
->frame
.reg_offset
[regno
]),
5621 GET_MODE_SIZE (mode
)))
5623 insn
= emit_insn (set
);
5624 RTX_FRAME_RELATED_P (insn
) = 1;
5626 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
5628 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
5634 /* REGNO2 can be saved/restored in a pair with REGNO. */
5635 rtx reg2
= gen_rtx_REG (mode
, regno2
);
5636 if (!frame_pointer_needed
)
5637 offset2
+= cfun
->machine
->frame
.frame_size
5638 - cfun
->machine
->frame
.hard_fp_offset
;
5639 rtx addr2
= plus_constant (Pmode
, ptr_reg
, offset2
);
5640 rtx mem2
= gen_frame_mem (mode
, addr2
);
5641 rtx set2
= prologue_p
? gen_rtx_SET (mem2
, reg2
)
5642 : gen_rtx_SET (reg2
, mem2
);
5645 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
, reg2
));
5647 insn
= emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
5649 RTX_FRAME_RELATED_P (insn
) = 1;
5652 add_reg_note (insn
, REG_CFA_OFFSET
, set
);
5653 add_reg_note (insn
, REG_CFA_OFFSET
, set2
);
5657 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
5658 add_reg_note (insn
, REG_CFA_RESTORE
, reg2
);
5661 regno
= aarch64_get_next_set_bit (components
, regno2
+ 1);
5665 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
5668 aarch64_emit_prologue_components (sbitmap components
)
5670 aarch64_process_components (components
, true);
5673 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
5676 aarch64_emit_epilogue_components (sbitmap components
)
5678 aarch64_process_components (components
, false);
5681 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
5684 aarch64_set_handled_components (sbitmap components
)
5686 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
5687 if (bitmap_bit_p (components
, regno
))
5688 cfun
->machine
->reg_is_wrapped_separately
[regno
] = true;
5691 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
5692 determining the probe offset for alloca. */
5694 static HOST_WIDE_INT
5695 aarch64_stack_clash_protection_alloca_probe_range (void)
5697 return STACK_CLASH_CALLER_GUARD
;
5701 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
5702 registers. If POLY_SIZE is not large enough to require a probe this function
5703 will only adjust the stack. When allocating the stack space
5704 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
5705 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
5706 arguments. If we are then we ensure that any allocation larger than the ABI
5707 defined buffer needs a probe so that the invariant of having a 1KB buffer is
5710 We emit barriers after each stack adjustment to prevent optimizations from
5711 breaking the invariant that we never drop the stack more than a page. This
5712 invariant is needed to make it easier to correctly handle asynchronous
5713 events, e.g. if we were to allow the stack to be dropped by more than a page
5714 and then have multiple probes up and we take a signal somewhere in between
5715 then the signal handler doesn't know the state of the stack and can make no
5716 assumptions about which pages have been probed. */
5719 aarch64_allocate_and_probe_stack_space (rtx temp1
, rtx temp2
,
5720 poly_int64 poly_size
,
5721 bool frame_related_p
,
5722 bool final_adjustment_p
)
5724 HOST_WIDE_INT guard_size
5725 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
);
5726 HOST_WIDE_INT guard_used_by_caller
= STACK_CLASH_CALLER_GUARD
;
5727 /* When doing the final adjustment for the outgoing argument size we can't
5728 assume that LR was saved at position 0. So subtract it's offset from the
5729 ABI safe buffer so that we don't accidentally allow an adjustment that
5730 would result in an allocation larger than the ABI buffer without
5732 HOST_WIDE_INT min_probe_threshold
5733 = final_adjustment_p
5734 ? guard_used_by_caller
- cfun
->machine
->frame
.reg_offset
[LR_REGNUM
]
5735 : guard_size
- guard_used_by_caller
;
5737 poly_int64 frame_size
= cfun
->machine
->frame
.frame_size
;
5739 /* We should always have a positive probe threshold. */
5740 gcc_assert (min_probe_threshold
> 0);
5742 if (flag_stack_clash_protection
&& !final_adjustment_p
)
5744 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
5745 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
5747 if (known_eq (frame_size
, 0))
5749 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME
, false);
5751 else if (known_lt (initial_adjust
, guard_size
- guard_used_by_caller
)
5752 && known_lt (final_adjust
, guard_used_by_caller
))
5754 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME
, true);
5758 /* If SIZE is not large enough to require probing, just adjust the stack and
5760 if (known_lt (poly_size
, min_probe_threshold
)
5761 || !flag_stack_clash_protection
)
5763 aarch64_sub_sp (temp1
, temp2
, poly_size
, frame_related_p
);
5768 /* Handle the SVE non-constant case first. */
5769 if (!poly_size
.is_constant (&size
))
5773 fprintf (dump_file
, "Stack clash SVE prologue: ");
5774 print_dec (poly_size
, dump_file
);
5775 fprintf (dump_file
, " bytes, dynamic probing will be required.\n");
5778 /* First calculate the amount of bytes we're actually spilling. */
5779 aarch64_add_offset (Pmode
, temp1
, CONST0_RTX (Pmode
),
5780 poly_size
, temp1
, temp2
, false, true);
5782 rtx_insn
*insn
= get_last_insn ();
5784 if (frame_related_p
)
5786 /* This is done to provide unwinding information for the stack
5787 adjustments we're about to do, however to prevent the optimizers
5788 from removing the R11 move and leaving the CFA note (which would be
5789 very wrong) we tie the old and new stack pointer together.
5790 The tie will expand to nothing but the optimizers will not touch
5792 rtx stack_ptr_copy
= gen_rtx_REG (Pmode
, STACK_CLASH_SVE_CFA_REGNUM
);
5793 emit_move_insn (stack_ptr_copy
, stack_pointer_rtx
);
5794 emit_insn (gen_stack_tie (stack_ptr_copy
, stack_pointer_rtx
));
5796 /* We want the CFA independent of the stack pointer for the
5797 duration of the loop. */
5798 add_reg_note (insn
, REG_CFA_DEF_CFA
, stack_ptr_copy
);
5799 RTX_FRAME_RELATED_P (insn
) = 1;
5802 rtx probe_const
= gen_int_mode (min_probe_threshold
, Pmode
);
5803 rtx guard_const
= gen_int_mode (guard_size
, Pmode
);
5805 insn
= emit_insn (gen_probe_sve_stack_clash (Pmode
, stack_pointer_rtx
,
5806 stack_pointer_rtx
, temp1
,
5807 probe_const
, guard_const
));
5809 /* Now reset the CFA register if needed. */
5810 if (frame_related_p
)
5812 add_reg_note (insn
, REG_CFA_DEF_CFA
,
5813 gen_rtx_PLUS (Pmode
, stack_pointer_rtx
,
5814 gen_int_mode (poly_size
, Pmode
)));
5815 RTX_FRAME_RELATED_P (insn
) = 1;
5823 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
5824 " bytes, probing will be required.\n", size
);
5826 /* Round size to the nearest multiple of guard_size, and calculate the
5827 residual as the difference between the original size and the rounded
5829 HOST_WIDE_INT rounded_size
= ROUND_DOWN (size
, guard_size
);
5830 HOST_WIDE_INT residual
= size
- rounded_size
;
5832 /* We can handle a small number of allocations/probes inline. Otherwise
5834 if (rounded_size
<= STACK_CLASH_MAX_UNROLL_PAGES
* guard_size
)
5836 for (HOST_WIDE_INT i
= 0; i
< rounded_size
; i
+= guard_size
)
5838 aarch64_sub_sp (NULL
, temp2
, guard_size
, true);
5839 emit_stack_probe (plus_constant (Pmode
, stack_pointer_rtx
,
5840 guard_used_by_caller
));
5841 emit_insn (gen_blockage ());
5843 dump_stack_clash_frame_info (PROBE_INLINE
, size
!= rounded_size
);
5847 /* Compute the ending address. */
5848 aarch64_add_offset (Pmode
, temp1
, stack_pointer_rtx
, -rounded_size
,
5849 temp1
, NULL
, false, true);
5850 rtx_insn
*insn
= get_last_insn ();
5852 /* For the initial allocation, we don't have a frame pointer
5853 set up, so we always need CFI notes. If we're doing the
5854 final allocation, then we may have a frame pointer, in which
5855 case it is the CFA, otherwise we need CFI notes.
5857 We can determine which allocation we are doing by looking at
5858 the value of FRAME_RELATED_P since the final allocations are not
5860 if (frame_related_p
)
5862 /* We want the CFA independent of the stack pointer for the
5863 duration of the loop. */
5864 add_reg_note (insn
, REG_CFA_DEF_CFA
,
5865 plus_constant (Pmode
, temp1
, rounded_size
));
5866 RTX_FRAME_RELATED_P (insn
) = 1;
5869 /* This allocates and probes the stack. Note that this re-uses some of
5870 the existing Ada stack protection code. However we are guaranteed not
5871 to enter the non loop or residual branches of that code.
5873 The non-loop part won't be entered because if our allocation amount
5874 doesn't require a loop, the case above would handle it.
5876 The residual amount won't be entered because TEMP1 is a mutliple of
5877 the allocation size. The residual will always be 0. As such, the only
5878 part we are actually using from that code is the loop setup. The
5879 actual probing is done in aarch64_output_probe_stack_range. */
5880 insn
= emit_insn (gen_probe_stack_range (stack_pointer_rtx
,
5881 stack_pointer_rtx
, temp1
));
5883 /* Now reset the CFA register if needed. */
5884 if (frame_related_p
)
5886 add_reg_note (insn
, REG_CFA_DEF_CFA
,
5887 plus_constant (Pmode
, stack_pointer_rtx
, rounded_size
));
5888 RTX_FRAME_RELATED_P (insn
) = 1;
5891 emit_insn (gen_blockage ());
5892 dump_stack_clash_frame_info (PROBE_LOOP
, size
!= rounded_size
);
5895 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
5896 be probed. This maintains the requirement that each page is probed at
5897 least once. For initial probing we probe only if the allocation is
5898 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
5899 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
5900 GUARD_SIZE. This works that for any allocation that is large enough to
5901 trigger a probe here, we'll have at least one, and if they're not large
5902 enough for this code to emit anything for them, The page would have been
5903 probed by the saving of FP/LR either by this function or any callees. If
5904 we don't have any callees then we won't have more stack adjustments and so
5908 HOST_WIDE_INT residual_probe_offset
= guard_used_by_caller
;
5909 /* If we're doing final adjustments, and we've done any full page
5910 allocations then any residual needs to be probed. */
5911 if (final_adjustment_p
&& rounded_size
!= 0)
5912 min_probe_threshold
= 0;
5913 /* If doing a small final adjustment, we always probe at offset 0.
5914 This is done to avoid issues when LR is not at position 0 or when
5915 the final adjustment is smaller than the probing offset. */
5916 else if (final_adjustment_p
&& rounded_size
== 0)
5917 residual_probe_offset
= 0;
5919 aarch64_sub_sp (temp1
, temp2
, residual
, frame_related_p
);
5920 if (residual
>= min_probe_threshold
)
5924 "Stack clash AArch64 prologue residuals: "
5925 HOST_WIDE_INT_PRINT_DEC
" bytes, probing will be required."
5928 emit_stack_probe (plus_constant (Pmode
, stack_pointer_rtx
,
5929 residual_probe_offset
));
5930 emit_insn (gen_blockage ());
5935 /* Return 1 if the register is used by the epilogue. We need to say the
5936 return register is used, but only after epilogue generation is complete.
5937 Note that in the case of sibcalls, the values "used by the epilogue" are
5938 considered live at the start of the called function.
5940 For SIMD functions we need to return 1 for FP registers that are saved and
5941 restored by a function but are not zero in call_used_regs. If we do not do
5942 this optimizations may remove the restore of the register. */
5945 aarch64_epilogue_uses (int regno
)
5947 if (epilogue_completed
)
5949 if (regno
== LR_REGNUM
)
5951 if (aarch64_simd_decl_p (cfun
->decl
) && FP_SIMD_SAVED_REGNUM_P (regno
))
5957 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
5958 is saved at BASE + OFFSET. */
5961 aarch64_add_cfa_expression (rtx_insn
*insn
, unsigned int reg
,
5962 rtx base
, poly_int64 offset
)
5964 rtx mem
= gen_frame_mem (DImode
, plus_constant (Pmode
, base
, offset
));
5965 add_reg_note (insn
, REG_CFA_EXPRESSION
,
5966 gen_rtx_SET (mem
, regno_reg_rtx
[reg
]));
/* AArch64 stack frames generated by this compiler look like:

	+-------------------------------+
	|                               |
	|  incoming stack arguments     |
	|                               |
	+-------------------------------+
	|                               | <-- incoming stack pointer (aligned)
	|  callee-allocated save area   |
	|  for register varargs         |
	|                               |
	+-------------------------------+
	|  local variables              | <-- frame_pointer_rtx
	|                               |
	+-------------------------------+
	|  padding                      | \
	+-------------------------------+  |
	|  callee-saved registers       |  | frame.saved_regs_size
	+-------------------------------+  |
	|  LR'                          |  |
	+-------------------------------+  |
	|  FP'                          | / <- hard_frame_pointer_rtx (aligned)
	+-------------------------------+
	|  dynamic allocation           |
	+-------------------------------+
	|  padding                      |
	+-------------------------------+
	|  outgoing stack arguments     | <-- arg_pointer
	|                               |
	+-------------------------------+
	|                               | <-- stack_pointer_rtx (aligned)

   Dynamic stack allocations via alloca() decrease stack_pointer_rtx
   but leave frame_pointer_rtx and hard_frame_pointer_rtx unchanged.

   By default for stack-clash we assume the guard is at least 64KB, but this
   value is configurable to either 4KB or 64KB.  We also force the guard size to
   be the same as the probing interval and both values are kept in sync.

   With those assumptions the callee can allocate up to 63KB (or 3KB depending
   on the guard size) of stack space without probing.

   When probing is needed, we emit a probe at the start of the prologue
   and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.

   We have to track how much space has been allocated and the only stores
   to the stack we track as implicit probes are the FP/LR stores.

   For outgoing arguments we probe if the size is larger than 1KB, such that
   the ABI specified buffer is maintained for the next callee.

   The following registers are reserved during frame layout and should not be
   used for any other purpose:

   - r11: Used by stack clash protection when SVE is enabled.
   - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
   - r14 and r15: Used for speculation tracking.
   - r16(IP0), r17(IP1): Used by indirect tailcalls.
   - r30(LR), r29(FP): Used by standard frame layout.

   These registers must be avoided in frame layout related code unless the
   explicit intention is to interact with one of the features listed above.  */
6033 /* Generate the prologue instructions for entry into a function.
6034 Establish the stack frame by decreasing the stack pointer with a
6035 properly calculated size and, if necessary, create a frame record
6036 filled with the values of LR and previous frame pointer. The
6037 current FP is also set up if it is in use. */
6040 aarch64_expand_prologue (void)
6042 poly_int64 frame_size
= cfun
->machine
->frame
.frame_size
;
6043 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
6044 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
6045 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
6046 poly_int64 callee_offset
= cfun
->machine
->frame
.callee_offset
;
6047 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
6048 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
6049 bool emit_frame_chain
= cfun
->machine
->frame
.emit_frame_chain
;
6052 /* Sign return address for functions. */
6053 if (aarch64_return_address_signing_enabled ())
6055 switch (aarch64_ra_sign_key
)
6058 insn
= emit_insn (gen_paciasp ());
6061 insn
= emit_insn (gen_pacibsp ());
6066 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
6067 RTX_FRAME_RELATED_P (insn
) = 1;
6070 if (flag_stack_usage_info
)
6071 current_function_static_stack_size
= constant_lower_bound (frame_size
);
6073 if (flag_stack_check
== STATIC_BUILTIN_STACK_CHECK
)
6075 if (crtl
->is_leaf
&& !cfun
->calls_alloca
)
6077 if (maybe_gt (frame_size
, PROBE_INTERVAL
)
6078 && maybe_gt (frame_size
, get_stack_check_protect ()))
6079 aarch64_emit_probe_stack_range (get_stack_check_protect (),
6081 - get_stack_check_protect ()));
6083 else if (maybe_gt (frame_size
, 0))
6084 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size
);
6087 rtx tmp0_rtx
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
6088 rtx tmp1_rtx
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
6090 /* In theory we should never have both an initial adjustment
6091 and a callee save adjustment. Verify that is the case since the
6092 code below does not handle it for -fstack-clash-protection. */
6093 gcc_assert (known_eq (initial_adjust
, 0) || callee_adjust
== 0);
6095 /* Will only probe if the initial adjustment is larger than the guard
6096 less the amount of the guard reserved for use by the caller's
6098 aarch64_allocate_and_probe_stack_space (tmp0_rtx
, tmp1_rtx
, initial_adjust
,
6101 if (callee_adjust
!= 0)
6102 aarch64_push_regs (reg1
, reg2
, callee_adjust
);
6104 if (emit_frame_chain
)
6106 poly_int64 reg_offset
= callee_adjust
;
6107 if (callee_adjust
== 0)
6111 reg_offset
= callee_offset
;
6112 aarch64_save_callee_saves (DImode
, reg_offset
, reg1
, reg2
, false);
6114 aarch64_add_offset (Pmode
, hard_frame_pointer_rtx
,
6115 stack_pointer_rtx
, callee_offset
,
6116 tmp1_rtx
, tmp0_rtx
, frame_pointer_needed
);
6117 if (frame_pointer_needed
&& !frame_size
.is_constant ())
6119 /* Variable-sized frames need to describe the save slot
6120 address using DW_CFA_expression rather than DW_CFA_offset.
6121 This means that, without taking further action, the
6122 locations of the registers that we've already saved would
6123 remain based on the stack pointer even after we redefine
6124 the CFA based on the frame pointer. We therefore need new
6125 DW_CFA_expressions to re-express the save slots with addresses
6126 based on the frame pointer. */
6127 rtx_insn
*insn
= get_last_insn ();
6128 gcc_assert (RTX_FRAME_RELATED_P (insn
));
6130 /* Add an explicit CFA definition if this was previously
6132 if (!find_reg_note (insn
, REG_CFA_ADJUST_CFA
, NULL_RTX
))
6134 rtx src
= plus_constant (Pmode
, stack_pointer_rtx
,
6136 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
6137 gen_rtx_SET (hard_frame_pointer_rtx
, src
));
6140 /* Change the save slot expressions for the registers that
6141 we've already saved. */
6142 reg_offset
-= callee_offset
;
6143 aarch64_add_cfa_expression (insn
, reg2
, hard_frame_pointer_rtx
,
6144 reg_offset
+ UNITS_PER_WORD
);
6145 aarch64_add_cfa_expression (insn
, reg1
, hard_frame_pointer_rtx
,
6148 emit_insn (gen_stack_tie (stack_pointer_rtx
, hard_frame_pointer_rtx
));
6151 aarch64_save_callee_saves (DImode
, callee_offset
, R0_REGNUM
, R30_REGNUM
,
6152 callee_adjust
!= 0 || emit_frame_chain
);
6153 if (aarch64_simd_decl_p (cfun
->decl
))
6154 aarch64_save_callee_saves (TFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
6155 callee_adjust
!= 0 || emit_frame_chain
);
6157 aarch64_save_callee_saves (DFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
6158 callee_adjust
!= 0 || emit_frame_chain
);
6160 /* We may need to probe the final adjustment if it is larger than the guard
6161 that is assumed by the called. */
6162 aarch64_allocate_and_probe_stack_space (tmp1_rtx
, tmp0_rtx
, final_adjust
,
6163 !frame_pointer_needed
, true);
6166 /* Return TRUE if we can use a simple_return insn.
6168 This function checks whether the callee saved stack is empty, which
6169 means no restore actions are need. The pro_and_epilogue will use
6170 this to check whether shrink-wrapping opt is feasible. */
6173 aarch64_use_return_insn_p (void)
6175 if (!reload_completed
)
6181 return known_eq (cfun
->machine
->frame
.frame_size
, 0);
6184 /* Return false for non-leaf SIMD functions in order to avoid
6185 shrink-wrapping them. Doing this will lose the necessary
6186 save/restore of FP registers. */
6189 aarch64_use_simple_return_insn_p (void)
6191 if (aarch64_simd_decl_p (cfun
->decl
) && !crtl
->is_leaf
)
6197 /* Generate the epilogue instructions for returning from a function.
6198 This is almost exactly the reverse of the prolog sequence, except
6199 that we need to insert barriers to avoid scheduling loads that read
6200 from a deallocated stack, and we optimize the unwind records by
6201 emitting them all together if possible. */
6203 aarch64_expand_epilogue (bool for_sibcall
)
6205 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
6206 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
6207 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
6208 poly_int64 callee_offset
= cfun
->machine
->frame
.callee_offset
;
6209 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
6210 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
6213 /* A stack clash protection prologue may not have left EP0_REGNUM or
6214 EP1_REGNUM in a usable state. The same is true for allocations
6215 with an SVE component, since we then need both temporary registers
6216 for each allocation. For stack clash we are in a usable state if
6217 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
6218 HOST_WIDE_INT guard_size
6219 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
);
6220 HOST_WIDE_INT guard_used_by_caller
= STACK_CLASH_CALLER_GUARD
;
6222 /* We can re-use the registers when the allocation amount is smaller than
6223 guard_size - guard_used_by_caller because we won't be doing any probes
6224 then. In such situations the register should remain live with the correct
6226 bool can_inherit_p
= (initial_adjust
.is_constant ()
6227 && final_adjust
.is_constant ())
6228 && (!flag_stack_clash_protection
6229 || known_lt (initial_adjust
,
6230 guard_size
- guard_used_by_caller
));
6232 /* We need to add memory barrier to prevent read from deallocated stack. */
6234 = maybe_ne (get_frame_size ()
6235 + cfun
->machine
->frame
.saved_varargs_size
, 0);
6237 /* Emit a barrier to prevent loads from a deallocated stack. */
6238 if (maybe_gt (final_adjust
, crtl
->outgoing_args_size
)
6239 || cfun
->calls_alloca
6240 || crtl
->calls_eh_return
)
6242 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
6243 need_barrier_p
= false;
6246 /* Restore the stack pointer from the frame pointer if it may not
6247 be the same as the stack pointer. */
6248 rtx tmp0_rtx
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
6249 rtx tmp1_rtx
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
6250 if (frame_pointer_needed
6251 && (maybe_ne (final_adjust
, 0) || cfun
->calls_alloca
))
6252 /* If writeback is used when restoring callee-saves, the CFA
6253 is restored on the instruction doing the writeback. */
6254 aarch64_add_offset (Pmode
, stack_pointer_rtx
,
6255 hard_frame_pointer_rtx
, -callee_offset
,
6256 tmp1_rtx
, tmp0_rtx
, callee_adjust
== 0);
6258 /* The case where we need to re-use the register here is very rare, so
6259 avoid the complicated condition and just always emit a move if the
6260 immediate doesn't fit. */
6261 aarch64_add_sp (tmp1_rtx
, tmp0_rtx
, final_adjust
, true);
6263 aarch64_restore_callee_saves (DImode
, callee_offset
, R0_REGNUM
, R30_REGNUM
,
6264 callee_adjust
!= 0, &cfi_ops
);
6265 if (aarch64_simd_decl_p (cfun
->decl
))
6266 aarch64_restore_callee_saves (TFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
6267 callee_adjust
!= 0, &cfi_ops
);
6269 aarch64_restore_callee_saves (DFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
6270 callee_adjust
!= 0, &cfi_ops
);
6273 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
6275 if (callee_adjust
!= 0)
6276 aarch64_pop_regs (reg1
, reg2
, callee_adjust
, &cfi_ops
);
6278 if (callee_adjust
!= 0 || maybe_gt (initial_adjust
, 65536))
6280 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
6281 insn
= get_last_insn ();
6282 rtx new_cfa
= plus_constant (Pmode
, stack_pointer_rtx
, initial_adjust
);
6283 REG_NOTES (insn
) = alloc_reg_note (REG_CFA_DEF_CFA
, new_cfa
, cfi_ops
);
6284 RTX_FRAME_RELATED_P (insn
) = 1;
6288 /* Liveness of EP0_REGNUM can not be trusted across function calls either, so
6289 add restriction on emit_move optimization to leaf functions. */
6290 aarch64_add_sp (tmp0_rtx
, tmp1_rtx
, initial_adjust
,
6291 (!can_inherit_p
|| !crtl
->is_leaf
6292 || df_regs_ever_live_p (EP0_REGNUM
)));
6296 /* Emit delayed restores and reset the CFA to be SP. */
6297 insn
= get_last_insn ();
6298 cfi_ops
= alloc_reg_note (REG_CFA_DEF_CFA
, stack_pointer_rtx
, cfi_ops
);
6299 REG_NOTES (insn
) = cfi_ops
;
6300 RTX_FRAME_RELATED_P (insn
) = 1;
6303 /* We prefer to emit the combined return/authenticate instruction RETAA,
6304 however there are three cases in which we must instead emit an explicit
6305 authentication instruction.
6307 1) Sibcalls don't return in a normal way, so if we're about to call one
6308 we must authenticate.
6310 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
6311 generating code for !TARGET_ARMV8_3 we can't use it and must
6312 explicitly authenticate.
6314 3) On an eh_return path we make extra stack adjustments to update the
6315 canonical frame address to be the exception handler's CFA. We want
6316 to authenticate using the CFA of the function which calls eh_return.
6318 if (aarch64_return_address_signing_enabled ()
6319 && (for_sibcall
|| !TARGET_ARMV8_3
|| crtl
->calls_eh_return
))
6321 switch (aarch64_ra_sign_key
)
6324 insn
= emit_insn (gen_autiasp ());
6327 insn
= emit_insn (gen_autibsp ());
6332 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
6333 RTX_FRAME_RELATED_P (insn
) = 1;
6336 /* Stack adjustment for exception handler. */
6337 if (crtl
->calls_eh_return
&& !for_sibcall
)
6339 /* We need to unwind the stack by the offset computed by
6340 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
6341 to be SP; letting the CFA move during this adjustment
6342 is just as correct as retaining the CFA from the body
6343 of the function. Therefore, do nothing special. */
6344 emit_insn (gen_add2_insn (stack_pointer_rtx
, EH_RETURN_STACKADJ_RTX
));
6347 emit_use (gen_rtx_REG (DImode
, LR_REGNUM
));
6349 emit_jump_insn (ret_rtx
);
6352 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
6353 normally or return to a previous frame after unwinding.
6355 An EH return uses a single shared return sequence. The epilogue is
6356 exactly like a normal epilogue except that it has an extra input
6357 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
6358 that must be applied after the frame has been destroyed. An extra label
6359 is inserted before the epilogue which initializes this register to zero,
6360 and this is the entry point for a normal return.
6362 An actual EH return updates the return address, initializes the stack
6363 adjustment and jumps directly into the epilogue (bypassing the zeroing
6364 of the adjustment). Since the return address is typically saved on the
6365 stack when a function makes a call, the saved LR must be updated outside
6368 This poses problems as the store is generated well before the epilogue,
6369 so the offset of LR is not known yet. Also optimizations will remove the
6370 store as it appears dead, even after the epilogue is generated (as the
6371 base or offset for loading LR is different in many cases).
6373 To avoid these problems this implementation forces the frame pointer
6374 in eh_return functions so that the location of LR is fixed and known early.
6375 It also marks the store volatile, so no optimization is permitted to
6376 remove the store. */
6378 aarch64_eh_return_handler_rtx (void)
6380 rtx tmp
= gen_frame_mem (Pmode
,
6381 plus_constant (Pmode
, hard_frame_pointer_rtx
, UNITS_PER_WORD
));
6383 /* Mark the store volatile, so no optimization is permitted to remove it. */
6384 MEM_VOLATILE_P (tmp
) = true;
6388 /* Output code to add DELTA to the first argument, and then jump
6389 to FUNCTION. Used for C++ multiple inheritance. */
6391 aarch64_output_mi_thunk (FILE *file
, tree thunk ATTRIBUTE_UNUSED
,
6392 HOST_WIDE_INT delta
,
6393 HOST_WIDE_INT vcall_offset
,
6396 /* The this pointer is always in x0. Note that this differs from
6397 Arm where the this pointer maybe bumped to r1 if r0 is required
6398 to return a pointer to an aggregate. On AArch64 a result value
6399 pointer will be in x8. */
6400 int this_regno
= R0_REGNUM
;
6401 rtx this_rtx
, temp0
, temp1
, addr
, funexp
;
6403 const char *fnname
= IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk
));
6405 if (aarch64_bti_enabled ())
6406 emit_insn (gen_bti_c());
6408 reload_completed
= 1;
6409 emit_note (NOTE_INSN_PROLOGUE_END
);
6411 this_rtx
= gen_rtx_REG (Pmode
, this_regno
);
6412 temp0
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
6413 temp1
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
6415 if (vcall_offset
== 0)
6416 aarch64_add_offset (Pmode
, this_rtx
, this_rtx
, delta
, temp1
, temp0
, false);
6419 gcc_assert ((vcall_offset
& (POINTER_BYTES
- 1)) == 0);
6424 if (delta
>= -256 && delta
< 256)
6425 addr
= gen_rtx_PRE_MODIFY (Pmode
, this_rtx
,
6426 plus_constant (Pmode
, this_rtx
, delta
));
6428 aarch64_add_offset (Pmode
, this_rtx
, this_rtx
, delta
,
6429 temp1
, temp0
, false);
6432 if (Pmode
== ptr_mode
)
6433 aarch64_emit_move (temp0
, gen_rtx_MEM (ptr_mode
, addr
));
6435 aarch64_emit_move (temp0
,
6436 gen_rtx_ZERO_EXTEND (Pmode
,
6437 gen_rtx_MEM (ptr_mode
, addr
)));
6439 if (vcall_offset
>= -256 && vcall_offset
< 4096 * POINTER_BYTES
)
6440 addr
= plus_constant (Pmode
, temp0
, vcall_offset
);
6443 aarch64_internal_mov_immediate (temp1
, GEN_INT (vcall_offset
), true,
6445 addr
= gen_rtx_PLUS (Pmode
, temp0
, temp1
);
6448 if (Pmode
== ptr_mode
)
6449 aarch64_emit_move (temp1
, gen_rtx_MEM (ptr_mode
,addr
));
6451 aarch64_emit_move (temp1
,
6452 gen_rtx_SIGN_EXTEND (Pmode
,
6453 gen_rtx_MEM (ptr_mode
, addr
)));
6455 emit_insn (gen_add2_insn (this_rtx
, temp1
));
6458 /* Generate a tail call to the target function. */
6459 if (!TREE_USED (function
))
6461 assemble_external (function
);
6462 TREE_USED (function
) = 1;
6464 funexp
= XEXP (DECL_RTL (function
), 0);
6465 funexp
= gen_rtx_MEM (FUNCTION_MODE
, funexp
);
6466 insn
= emit_call_insn (gen_sibcall (funexp
, const0_rtx
, NULL_RTX
));
6467 SIBLING_CALL_P (insn
) = 1;
6469 insn
= get_insns ();
6470 shorten_branches (insn
);
6472 assemble_start_function (thunk
, fnname
);
6473 final_start_function (insn
, file
, 1);
6474 final (insn
, file
, 1);
6475 final_end_function ();
6476 assemble_end_function (thunk
, fnname
);
6478 /* Stop pretending to be a post-reload pass. */
6479 reload_completed
= 0;
6483 aarch64_tls_referenced_p (rtx x
)
6485 if (!TARGET_HAVE_TLS
)
6487 subrtx_iterator::array_type array
;
6488 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
6490 const_rtx x
= *iter
;
6491 if (GET_CODE (x
) == SYMBOL_REF
&& SYMBOL_REF_TLS_MODEL (x
) != 0)
6493 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6494 TLS offsets, not real symbol references. */
6495 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
6496 iter
.skip_subrtxes ();
6502 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6503 a left shift of 0 or 12 bits. */
6505 aarch64_uimm12_shift (HOST_WIDE_INT val
)
6507 return ((val
& (((HOST_WIDE_INT
) 0xfff) << 0)) == val
6508 || (val
& (((HOST_WIDE_INT
) 0xfff) << 12)) == val
6512 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
6513 that can be created with a left shift of 0 or 12. */
6514 static HOST_WIDE_INT
6515 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val
)
6517 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6518 handle correctly. */
6519 gcc_assert ((val
& 0xffffff) == val
);
6521 if (((val
& 0xfff) << 0) == val
)
6524 return val
& (0xfff << 12);
6527 /* Return true if val is an immediate that can be loaded into a
6528 register by a MOVZ instruction. */
6530 aarch64_movw_imm (HOST_WIDE_INT val
, scalar_int_mode mode
)
6532 if (GET_MODE_SIZE (mode
) > 4)
6534 if ((val
& (((HOST_WIDE_INT
) 0xffff) << 32)) == val
6535 || (val
& (((HOST_WIDE_INT
) 0xffff) << 48)) == val
)
6540 /* Ignore sign extension. */
6541 val
&= (HOST_WIDE_INT
) 0xffffffff;
6543 return ((val
& (((HOST_WIDE_INT
) 0xffff) << 0)) == val
6544 || (val
& (((HOST_WIDE_INT
) 0xffff) << 16)) == val
);
6547 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6548 64-bit (DImode) integer. */
6550 static unsigned HOST_WIDE_INT
6551 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val
, machine_mode mode
)
6553 unsigned int size
= GET_MODE_UNIT_PRECISION (mode
);
6556 val
&= (HOST_WIDE_INT_1U
<< size
) - 1;
6563 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6565 static const unsigned HOST_WIDE_INT bitmask_imm_mul
[] =
6567 0x0000000100000001ull
,
6568 0x0001000100010001ull
,
6569 0x0101010101010101ull
,
6570 0x1111111111111111ull
,
6571 0x5555555555555555ull
,
6575 /* Return true if val is a valid bitmask immediate. */
6578 aarch64_bitmask_imm (HOST_WIDE_INT val_in
, machine_mode mode
)
6580 unsigned HOST_WIDE_INT val
, tmp
, mask
, first_one
, next_one
;
6583 /* Check for a single sequence of one bits and return quickly if so.
6584 The special cases of all ones and all zeroes returns false. */
6585 val
= aarch64_replicate_bitmask_imm (val_in
, mode
);
6586 tmp
= val
+ (val
& -val
);
6588 if (tmp
== (tmp
& -tmp
))
6589 return (val
+ 1) > 1;
6591 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
6593 val
= (val
<< 32) | (val
& 0xffffffff);
6595 /* Invert if the immediate doesn't start with a zero bit - this means we
6596 only need to search for sequences of one bits. */
6600 /* Find the first set bit and set tmp to val with the first sequence of one
6601 bits removed. Return success if there is a single sequence of ones. */
6602 first_one
= val
& -val
;
6603 tmp
= val
& (val
+ first_one
);
6608 /* Find the next set bit and compute the difference in bit position. */
6609 next_one
= tmp
& -tmp
;
6610 bits
= clz_hwi (first_one
) - clz_hwi (next_one
);
6613 /* Check the bit position difference is a power of 2, and that the first
6614 sequence of one bits fits within 'bits' bits. */
6615 if ((mask
>> bits
) != 0 || bits
!= (bits
& -bits
))
6618 /* Check the sequence of one bits is repeated 64/bits times. */
6619 return val
== mask
* bitmask_imm_mul
[__builtin_clz (bits
) - 26];
6622 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
6623 Assumed precondition: VAL_IN Is not zero. */
6625 unsigned HOST_WIDE_INT
6626 aarch64_and_split_imm1 (HOST_WIDE_INT val_in
)
6628 int lowest_bit_set
= ctz_hwi (val_in
);
6629 int highest_bit_set
= floor_log2 (val_in
);
6630 gcc_assert (val_in
!= 0);
6632 return ((HOST_WIDE_INT_UC (2) << highest_bit_set
) -
6633 (HOST_WIDE_INT_1U
<< lowest_bit_set
));
6636 /* Create constant where bits outside of lowest bit set to highest bit set
6639 unsigned HOST_WIDE_INT
6640 aarch64_and_split_imm2 (HOST_WIDE_INT val_in
)
6642 return val_in
| ~aarch64_and_split_imm1 (val_in
);
6645 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
6648 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in
, machine_mode mode
)
6650 scalar_int_mode int_mode
;
6651 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
6654 if (aarch64_bitmask_imm (val_in
, int_mode
))
6657 if (aarch64_move_imm (val_in
, int_mode
))
6660 unsigned HOST_WIDE_INT imm2
= aarch64_and_split_imm2 (val_in
);
6662 return aarch64_bitmask_imm (imm2
, int_mode
);
6665 /* Return true if val is an immediate that can be loaded into a
6666 register in a single instruction. */
6668 aarch64_move_imm (HOST_WIDE_INT val
, machine_mode mode
)
6670 scalar_int_mode int_mode
;
6671 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
6674 if (aarch64_movw_imm (val
, int_mode
) || aarch64_movw_imm (~val
, int_mode
))
6676 return aarch64_bitmask_imm (val
, int_mode
);
6680 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED
, rtx x
)
6684 if (GET_CODE (x
) == HIGH
)
6687 /* There's no way to calculate VL-based values using relocations. */
6688 subrtx_iterator::array_type array
;
6689 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
6690 if (GET_CODE (*iter
) == CONST_POLY_INT
)
6693 split_const (x
, &base
, &offset
);
6694 if (GET_CODE (base
) == SYMBOL_REF
|| GET_CODE (base
) == LABEL_REF
)
6696 if (aarch64_classify_symbol (base
, INTVAL (offset
))
6697 != SYMBOL_FORCE_TO_MEM
)
6700 /* Avoid generating a 64-bit relocation in ILP32; leave
6701 to aarch64_expand_mov_immediate to handle it properly. */
6702 return mode
!= ptr_mode
;
6705 return aarch64_tls_referenced_p (x
);
6708 /* Implement TARGET_CASE_VALUES_THRESHOLD.
6709 The expansion for a table switch is quite expensive due to the number
6710 of instructions, the table lookup and hard to predict indirect jump.
6711 When optimizing for speed, and -O3 enabled, use the per-core tuning if
6712 set, otherwise use tables for > 16 cases as a tradeoff between size and
6713 performance. When optimizing for size, use the default setting. */
6716 aarch64_case_values_threshold (void)
6718 /* Use the specified limit for the number of cases before using jump
6719 tables at higher optimization levels. */
6721 && selected_cpu
->tune
->max_case_values
!= 0)
6722 return selected_cpu
->tune
->max_case_values
;
6724 return optimize_size
? default_case_values_threshold () : 17;
6727 /* Return true if register REGNO is a valid index register.
6728 STRICT_P is true if REG_OK_STRICT is in effect. */
6731 aarch64_regno_ok_for_index_p (int regno
, bool strict_p
)
6733 if (!HARD_REGISTER_NUM_P (regno
))
6741 regno
= reg_renumber
[regno
];
6743 return GP_REGNUM_P (regno
);
6746 /* Return true if register REGNO is a valid base register for mode MODE.
6747 STRICT_P is true if REG_OK_STRICT is in effect. */
6750 aarch64_regno_ok_for_base_p (int regno
, bool strict_p
)
6752 if (!HARD_REGISTER_NUM_P (regno
))
6760 regno
= reg_renumber
[regno
];
6763 /* The fake registers will be eliminated to either the stack or
6764 hard frame pointer, both of which are usually valid base registers.
6765 Reload deals with the cases where the eliminated form isn't valid. */
6766 return (GP_REGNUM_P (regno
)
6767 || regno
== SP_REGNUM
6768 || regno
== FRAME_POINTER_REGNUM
6769 || regno
== ARG_POINTER_REGNUM
);
6772 /* Return true if X is a valid base register for mode MODE.
6773 STRICT_P is true if REG_OK_STRICT is in effect. */
6776 aarch64_base_register_rtx_p (rtx x
, bool strict_p
)
6779 && GET_CODE (x
) == SUBREG
6780 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (x
))])
6783 return (REG_P (x
) && aarch64_regno_ok_for_base_p (REGNO (x
), strict_p
));
6786 /* Return true if address offset is a valid index. If it is, fill in INFO
6787 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
6790 aarch64_classify_index (struct aarch64_address_info
*info
, rtx x
,
6791 machine_mode mode
, bool strict_p
)
6793 enum aarch64_address_type type
;
6798 if ((REG_P (x
) || GET_CODE (x
) == SUBREG
)
6799 && GET_MODE (x
) == Pmode
)
6801 type
= ADDRESS_REG_REG
;
6805 /* (sign_extend:DI (reg:SI)) */
6806 else if ((GET_CODE (x
) == SIGN_EXTEND
6807 || GET_CODE (x
) == ZERO_EXTEND
)
6808 && GET_MODE (x
) == DImode
6809 && GET_MODE (XEXP (x
, 0)) == SImode
)
6811 type
= (GET_CODE (x
) == SIGN_EXTEND
)
6812 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
6813 index
= XEXP (x
, 0);
6816 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
6817 else if (GET_CODE (x
) == MULT
6818 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
6819 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
6820 && GET_MODE (XEXP (x
, 0)) == DImode
6821 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
6822 && CONST_INT_P (XEXP (x
, 1)))
6824 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
6825 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
6826 index
= XEXP (XEXP (x
, 0), 0);
6827 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
6829 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
6830 else if (GET_CODE (x
) == ASHIFT
6831 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
6832 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
6833 && GET_MODE (XEXP (x
, 0)) == DImode
6834 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
6835 && CONST_INT_P (XEXP (x
, 1)))
6837 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
6838 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
6839 index
= XEXP (XEXP (x
, 0), 0);
6840 shift
= INTVAL (XEXP (x
, 1));
6842 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
6843 else if ((GET_CODE (x
) == SIGN_EXTRACT
6844 || GET_CODE (x
) == ZERO_EXTRACT
)
6845 && GET_MODE (x
) == DImode
6846 && GET_CODE (XEXP (x
, 0)) == MULT
6847 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
6848 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
6850 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
6851 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
6852 index
= XEXP (XEXP (x
, 0), 0);
6853 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
6854 if (INTVAL (XEXP (x
, 1)) != 32 + shift
6855 || INTVAL (XEXP (x
, 2)) != 0)
6858 /* (and:DI (mult:DI (reg:DI) (const_int scale))
6859 (const_int 0xffffffff<<shift)) */
6860 else if (GET_CODE (x
) == AND
6861 && GET_MODE (x
) == DImode
6862 && GET_CODE (XEXP (x
, 0)) == MULT
6863 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
6864 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
6865 && CONST_INT_P (XEXP (x
, 1)))
6867 type
= ADDRESS_REG_UXTW
;
6868 index
= XEXP (XEXP (x
, 0), 0);
6869 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
6870 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
6873 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
6874 else if ((GET_CODE (x
) == SIGN_EXTRACT
6875 || GET_CODE (x
) == ZERO_EXTRACT
)
6876 && GET_MODE (x
) == DImode
6877 && GET_CODE (XEXP (x
, 0)) == ASHIFT
6878 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
6879 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
6881 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
6882 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
6883 index
= XEXP (XEXP (x
, 0), 0);
6884 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
6885 if (INTVAL (XEXP (x
, 1)) != 32 + shift
6886 || INTVAL (XEXP (x
, 2)) != 0)
6889 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
6890 (const_int 0xffffffff<<shift)) */
6891 else if (GET_CODE (x
) == AND
6892 && GET_MODE (x
) == DImode
6893 && GET_CODE (XEXP (x
, 0)) == ASHIFT
6894 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
6895 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
6896 && CONST_INT_P (XEXP (x
, 1)))
6898 type
= ADDRESS_REG_UXTW
;
6899 index
= XEXP (XEXP (x
, 0), 0);
6900 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
6901 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
6904 /* (mult:P (reg:P) (const_int scale)) */
6905 else if (GET_CODE (x
) == MULT
6906 && GET_MODE (x
) == Pmode
6907 && GET_MODE (XEXP (x
, 0)) == Pmode
6908 && CONST_INT_P (XEXP (x
, 1)))
6910 type
= ADDRESS_REG_REG
;
6911 index
= XEXP (x
, 0);
6912 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
6914 /* (ashift:P (reg:P) (const_int shift)) */
6915 else if (GET_CODE (x
) == ASHIFT
6916 && GET_MODE (x
) == Pmode
6917 && GET_MODE (XEXP (x
, 0)) == Pmode
6918 && CONST_INT_P (XEXP (x
, 1)))
6920 type
= ADDRESS_REG_REG
;
6921 index
= XEXP (x
, 0);
6922 shift
= INTVAL (XEXP (x
, 1));
6928 && GET_CODE (index
) == SUBREG
6929 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (index
))])
6930 index
= SUBREG_REG (index
);
6932 if (aarch64_sve_data_mode_p (mode
))
6934 if (type
!= ADDRESS_REG_REG
6935 || (1 << shift
) != GET_MODE_UNIT_SIZE (mode
))
6941 && !(IN_RANGE (shift
, 1, 3)
6942 && known_eq (1 << shift
, GET_MODE_SIZE (mode
))))
6947 && aarch64_regno_ok_for_index_p (REGNO (index
), strict_p
))
6950 info
->offset
= index
;
6951 info
->shift
= shift
;
6958 /* Return true if MODE is one of the modes for which we
6959 support LDP/STP operations. */
6962 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode
)
6964 return mode
== SImode
|| mode
== DImode
6965 || mode
== SFmode
|| mode
== DFmode
6966 || (aarch64_vector_mode_supported_p (mode
)
6967 && (known_eq (GET_MODE_SIZE (mode
), 8)
6968 || (known_eq (GET_MODE_SIZE (mode
), 16)
6969 && (aarch64_tune_params
.extra_tuning_flags
6970 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
) == 0)));
6973 /* Return true if REGNO is a virtual pointer register, or an eliminable
6974 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
6975 include stack_pointer or hard_frame_pointer. */
6977 virt_or_elim_regno_p (unsigned regno
)
6979 return ((regno
>= FIRST_VIRTUAL_REGISTER
6980 && regno
<= LAST_VIRTUAL_POINTER_REGISTER
)
6981 || regno
== FRAME_POINTER_REGNUM
6982 || regno
== ARG_POINTER_REGNUM
);
6985 /* Return true if X is a valid address of type TYPE for machine mode MODE.
6986 If it is, fill in INFO appropriately. STRICT_P is true if
6987 REG_OK_STRICT is in effect. */
6990 aarch64_classify_address (struct aarch64_address_info
*info
,
6991 rtx x
, machine_mode mode
, bool strict_p
,
6992 aarch64_addr_query_type type
)
6994 enum rtx_code code
= GET_CODE (x
);
6998 HOST_WIDE_INT const_size
;
7000 /* On BE, we use load/store pair for all large int mode load/stores.
7001 TI/TFmode may also use a load/store pair. */
7002 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
7003 bool advsimd_struct_p
= (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
));
7004 bool load_store_pair_p
= (type
== ADDR_QUERY_LDP_STP
7005 || type
== ADDR_QUERY_LDP_STP_N
7008 || (BYTES_BIG_ENDIAN
&& advsimd_struct_p
));
7010 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
7011 corresponds to the actual size of the memory being loaded/stored and the
7012 mode of the corresponding addressing mode is half of that. */
7013 if (type
== ADDR_QUERY_LDP_STP_N
7014 && known_eq (GET_MODE_SIZE (mode
), 16))
7017 bool allow_reg_index_p
= (!load_store_pair_p
7018 && (known_lt (GET_MODE_SIZE (mode
), 16)
7019 || vec_flags
== VEC_ADVSIMD
7020 || vec_flags
& VEC_SVE_DATA
));
7022 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
7023 [Rn, #offset, MUL VL]. */
7024 if ((vec_flags
& (VEC_SVE_DATA
| VEC_SVE_PRED
)) != 0
7025 && (code
!= REG
&& code
!= PLUS
))
7028 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
7030 if (advsimd_struct_p
7031 && !BYTES_BIG_ENDIAN
7032 && (code
!= POST_INC
&& code
!= REG
))
7035 gcc_checking_assert (GET_MODE (x
) == VOIDmode
7036 || SCALAR_INT_MODE_P (GET_MODE (x
)));
7042 info
->type
= ADDRESS_REG_IMM
;
7044 info
->offset
= const0_rtx
;
7045 info
->const_offset
= 0;
7046 return aarch64_base_register_rtx_p (x
, strict_p
);
7054 && virt_or_elim_regno_p (REGNO (op0
))
7055 && poly_int_rtx_p (op1
, &offset
))
7057 info
->type
= ADDRESS_REG_IMM
;
7060 info
->const_offset
= offset
;
7065 if (maybe_ne (GET_MODE_SIZE (mode
), 0)
7066 && aarch64_base_register_rtx_p (op0
, strict_p
)
7067 && poly_int_rtx_p (op1
, &offset
))
7069 info
->type
= ADDRESS_REG_IMM
;
7072 info
->const_offset
= offset
;
7074 /* TImode and TFmode values are allowed in both pairs of X
7075 registers and individual Q registers. The available
7077 X,X: 7-bit signed scaled offset
7078 Q: 9-bit signed offset
7079 We conservatively require an offset representable in either mode.
7080 When performing the check for pairs of X registers i.e. LDP/STP
7081 pass down DImode since that is the natural size of the LDP/STP
7082 instruction memory accesses. */
7083 if (mode
== TImode
|| mode
== TFmode
)
7084 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
7085 && (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
7086 || offset_12bit_unsigned_scaled_p (mode
, offset
)));
7088 /* A 7bit offset check because OImode will emit a ldp/stp
7089 instruction (only big endian will get here).
7090 For ldp/stp instructions, the offset is scaled for the size of a
7091 single element of the pair. */
7093 return aarch64_offset_7bit_signed_scaled_p (TImode
, offset
);
7095 /* Three 9/12 bit offsets checks because CImode will emit three
7096 ldr/str instructions (only big endian will get here). */
7098 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
7099 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode
,
7101 || offset_12bit_unsigned_scaled_p (V16QImode
,
7104 /* Two 7bit offsets checks because XImode will emit two ldp/stp
7105 instructions (only big endian will get here). */
7107 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
7108 && aarch64_offset_7bit_signed_scaled_p (TImode
,
7111 /* Make "m" use the LD1 offset range for SVE data modes, so
7112 that pre-RTL optimizers like ivopts will work to that
7113 instead of the wider LDR/STR range. */
7114 if (vec_flags
== VEC_SVE_DATA
)
7115 return (type
== ADDR_QUERY_M
7116 ? offset_4bit_signed_scaled_p (mode
, offset
)
7117 : offset_9bit_signed_scaled_p (mode
, offset
));
7119 if (vec_flags
== (VEC_SVE_DATA
| VEC_STRUCT
))
7121 poly_int64 end_offset
= (offset
7122 + GET_MODE_SIZE (mode
)
7123 - BYTES_PER_SVE_VECTOR
);
7124 return (type
== ADDR_QUERY_M
7125 ? offset_4bit_signed_scaled_p (mode
, offset
)
7126 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE
, offset
)
7127 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE
,
7131 if (vec_flags
== VEC_SVE_PRED
)
7132 return offset_9bit_signed_scaled_p (mode
, offset
);
7134 if (load_store_pair_p
)
7135 return ((known_eq (GET_MODE_SIZE (mode
), 4)
7136 || known_eq (GET_MODE_SIZE (mode
), 8)
7137 || known_eq (GET_MODE_SIZE (mode
), 16))
7138 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
7140 return (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
7141 || offset_12bit_unsigned_scaled_p (mode
, offset
));
7144 if (allow_reg_index_p
)
7146 /* Look for base + (scaled/extended) index register. */
7147 if (aarch64_base_register_rtx_p (op0
, strict_p
)
7148 && aarch64_classify_index (info
, op1
, mode
, strict_p
))
7153 if (aarch64_base_register_rtx_p (op1
, strict_p
)
7154 && aarch64_classify_index (info
, op0
, mode
, strict_p
))
7167 info
->type
= ADDRESS_REG_WB
;
7168 info
->base
= XEXP (x
, 0);
7169 info
->offset
= NULL_RTX
;
7170 return aarch64_base_register_rtx_p (info
->base
, strict_p
);
7174 info
->type
= ADDRESS_REG_WB
;
7175 info
->base
= XEXP (x
, 0);
7176 if (GET_CODE (XEXP (x
, 1)) == PLUS
7177 && poly_int_rtx_p (XEXP (XEXP (x
, 1), 1), &offset
)
7178 && rtx_equal_p (XEXP (XEXP (x
, 1), 0), info
->base
)
7179 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
7181 info
->offset
= XEXP (XEXP (x
, 1), 1);
7182 info
->const_offset
= offset
;
7184 /* TImode and TFmode values are allowed in both pairs of X
7185 registers and individual Q registers. The available
7187 X,X: 7-bit signed scaled offset
7188 Q: 9-bit signed offset
7189 We conservatively require an offset representable in either mode.
7191 if (mode
== TImode
|| mode
== TFmode
)
7192 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
7193 && aarch64_offset_9bit_signed_unscaled_p (mode
, offset
));
7195 if (load_store_pair_p
)
7196 return ((known_eq (GET_MODE_SIZE (mode
), 4)
7197 || known_eq (GET_MODE_SIZE (mode
), 8)
7198 || known_eq (GET_MODE_SIZE (mode
), 16))
7199 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
7201 return aarch64_offset_9bit_signed_unscaled_p (mode
, offset
);
7208 /* load literal: pc-relative constant pool entry. Only supported
7209 for SI mode or larger. */
7210 info
->type
= ADDRESS_SYMBOLIC
;
7212 if (!load_store_pair_p
7213 && GET_MODE_SIZE (mode
).is_constant (&const_size
)
7218 split_const (x
, &sym
, &addend
);
7219 return ((GET_CODE (sym
) == LABEL_REF
7220 || (GET_CODE (sym
) == SYMBOL_REF
7221 && CONSTANT_POOL_ADDRESS_P (sym
)
7222 && aarch64_pcrelative_literal_loads
)));
7227 info
->type
= ADDRESS_LO_SUM
;
7228 info
->base
= XEXP (x
, 0);
7229 info
->offset
= XEXP (x
, 1);
7230 if (allow_reg_index_p
7231 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
7234 split_const (info
->offset
, &sym
, &offs
);
7235 if (GET_CODE (sym
) == SYMBOL_REF
7236 && (aarch64_classify_symbol (sym
, INTVAL (offs
))
7237 == SYMBOL_SMALL_ABSOLUTE
))
7239 /* The symbol and offset must be aligned to the access size. */
7242 if (CONSTANT_POOL_ADDRESS_P (sym
))
7243 align
= GET_MODE_ALIGNMENT (get_pool_mode (sym
));
7244 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym
))
7246 tree exp
= SYMBOL_REF_DECL (sym
);
7247 align
= TYPE_ALIGN (TREE_TYPE (exp
));
7248 align
= aarch64_constant_alignment (exp
, align
);
7250 else if (SYMBOL_REF_DECL (sym
))
7251 align
= DECL_ALIGN (SYMBOL_REF_DECL (sym
));
7252 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym
)
7253 && SYMBOL_REF_BLOCK (sym
) != NULL
)
7254 align
= SYMBOL_REF_BLOCK (sym
)->alignment
;
7256 align
= BITS_PER_UNIT
;
7258 poly_int64 ref_size
= GET_MODE_SIZE (mode
);
7259 if (known_eq (ref_size
, 0))
7260 ref_size
= GET_MODE_SIZE (DImode
);
7262 return (multiple_p (INTVAL (offs
), ref_size
)
7263 && multiple_p (align
/ BITS_PER_UNIT
, ref_size
));
7273 /* Return true if the address X is valid for a PRFM instruction.
7274 STRICT_P is true if we should do strict checking with
7275 aarch64_classify_address. */
7278 aarch64_address_valid_for_prefetch_p (rtx x
, bool strict_p
)
7280 struct aarch64_address_info addr
;
7282 /* PRFM accepts the same addresses as DImode... */
7283 bool res
= aarch64_classify_address (&addr
, x
, DImode
, strict_p
);
7287 /* ... except writeback forms. */
7288 return addr
.type
!= ADDRESS_REG_WB
;
7292 aarch64_symbolic_address_p (rtx x
)
7296 split_const (x
, &x
, &offset
);
7297 return GET_CODE (x
) == SYMBOL_REF
|| GET_CODE (x
) == LABEL_REF
;
7300 /* Classify the base of symbolic expression X. */
7302 enum aarch64_symbol_type
7303 aarch64_classify_symbolic_expression (rtx x
)
7307 split_const (x
, &x
, &offset
);
7308 return aarch64_classify_symbol (x
, INTVAL (offset
));
7312 /* Return TRUE if X is a legitimate address for accessing memory in
7315 aarch64_legitimate_address_hook_p (machine_mode mode
, rtx x
, bool strict_p
)
7317 struct aarch64_address_info addr
;
7319 return aarch64_classify_address (&addr
, x
, mode
, strict_p
);
7322 /* Return TRUE if X is a legitimate address of type TYPE for accessing
7323 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
7325 aarch64_legitimate_address_p (machine_mode mode
, rtx x
, bool strict_p
,
7326 aarch64_addr_query_type type
)
7328 struct aarch64_address_info addr
;
7330 return aarch64_classify_address (&addr
, x
, mode
, strict_p
, type
);
7333 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
7336 aarch64_legitimize_address_displacement (rtx
*offset1
, rtx
*offset2
,
7337 poly_int64 orig_offset
,
7341 if (GET_MODE_SIZE (mode
).is_constant (&size
))
7343 HOST_WIDE_INT const_offset
, second_offset
;
7345 /* A general SVE offset is A * VQ + B. Remove the A component from
7346 coefficient 0 in order to get the constant B. */
7347 const_offset
= orig_offset
.coeffs
[0] - orig_offset
.coeffs
[1];
7349 /* Split an out-of-range address displacement into a base and
7350 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
7351 range otherwise to increase opportunities for sharing the base
7352 address of different sizes. Unaligned accesses use the signed
7353 9-bit range, TImode/TFmode use the intersection of signed
7354 scaled 7-bit and signed 9-bit offset. */
7355 if (mode
== TImode
|| mode
== TFmode
)
7356 second_offset
= ((const_offset
+ 0x100) & 0x1f8) - 0x100;
7357 else if ((const_offset
& (size
- 1)) != 0)
7358 second_offset
= ((const_offset
+ 0x100) & 0x1ff) - 0x100;
7360 second_offset
= const_offset
& (size
< 4 ? 0xfff : 0x3ffc);
7362 if (second_offset
== 0 || known_eq (orig_offset
, second_offset
))
7365 /* Split the offset into second_offset and the rest. */
7366 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
7367 *offset2
= gen_int_mode (second_offset
, Pmode
);
7372 /* Get the mode we should use as the basis of the range. For structure
7373 modes this is the mode of one vector. */
7374 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
7375 machine_mode step_mode
7376 = (vec_flags
& VEC_STRUCT
) != 0 ? SVE_BYTE_MODE
: mode
;
7378 /* Get the "mul vl" multiplier we'd like to use. */
7379 HOST_WIDE_INT factor
= GET_MODE_SIZE (step_mode
).coeffs
[1];
7380 HOST_WIDE_INT vnum
= orig_offset
.coeffs
[1] / factor
;
7381 if (vec_flags
& VEC_SVE_DATA
)
7382 /* LDR supports a 9-bit range, but the move patterns for
7383 structure modes require all vectors to be in range of the
7384 same base. The simplest way of accomodating that while still
7385 promoting reuse of anchor points between different modes is
7386 to use an 8-bit range unconditionally. */
7387 vnum
= ((vnum
+ 128) & 255) - 128;
7389 /* Predicates are only handled singly, so we might as well use
7391 vnum
= ((vnum
+ 256) & 511) - 256;
7395 /* Convert the "mul vl" multiplier into a byte offset. */
7396 poly_int64 second_offset
= GET_MODE_SIZE (step_mode
) * vnum
;
7397 if (known_eq (second_offset
, orig_offset
))
7400 /* Split the offset into second_offset and the rest. */
7401 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
7402 *offset2
= gen_int_mode (second_offset
, Pmode
);
7407 /* Return the binary representation of floating point constant VALUE in INTVAL.
7408 If the value cannot be converted, return false without setting INTVAL.
7409 The conversion is done in the given MODE. */
7411 aarch64_reinterpret_float_as_int (rtx value
, unsigned HOST_WIDE_INT
*intval
)
7414 /* We make a general exception for 0. */
7415 if (aarch64_float_const_zero_rtx_p (value
))
7421 scalar_float_mode mode
;
7422 if (GET_CODE (value
) != CONST_DOUBLE
7423 || !is_a
<scalar_float_mode
> (GET_MODE (value
), &mode
)
7424 || GET_MODE_BITSIZE (mode
) > HOST_BITS_PER_WIDE_INT
7425 /* Only support up to DF mode. */
7426 || GET_MODE_BITSIZE (mode
) > GET_MODE_BITSIZE (DFmode
))
7429 unsigned HOST_WIDE_INT ival
= 0;
7432 real_to_target (res
,
7433 CONST_DOUBLE_REAL_VALUE (value
),
7434 REAL_MODE_FORMAT (mode
));
7438 int order
= BYTES_BIG_ENDIAN
? 1 : 0;
7439 ival
= zext_hwi (res
[order
], 32);
7440 ival
|= (zext_hwi (res
[1 - order
], 32) << 32);
7443 ival
= zext_hwi (res
[0], 32);
7449 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7450 single MOV(+MOVK) followed by an FMOV. */
7452 aarch64_float_const_rtx_p (rtx x
)
7454 machine_mode mode
= GET_MODE (x
);
7455 if (mode
== VOIDmode
)
7458 /* Determine whether it's cheaper to write float constants as
7459 mov/movk pairs over ldr/adrp pairs. */
7460 unsigned HOST_WIDE_INT ival
;
7462 if (GET_CODE (x
) == CONST_DOUBLE
7463 && SCALAR_FLOAT_MODE_P (mode
)
7464 && aarch64_reinterpret_float_as_int (x
, &ival
))
7466 scalar_int_mode imode
= (mode
== HFmode
7468 : int_mode_for_mode (mode
).require ());
7469 int num_instr
= aarch64_internal_mov_immediate
7470 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
7471 return num_instr
< 3;
7477 /* Return TRUE if rtx X is immediate constant 0.0 */
7479 aarch64_float_const_zero_rtx_p (rtx x
)
7481 if (GET_MODE (x
) == VOIDmode
)
7484 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x
)))
7485 return !HONOR_SIGNED_ZEROS (GET_MODE (x
));
7486 return real_equal (CONST_DOUBLE_REAL_VALUE (x
), &dconst0
);
7489 /* Return TRUE if rtx X is immediate constant that fits in a single
7490 MOVI immediate operation. */
7492 aarch64_can_const_movi_rtx_p (rtx x
, machine_mode mode
)
7498 scalar_int_mode imode
;
7499 unsigned HOST_WIDE_INT ival
;
7501 if (GET_CODE (x
) == CONST_DOUBLE
7502 && SCALAR_FLOAT_MODE_P (mode
))
7504 if (!aarch64_reinterpret_float_as_int (x
, &ival
))
7507 /* We make a general exception for 0. */
7508 if (aarch64_float_const_zero_rtx_p (x
))
7511 imode
= int_mode_for_mode (mode
).require ();
7513 else if (GET_CODE (x
) == CONST_INT
7514 && is_a
<scalar_int_mode
> (mode
, &imode
))
7519 /* use a 64 bit mode for everything except for DI/DF mode, where we use
7520 a 128 bit vector mode. */
7521 int width
= GET_MODE_BITSIZE (imode
) == 64 ? 128 : 64;
7523 vmode
= aarch64_simd_container_mode (imode
, width
);
7524 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, ival
);
7526 return aarch64_simd_valid_immediate (v_op
, NULL
);
7530 /* Return the fixed registers used for condition codes. */
7533 aarch64_fixed_condition_code_regs (unsigned int *p1
, unsigned int *p2
)
7536 *p2
= INVALID_REGNUM
;
7540 /* This function is used by the call expanders of the machine description.
7541 RESULT is the register in which the result is returned. It's NULL for
7542 "call" and "sibcall".
7543 MEM is the location of the function call.
7544 SIBCALL indicates whether this function call is normal call or sibling call.
7545 It will generate different pattern accordingly. */
7548 aarch64_expand_call (rtx result
, rtx mem
, bool sibcall
)
7550 rtx call
, callee
, tmp
;
7554 gcc_assert (MEM_P (mem
));
7555 callee
= XEXP (mem
, 0);
7556 mode
= GET_MODE (callee
);
7557 gcc_assert (mode
== Pmode
);
7559 /* Decide if we should generate indirect calls by loading the
7560 address of the callee into a register before performing
7561 the branch-and-link. */
7562 if (SYMBOL_REF_P (callee
)
7563 ? (aarch64_is_long_call_p (callee
)
7564 || aarch64_is_noplt_call_p (callee
))
7566 XEXP (mem
, 0) = force_reg (mode
, callee
);
7568 call
= gen_rtx_CALL (VOIDmode
, mem
, const0_rtx
);
7570 if (result
!= NULL_RTX
)
7571 call
= gen_rtx_SET (result
, call
);
7576 tmp
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (Pmode
, LR_REGNUM
));
7578 vec
= gen_rtvec (2, call
, tmp
);
7579 call
= gen_rtx_PARALLEL (VOIDmode
, vec
);
7581 aarch64_emit_call_insn (call
);
7584 /* Emit call insn with PAT and do aarch64-specific handling. */
7587 aarch64_emit_call_insn (rtx pat
)
7589 rtx insn
= emit_call_insn (pat
);
7591 rtx
*fusage
= &CALL_INSN_FUNCTION_USAGE (insn
);
7592 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP0_REGNUM
));
7593 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP1_REGNUM
));
7597 aarch64_select_cc_mode (RTX_CODE code
, rtx x
, rtx y
)
7599 machine_mode mode_x
= GET_MODE (x
);
7600 rtx_code code_x
= GET_CODE (x
);
7602 /* All floating point compares return CCFP if it is an equality
7603 comparison, and CCFPE otherwise. */
7604 if (GET_MODE_CLASS (mode_x
) == MODE_FLOAT
)
7631 /* Equality comparisons of short modes against zero can be performed
7632 using the TST instruction with the appropriate bitmask. */
7633 if (y
== const0_rtx
&& (REG_P (x
) || SUBREG_P (x
))
7634 && (code
== EQ
|| code
== NE
)
7635 && (mode_x
== HImode
|| mode_x
== QImode
))
7638 /* Similarly, comparisons of zero_extends from shorter modes can
7639 be performed using an ANDS with an immediate mask. */
7640 if (y
== const0_rtx
&& code_x
== ZERO_EXTEND
7641 && (mode_x
== SImode
|| mode_x
== DImode
)
7642 && (GET_MODE (XEXP (x
, 0)) == HImode
|| GET_MODE (XEXP (x
, 0)) == QImode
)
7643 && (code
== EQ
|| code
== NE
))
7646 if ((mode_x
== SImode
|| mode_x
== DImode
)
7648 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
)
7649 && (code_x
== PLUS
|| code_x
== MINUS
|| code_x
== AND
7651 || (code_x
== ZERO_EXTRACT
&& CONST_INT_P (XEXP (x
, 1))
7652 && CONST_INT_P (XEXP (x
, 2)))))
7655 /* A compare with a shifted operand. Because of canonicalization,
7656 the comparison will have to be swapped when we emit the assembly
7658 if ((mode_x
== SImode
|| mode_x
== DImode
)
7659 && (REG_P (y
) || GET_CODE (y
) == SUBREG
|| y
== const0_rtx
)
7660 && (code_x
== ASHIFT
|| code_x
== ASHIFTRT
7661 || code_x
== LSHIFTRT
7662 || code_x
== ZERO_EXTEND
|| code_x
== SIGN_EXTEND
))
7665 /* Similarly for a negated operand, but we can only do this for
7667 if ((mode_x
== SImode
|| mode_x
== DImode
)
7668 && (REG_P (y
) || GET_CODE (y
) == SUBREG
)
7669 && (code
== EQ
|| code
== NE
)
7673 /* A test for unsigned overflow from an addition. */
7674 if ((mode_x
== DImode
|| mode_x
== TImode
)
7675 && (code
== LTU
|| code
== GEU
)
7677 && rtx_equal_p (XEXP (x
, 0), y
))
7680 /* A test for unsigned overflow from an add with carry. */
7681 if ((mode_x
== DImode
|| mode_x
== TImode
)
7682 && (code
== LTU
|| code
== GEU
)
7684 && CONST_SCALAR_INT_P (y
)
7685 && (rtx_mode_t (y
, mode_x
)
7686 == (wi::shwi (1, mode_x
)
7687 << (GET_MODE_BITSIZE (mode_x
).to_constant () / 2))))
7690 /* A test for signed overflow. */
7691 if ((mode_x
== DImode
|| mode_x
== TImode
)
7694 && GET_CODE (y
) == SIGN_EXTEND
)
7697 /* For everything else, return CCmode. */
7702 aarch64_get_condition_code_1 (machine_mode
, enum rtx_code
);
7705 aarch64_get_condition_code (rtx x
)
7707 machine_mode mode
= GET_MODE (XEXP (x
, 0));
7708 enum rtx_code comp_code
= GET_CODE (x
);
7710 if (GET_MODE_CLASS (mode
) != MODE_CC
)
7711 mode
= SELECT_CC_MODE (comp_code
, XEXP (x
, 0), XEXP (x
, 1));
7712 return aarch64_get_condition_code_1 (mode
, comp_code
);
7716 aarch64_get_condition_code_1 (machine_mode mode
, enum rtx_code comp_code
)
7724 case GE
: return AARCH64_GE
;
7725 case GT
: return AARCH64_GT
;
7726 case LE
: return AARCH64_LS
;
7727 case LT
: return AARCH64_MI
;
7728 case NE
: return AARCH64_NE
;
7729 case EQ
: return AARCH64_EQ
;
7730 case ORDERED
: return AARCH64_VC
;
7731 case UNORDERED
: return AARCH64_VS
;
7732 case UNLT
: return AARCH64_LT
;
7733 case UNLE
: return AARCH64_LE
;
7734 case UNGT
: return AARCH64_HI
;
7735 case UNGE
: return AARCH64_PL
;
7743 case NE
: return AARCH64_NE
;
7744 case EQ
: return AARCH64_EQ
;
7745 case GE
: return AARCH64_GE
;
7746 case GT
: return AARCH64_GT
;
7747 case LE
: return AARCH64_LE
;
7748 case LT
: return AARCH64_LT
;
7749 case GEU
: return AARCH64_CS
;
7750 case GTU
: return AARCH64_HI
;
7751 case LEU
: return AARCH64_LS
;
7752 case LTU
: return AARCH64_CC
;
7760 case NE
: return AARCH64_NE
;
7761 case EQ
: return AARCH64_EQ
;
7762 case GE
: return AARCH64_LE
;
7763 case GT
: return AARCH64_LT
;
7764 case LE
: return AARCH64_GE
;
7765 case LT
: return AARCH64_GT
;
7766 case GEU
: return AARCH64_LS
;
7767 case GTU
: return AARCH64_CC
;
7768 case LEU
: return AARCH64_CS
;
7769 case LTU
: return AARCH64_HI
;
7777 case NE
: return AARCH64_NE
; /* = any */
7778 case EQ
: return AARCH64_EQ
; /* = none */
7779 case GE
: return AARCH64_PL
; /* = nfrst */
7780 case LT
: return AARCH64_MI
; /* = first */
7781 case GEU
: return AARCH64_CS
; /* = nlast */
7782 case GTU
: return AARCH64_HI
; /* = pmore */
7783 case LEU
: return AARCH64_LS
; /* = plast */
7784 case LTU
: return AARCH64_CC
; /* = last */
7792 case NE
: return AARCH64_NE
;
7793 case EQ
: return AARCH64_EQ
;
7794 case GE
: return AARCH64_PL
;
7795 case LT
: return AARCH64_MI
;
7803 case NE
: return AARCH64_NE
;
7804 case EQ
: return AARCH64_EQ
;
7812 case LTU
: return AARCH64_CS
;
7813 case GEU
: return AARCH64_CC
;
7821 case GEU
: return AARCH64_CS
;
7822 case LTU
: return AARCH64_CC
;
7830 case NE
: return AARCH64_VS
;
7831 case EQ
: return AARCH64_VC
;
7844 aarch64_const_vec_all_same_in_range_p (rtx x
,
7845 HOST_WIDE_INT minval
,
7846 HOST_WIDE_INT maxval
)
7849 return (const_vec_duplicate_p (x
, &elt
)
7850 && CONST_INT_P (elt
)
7851 && IN_RANGE (INTVAL (elt
), minval
, maxval
));
7855 aarch64_const_vec_all_same_int_p (rtx x
, HOST_WIDE_INT val
)
7857 return aarch64_const_vec_all_same_in_range_p (x
, val
, val
);
7860 /* Return true if VEC is a constant in which every element is in the range
7861 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
7864 aarch64_const_vec_all_in_range_p (rtx vec
,
7865 HOST_WIDE_INT minval
,
7866 HOST_WIDE_INT maxval
)
7868 if (GET_CODE (vec
) != CONST_VECTOR
7869 || GET_MODE_CLASS (GET_MODE (vec
)) != MODE_VECTOR_INT
)
7873 if (!CONST_VECTOR_STEPPED_P (vec
))
7874 nunits
= const_vector_encoded_nelts (vec
);
7875 else if (!CONST_VECTOR_NUNITS (vec
).is_constant (&nunits
))
7878 for (int i
= 0; i
< nunits
; i
++)
7880 rtx vec_elem
= CONST_VECTOR_ELT (vec
, i
);
7881 if (!CONST_INT_P (vec_elem
)
7882 || !IN_RANGE (INTVAL (vec_elem
), minval
, maxval
))
/* Bit positions of the N/Z/C/V flags within an immediate NZCV field.  */
#define AARCH64_CC_V 1
#define AARCH64_CC_C (1 << 1)
#define AARCH64_CC_Z (1 << 2)
#define AARCH64_CC_N (1 << 3)

/* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  Each entry is
   the NZCV value that makes the *following* condition fail, as required
   by the conditional-compare instructions.  */
static const int aarch64_nzcv_codes[] =
{
  0,		/* EQ, Z == 1.  */
  AARCH64_CC_Z,	/* NE, Z == 0.  */
  0,		/* CS, C == 1.  */
  AARCH64_CC_C,	/* CC, C == 0.  */
  0,		/* MI, N == 1.  */
  AARCH64_CC_N, /* PL, N == 0.  */
  0,		/* VS, V == 1.  */
  AARCH64_CC_V, /* VC, V == 0.  */
  0,		/* HI, C ==1 && Z == 0.  */
  AARCH64_CC_C,	/* LS, !(C == 1 && Z == 0).  */
  AARCH64_CC_V,	/* GE, N == V.  */
  0,		/* LT, N != V.  */
  AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
  0		/* LE, !(Z == 0 && N == V).  */
};
7915 /* Print floating-point vector immediate operand X to F, negating it
7916 first if NEGATE is true. Return true on success, false if it isn't
7917 a constant we can handle. */
7920 aarch64_print_vector_float_operand (FILE *f
, rtx x
, bool negate
)
7924 if (!const_vec_duplicate_p (x
, &elt
))
7927 REAL_VALUE_TYPE r
= *CONST_DOUBLE_REAL_VALUE (elt
);
7929 r
= real_value_negate (&r
);
7931 /* We only handle the SVE single-bit immediates here. */
7932 if (real_equal (&r
, &dconst0
))
7933 asm_fprintf (f
, "0.0");
7934 else if (real_equal (&r
, &dconst1
))
7935 asm_fprintf (f
, "1.0");
7936 else if (real_equal (&r
, &dconsthalf
))
7937 asm_fprintf (f
, "0.5");
/* Return the register-suffix letter for an element of SIZE bits
   (64 -> 'd', 32 -> 's', 16 -> 'h', 8 -> 'b').  */
static char
sizetochar (int size)
{
  switch (size)
    {
    case 64: return 'd';
    case 32: return 's';
    case 16: return 'h';
    case 8 : return 'b';
    default: gcc_unreachable ();
    }
}
7958 /* Print operand X to file F in a target specific manner according to CODE.
7959 The acceptable formatting commands given by CODE are:
7960 'c': An integer or symbol address without a preceding #
7962 'C': Take the duplicated element in a vector constant
7963 and print it in hex.
7964 'D': Take the duplicated element in a vector constant
7965 and print it as an unsigned integer, in decimal.
7966 'e': Print the sign/zero-extend size as a character 8->b,
7968 'p': Prints N such that 2^N == X (X must be power of 2 and
7970 'P': Print the number of non-zero bits in X (a const_int).
7971 'H': Print the higher numbered register of a pair (TImode)
7973 'm': Print a condition (eq, ne, etc).
7974 'M': Same as 'm', but invert condition.
7975 'N': Take the duplicated element in a vector constant
7976 and print the negative of it in decimal.
7977 'b/h/s/d/q': Print a scalar FP/SIMD register name.
7978 'S/T/U/V': Print a FP/SIMD register name for a register list.
7979 The register printed is the FP/SIMD register name
7980 of X + 0/1/2/3 for S/T/U/V.
7981 'R': Print a scalar FP/SIMD register name + 1.
7982 'X': Print bottom 16 bits of integer constant in hex.
7983 'w/x': Print a general register name or the zero register
7985 '0': Print a normal operand, if it's a general register,
7986 then we assume DImode.
7987 'k': Print NZCV for conditional compare instructions.
7988 'A': Output address constant representing the first
7989 argument of X, specifying a relocation offset
7991 'L': Output constant address specified by X
7992 with a relocation offset if appropriate.
7993 'G': Prints address of X, specifying a PC relative
7994 relocation mode if appropriate.
7995 'y': Output address of LDP or STP - this is used for
7996 some LDP/STPs which don't use a PARALLEL in their
7997 pattern (so the mode needs to be adjusted).
7998 'z': Output address of a typical LDP or STP. */
8001 aarch64_print_operand (FILE *f
, rtx x
, int code
)
8007 switch (GET_CODE (x
))
8010 fprintf (f
, HOST_WIDE_INT_PRINT_DEC
, INTVAL (x
));
8014 output_addr_const (f
, x
);
8018 if (GET_CODE (XEXP (x
, 0)) == PLUS
8019 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
)
8021 output_addr_const (f
, x
);
8027 output_operand_lossage ("unsupported operand for code '%c'", code
);
8035 if (!CONST_INT_P (x
)
8036 || (n
= exact_log2 (INTVAL (x
) & ~7)) <= 0)
8038 output_operand_lossage ("invalid operand for '%%%c'", code
);
8054 output_operand_lossage ("invalid operand for '%%%c'", code
);
8064 if (!CONST_INT_P (x
) || (n
= exact_log2 (INTVAL (x
))) < 0)
8066 output_operand_lossage ("invalid operand for '%%%c'", code
);
8070 asm_fprintf (f
, "%d", n
);
8075 if (!CONST_INT_P (x
))
8077 output_operand_lossage ("invalid operand for '%%%c'", code
);
8081 asm_fprintf (f
, "%u", popcount_hwi (INTVAL (x
)));
8085 if (x
== const0_rtx
)
8087 asm_fprintf (f
, "xzr");
8091 if (!REG_P (x
) || !GP_REGNUM_P (REGNO (x
) + 1))
8093 output_operand_lossage ("invalid operand for '%%%c'", code
);
8097 asm_fprintf (f
, "%s", reg_names
[REGNO (x
) + 1]);
8104 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
8105 if (x
== const_true_rtx
)
8112 if (!COMPARISON_P (x
))
8114 output_operand_lossage ("invalid operand for '%%%c'", code
);
8118 cond_code
= aarch64_get_condition_code (x
);
8119 gcc_assert (cond_code
>= 0);
8121 cond_code
= AARCH64_INVERSE_CONDITION_CODE (cond_code
);
8122 if (GET_MODE (XEXP (x
, 0)) == CC_NZCmode
)
8123 fputs (aarch64_sve_condition_codes
[cond_code
], f
);
8125 fputs (aarch64_condition_codes
[cond_code
], f
);
8130 if (!const_vec_duplicate_p (x
, &elt
))
8132 output_operand_lossage ("invalid vector constant");
8136 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
8137 asm_fprintf (f
, "%wd", -INTVAL (elt
));
8138 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
8139 && aarch64_print_vector_float_operand (f
, x
, true))
8143 output_operand_lossage ("invalid vector constant");
8153 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
8155 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
8158 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - V0_REGNUM
);
8165 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
8167 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
8170 asm_fprintf (f
, "%c%d",
8171 aarch64_sve_data_mode_p (GET_MODE (x
)) ? 'z' : 'v',
8172 REGNO (x
) - V0_REGNUM
+ (code
- 'S'));
8176 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
8178 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
8181 asm_fprintf (f
, "q%d", REGNO (x
) - V0_REGNUM
+ 1);
8185 if (!CONST_INT_P (x
))
8187 output_operand_lossage ("invalid operand for '%%%c'", code
);
8190 asm_fprintf (f
, "0x%wx", UINTVAL (x
) & 0xffff);
8195 /* Print a replicated constant in hex. */
8196 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
8198 output_operand_lossage ("invalid operand for '%%%c'", code
);
8201 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
8202 asm_fprintf (f
, "0x%wx", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
8208 /* Print a replicated constant in decimal, treating it as
8210 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
8212 output_operand_lossage ("invalid operand for '%%%c'", code
);
8215 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
8216 asm_fprintf (f
, "%wd", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
8223 || (CONST_DOUBLE_P (x
) && aarch64_float_const_zero_rtx_p (x
)))
8225 asm_fprintf (f
, "%czr", code
);
8229 if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
8231 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - R0_REGNUM
);
8235 if (REG_P (x
) && REGNO (x
) == SP_REGNUM
)
8237 asm_fprintf (f
, "%ssp", code
== 'w' ? "w" : "");
8246 output_operand_lossage ("missing operand");
8250 switch (GET_CODE (x
))
8253 if (aarch64_sve_data_mode_p (GET_MODE (x
)))
8255 if (REG_NREGS (x
) == 1)
8256 asm_fprintf (f
, "z%d", REGNO (x
) - V0_REGNUM
);
8260 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x
)));
8261 asm_fprintf (f
, "{z%d.%c - z%d.%c}",
8262 REGNO (x
) - V0_REGNUM
, suffix
,
8263 END_REGNO (x
) - V0_REGNUM
- 1, suffix
);
8267 asm_fprintf (f
, "%s", reg_names
[REGNO (x
)]);
8271 output_address (GET_MODE (x
), XEXP (x
, 0));
8276 output_addr_const (asm_out_file
, x
);
8280 asm_fprintf (f
, "%wd", INTVAL (x
));
8284 if (!VECTOR_MODE_P (GET_MODE (x
)))
8286 output_addr_const (asm_out_file
, x
);
8292 if (!const_vec_duplicate_p (x
, &elt
))
8294 output_operand_lossage ("invalid vector constant");
8298 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
8299 asm_fprintf (f
, "%wd", INTVAL (elt
));
8300 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
8301 && aarch64_print_vector_float_operand (f
, x
, false))
8305 output_operand_lossage ("invalid vector constant");
8311 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
8312 be getting CONST_DOUBLEs holding integers. */
8313 gcc_assert (GET_MODE (x
) != VOIDmode
);
8314 if (aarch64_float_const_zero_rtx_p (x
))
8319 else if (aarch64_float_const_representable_p (x
))
8322 char float_buf
[buf_size
] = {'\0'};
8323 real_to_decimal_for_mode (float_buf
,
8324 CONST_DOUBLE_REAL_VALUE (x
),
8327 asm_fprintf (asm_out_file
, "%s", float_buf
);
8331 output_operand_lossage ("invalid constant");
8334 output_operand_lossage ("invalid operand");
8340 if (GET_CODE (x
) == HIGH
)
8343 switch (aarch64_classify_symbolic_expression (x
))
8345 case SYMBOL_SMALL_GOT_4G
:
8346 asm_fprintf (asm_out_file
, ":got:");
8349 case SYMBOL_SMALL_TLSGD
:
8350 asm_fprintf (asm_out_file
, ":tlsgd:");
8353 case SYMBOL_SMALL_TLSDESC
:
8354 asm_fprintf (asm_out_file
, ":tlsdesc:");
8357 case SYMBOL_SMALL_TLSIE
:
8358 asm_fprintf (asm_out_file
, ":gottprel:");
8361 case SYMBOL_TLSLE24
:
8362 asm_fprintf (asm_out_file
, ":tprel:");
8365 case SYMBOL_TINY_GOT
:
8372 output_addr_const (asm_out_file
, x
);
8376 switch (aarch64_classify_symbolic_expression (x
))
8378 case SYMBOL_SMALL_GOT_4G
:
8379 asm_fprintf (asm_out_file
, ":lo12:");
8382 case SYMBOL_SMALL_TLSGD
:
8383 asm_fprintf (asm_out_file
, ":tlsgd_lo12:");
8386 case SYMBOL_SMALL_TLSDESC
:
8387 asm_fprintf (asm_out_file
, ":tlsdesc_lo12:");
8390 case SYMBOL_SMALL_TLSIE
:
8391 asm_fprintf (asm_out_file
, ":gottprel_lo12:");
8394 case SYMBOL_TLSLE12
:
8395 asm_fprintf (asm_out_file
, ":tprel_lo12:");
8398 case SYMBOL_TLSLE24
:
8399 asm_fprintf (asm_out_file
, ":tprel_lo12_nc:");
8402 case SYMBOL_TINY_GOT
:
8403 asm_fprintf (asm_out_file
, ":got:");
8406 case SYMBOL_TINY_TLSIE
:
8407 asm_fprintf (asm_out_file
, ":gottprel:");
8413 output_addr_const (asm_out_file
, x
);
8417 switch (aarch64_classify_symbolic_expression (x
))
8419 case SYMBOL_TLSLE24
:
8420 asm_fprintf (asm_out_file
, ":tprel_hi12:");
8425 output_addr_const (asm_out_file
, x
);
8430 HOST_WIDE_INT cond_code
;
8432 if (!CONST_INT_P (x
))
8434 output_operand_lossage ("invalid operand for '%%%c'", code
);
8438 cond_code
= INTVAL (x
);
8439 gcc_assert (cond_code
>= 0 && cond_code
<= AARCH64_NV
);
8440 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
]);
8447 machine_mode mode
= GET_MODE (x
);
8449 if (GET_CODE (x
) != MEM
8450 || (code
== 'y' && maybe_ne (GET_MODE_SIZE (mode
), 16)))
8452 output_operand_lossage ("invalid operand for '%%%c'", code
);
8456 if (!aarch64_print_address_internal (f
, mode
, XEXP (x
, 0),
8458 ? ADDR_QUERY_LDP_STP_N
8459 : ADDR_QUERY_LDP_STP
))
8460 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
8465 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
8470 /* Print address 'x' of a memory access with mode 'mode'.
8471 'op' is the context required by aarch64_classify_address. It can either be
8472 MEM for a normal memory access or PARALLEL for LDP/STP. */
8474 aarch64_print_address_internal (FILE *f
, machine_mode mode
, rtx x
,
8475 aarch64_addr_query_type type
)
8477 struct aarch64_address_info addr
;
8480 /* Check all addresses are Pmode - including ILP32. */
8481 if (GET_MODE (x
) != Pmode
8482 && (!CONST_INT_P (x
)
8483 || trunc_int_for_mode (INTVAL (x
), Pmode
) != INTVAL (x
)))
8485 output_operand_lossage ("invalid address mode");
8489 if (aarch64_classify_address (&addr
, x
, mode
, true, type
))
8492 case ADDRESS_REG_IMM
:
8493 if (known_eq (addr
.const_offset
, 0))
8494 asm_fprintf (f
, "[%s]", reg_names
[REGNO (addr
.base
)]);
8495 else if (aarch64_sve_data_mode_p (mode
))
8498 = exact_div (addr
.const_offset
,
8499 BYTES_PER_SVE_VECTOR
).to_constant ();
8500 asm_fprintf (f
, "[%s, #%wd, mul vl]",
8501 reg_names
[REGNO (addr
.base
)], vnum
);
8503 else if (aarch64_sve_pred_mode_p (mode
))
8506 = exact_div (addr
.const_offset
,
8507 BYTES_PER_SVE_PRED
).to_constant ();
8508 asm_fprintf (f
, "[%s, #%wd, mul vl]",
8509 reg_names
[REGNO (addr
.base
)], vnum
);
8512 asm_fprintf (f
, "[%s, %wd]", reg_names
[REGNO (addr
.base
)],
8513 INTVAL (addr
.offset
));
8516 case ADDRESS_REG_REG
:
8517 if (addr
.shift
== 0)
8518 asm_fprintf (f
, "[%s, %s]", reg_names
[REGNO (addr
.base
)],
8519 reg_names
[REGNO (addr
.offset
)]);
8521 asm_fprintf (f
, "[%s, %s, lsl %u]", reg_names
[REGNO (addr
.base
)],
8522 reg_names
[REGNO (addr
.offset
)], addr
.shift
);
8525 case ADDRESS_REG_UXTW
:
8526 if (addr
.shift
== 0)
8527 asm_fprintf (f
, "[%s, w%d, uxtw]", reg_names
[REGNO (addr
.base
)],
8528 REGNO (addr
.offset
) - R0_REGNUM
);
8530 asm_fprintf (f
, "[%s, w%d, uxtw %u]", reg_names
[REGNO (addr
.base
)],
8531 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
8534 case ADDRESS_REG_SXTW
:
8535 if (addr
.shift
== 0)
8536 asm_fprintf (f
, "[%s, w%d, sxtw]", reg_names
[REGNO (addr
.base
)],
8537 REGNO (addr
.offset
) - R0_REGNUM
);
8539 asm_fprintf (f
, "[%s, w%d, sxtw %u]", reg_names
[REGNO (addr
.base
)],
8540 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
8543 case ADDRESS_REG_WB
:
8544 /* Writeback is only supported for fixed-width modes. */
8545 size
= GET_MODE_SIZE (mode
).to_constant ();
8546 switch (GET_CODE (x
))
8549 asm_fprintf (f
, "[%s, %d]!", reg_names
[REGNO (addr
.base
)], size
);
8552 asm_fprintf (f
, "[%s], %d", reg_names
[REGNO (addr
.base
)], size
);
8555 asm_fprintf (f
, "[%s, -%d]!", reg_names
[REGNO (addr
.base
)], size
);
8558 asm_fprintf (f
, "[%s], -%d", reg_names
[REGNO (addr
.base
)], size
);
8561 asm_fprintf (f
, "[%s, %wd]!", reg_names
[REGNO (addr
.base
)],
8562 INTVAL (addr
.offset
));
8565 asm_fprintf (f
, "[%s], %wd", reg_names
[REGNO (addr
.base
)],
8566 INTVAL (addr
.offset
));
8573 case ADDRESS_LO_SUM
:
8574 asm_fprintf (f
, "[%s, #:lo12:", reg_names
[REGNO (addr
.base
)]);
8575 output_addr_const (f
, addr
.offset
);
8576 asm_fprintf (f
, "]");
8579 case ADDRESS_SYMBOLIC
:
8580 output_addr_const (f
, x
);
8587 /* Print address 'x' of a memory access with mode 'mode'. */
8589 aarch64_print_operand_address (FILE *f
, machine_mode mode
, rtx x
)
8591 if (!aarch64_print_address_internal (f
, mode
, x
, ADDR_QUERY_ANY
))
8592 output_addr_const (f
, x
);
8596 aarch64_label_mentioned_p (rtx x
)
8601 if (GET_CODE (x
) == LABEL_REF
)
8604 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8605 referencing instruction, but they are constant offsets, not
8607 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
8610 fmt
= GET_RTX_FORMAT (GET_CODE (x
));
8611 for (i
= GET_RTX_LENGTH (GET_CODE (x
)) - 1; i
>= 0; i
--)
8617 for (j
= XVECLEN (x
, i
) - 1; j
>= 0; j
--)
8618 if (aarch64_label_mentioned_p (XVECEXP (x
, i
, j
)))
8621 else if (fmt
[i
] == 'e' && aarch64_label_mentioned_p (XEXP (x
, i
)))
8628 /* Implement REGNO_REG_CLASS. */
8631 aarch64_regno_regclass (unsigned regno
)
8633 if (GP_REGNUM_P (regno
))
8634 return GENERAL_REGS
;
8636 if (regno
== SP_REGNUM
)
8639 if (regno
== FRAME_POINTER_REGNUM
8640 || regno
== ARG_POINTER_REGNUM
)
8641 return POINTER_REGS
;
8643 if (FP_REGNUM_P (regno
))
8644 return (FP_LO8_REGNUM_P (regno
) ? FP_LO8_REGS
8645 : FP_LO_REGNUM_P (regno
) ? FP_LO_REGS
: FP_REGS
);
8647 if (PR_REGNUM_P (regno
))
8648 return PR_LO_REGNUM_P (regno
) ? PR_LO_REGS
: PR_HI_REGS
;
8653 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
8654 If OFFSET is out of range, return an offset of an anchor point
8655 that is in range. Return 0 otherwise. */
8657 static HOST_WIDE_INT
8658 aarch64_anchor_offset (HOST_WIDE_INT offset
, HOST_WIDE_INT size
,
8661 /* Does it look like we'll need a 16-byte load/store-pair operation? */
8663 return (offset
+ 0x400) & ~0x7f0;
8665 /* For offsets that aren't a multiple of the access size, the limit is
8667 if (offset
& (size
- 1))
8669 /* BLKmode typically uses LDP of X-registers. */
8670 if (mode
== BLKmode
)
8671 return (offset
+ 512) & ~0x3ff;
8672 return (offset
+ 0x100) & ~0x1ff;
8675 /* Small negative offsets are supported. */
8676 if (IN_RANGE (offset
, -256, 0))
8679 if (mode
== TImode
|| mode
== TFmode
)
8680 return (offset
+ 0x100) & ~0x1ff;
8682 /* Use 12-bit offset by access size. */
8683 return offset
& (~0xfff * size
);
8687 aarch64_legitimize_address (rtx x
, rtx
/* orig_x */, machine_mode mode
)
8689 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
8690 where mask is selected by alignment and size of the offset.
8691 We try to pick as large a range for the offset as possible to
8692 maximize the chance of a CSE. However, for aligned addresses
8693 we limit the range to 4k so that structures with different sized
8694 elements are likely to use the same base. We need to be careful
8695 not to split a CONST for some forms of address expression, otherwise
8696 it will generate sub-optimal code. */
8698 if (GET_CODE (x
) == PLUS
&& CONST_INT_P (XEXP (x
, 1)))
8700 rtx base
= XEXP (x
, 0);
8701 rtx offset_rtx
= XEXP (x
, 1);
8702 HOST_WIDE_INT offset
= INTVAL (offset_rtx
);
8704 if (GET_CODE (base
) == PLUS
)
8706 rtx op0
= XEXP (base
, 0);
8707 rtx op1
= XEXP (base
, 1);
8709 /* Force any scaling into a temp for CSE. */
8710 op0
= force_reg (Pmode
, op0
);
8711 op1
= force_reg (Pmode
, op1
);
8713 /* Let the pointer register be in op0. */
8714 if (REG_POINTER (op1
))
8715 std::swap (op0
, op1
);
8717 /* If the pointer is virtual or frame related, then we know that
8718 virtual register instantiation or register elimination is going
8719 to apply a second constant. We want the two constants folded
8720 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
8721 if (virt_or_elim_regno_p (REGNO (op0
)))
8723 base
= expand_binop (Pmode
, add_optab
, op0
, offset_rtx
,
8724 NULL_RTX
, true, OPTAB_DIRECT
);
8725 return gen_rtx_PLUS (Pmode
, base
, op1
);
8728 /* Otherwise, in order to encourage CSE (and thence loop strength
8729 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
8730 base
= expand_binop (Pmode
, add_optab
, op0
, op1
,
8731 NULL_RTX
, true, OPTAB_DIRECT
);
8732 x
= gen_rtx_PLUS (Pmode
, base
, offset_rtx
);
8736 if (GET_MODE_SIZE (mode
).is_constant (&size
))
8738 HOST_WIDE_INT base_offset
= aarch64_anchor_offset (offset
, size
,
8740 if (base_offset
!= 0)
8742 base
= plus_constant (Pmode
, base
, base_offset
);
8743 base
= force_operand (base
, NULL_RTX
);
8744 return plus_constant (Pmode
, base
, offset
- base_offset
);
8753 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED
, rtx x
,
8756 secondary_reload_info
*sri
)
8758 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
8759 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
8760 comment at the head of aarch64-sve.md for more details about the
8761 big-endian handling. */
8762 if (BYTES_BIG_ENDIAN
8763 && reg_class_subset_p (rclass
, FP_REGS
)
8764 && !((REG_P (x
) && HARD_REGISTER_P (x
))
8765 || aarch64_simd_valid_immediate (x
, NULL
))
8766 && aarch64_sve_data_mode_p (mode
))
8768 sri
->icode
= CODE_FOR_aarch64_sve_reload_be
;
8772 /* If we have to disable direct literal pool loads and stores because the
8773 function is too big, then we need a scratch register. */
8774 if (MEM_P (x
) && GET_CODE (x
) == SYMBOL_REF
&& CONSTANT_POOL_ADDRESS_P (x
)
8775 && (SCALAR_FLOAT_MODE_P (GET_MODE (x
))
8776 || targetm
.vector_mode_supported_p (GET_MODE (x
)))
8777 && !aarch64_pcrelative_literal_loads
)
8779 sri
->icode
= code_for_aarch64_reload_movcp (mode
, DImode
);
8783 /* Without the TARGET_SIMD instructions we cannot move a Q register
8784 to a Q register directly. We need a scratch. */
8785 if (REG_P (x
) && (mode
== TFmode
|| mode
== TImode
) && mode
== GET_MODE (x
)
8786 && FP_REGNUM_P (REGNO (x
)) && !TARGET_SIMD
8787 && reg_class_subset_p (rclass
, FP_REGS
))
8789 sri
->icode
= code_for_aarch64_reload_mov (mode
);
8793 /* A TFmode or TImode memory access should be handled via an FP_REGS
8794 because AArch64 has richer addressing modes for LDR/STR instructions
8795 than LDP/STP instructions. */
8796 if (TARGET_FLOAT
&& rclass
== GENERAL_REGS
8797 && known_eq (GET_MODE_SIZE (mode
), 16) && MEM_P (x
))
8800 if (rclass
== FP_REGS
&& (mode
== TImode
|| mode
== TFmode
) && CONSTANT_P(x
))
8801 return GENERAL_REGS
;
8807 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED
, const int to
)
8809 gcc_assert (from
== ARG_POINTER_REGNUM
|| from
== FRAME_POINTER_REGNUM
);
8811 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
8812 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
8813 if (frame_pointer_needed
)
8814 return to
== HARD_FRAME_POINTER_REGNUM
;
8819 aarch64_initial_elimination_offset (unsigned from
, unsigned to
)
8821 if (to
== HARD_FRAME_POINTER_REGNUM
)
8823 if (from
== ARG_POINTER_REGNUM
)
8824 return cfun
->machine
->frame
.hard_fp_offset
;
8826 if (from
== FRAME_POINTER_REGNUM
)
8827 return cfun
->machine
->frame
.hard_fp_offset
8828 - cfun
->machine
->frame
.locals_offset
;
8831 if (to
== STACK_POINTER_REGNUM
)
8833 if (from
== FRAME_POINTER_REGNUM
)
8834 return cfun
->machine
->frame
.frame_size
8835 - cfun
->machine
->frame
.locals_offset
;
8838 return cfun
->machine
->frame
.frame_size
;
8841 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
8845 aarch64_return_addr (int count
, rtx frame ATTRIBUTE_UNUSED
)
8849 return get_hard_reg_initial_val (Pmode
, LR_REGNUM
);
8854 aarch64_asm_trampoline_template (FILE *f
)
8859 if (aarch64_bti_enabled ())
8861 asm_fprintf (f
, "\thint\t34 // bti c\n");
8868 asm_fprintf (f
, "\tldr\tw%d, .+%d\n", IP1_REGNUM
- R0_REGNUM
, offset1
);
8869 asm_fprintf (f
, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM
- R0_REGNUM
,
8874 asm_fprintf (f
, "\tldr\t%s, .+%d\n", reg_names
[IP1_REGNUM
], offset1
);
8875 asm_fprintf (f
, "\tldr\t%s, .+%d\n", reg_names
[STATIC_CHAIN_REGNUM
],
8878 asm_fprintf (f
, "\tbr\t%s\n", reg_names
[IP1_REGNUM
]);
8880 /* The trampoline needs an extra padding instruction. In case if BTI is
8881 enabled the padding instruction is replaced by the BTI instruction at
8883 if (!aarch64_bti_enabled ())
8884 assemble_aligned_integer (4, const0_rtx
);
8886 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
8887 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
8891 aarch64_trampoline_init (rtx m_tramp
, tree fndecl
, rtx chain_value
)
8893 rtx fnaddr
, mem
, a_tramp
;
8894 const int tramp_code_sz
= 16;
8896 /* Don't need to copy the trailing D-words, we fill those in below. */
8897 emit_block_move (m_tramp
, assemble_trampoline_template (),
8898 GEN_INT (tramp_code_sz
), BLOCK_OP_NORMAL
);
8899 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
);
8900 fnaddr
= XEXP (DECL_RTL (fndecl
), 0);
8901 if (GET_MODE (fnaddr
) != ptr_mode
)
8902 fnaddr
= convert_memory_address (ptr_mode
, fnaddr
);
8903 emit_move_insn (mem
, fnaddr
);
8905 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
+ POINTER_BYTES
);
8906 emit_move_insn (mem
, chain_value
);
8908 /* XXX We should really define a "clear_cache" pattern and use
8909 gen_clear_cache(). */
8910 a_tramp
= XEXP (m_tramp
, 0);
8911 emit_library_call (gen_rtx_SYMBOL_REF (Pmode
, "__clear_cache"),
8912 LCT_NORMAL
, VOIDmode
, a_tramp
, ptr_mode
,
8913 plus_constant (ptr_mode
, a_tramp
, TRAMPOLINE_SIZE
),
8917 static unsigned char
8918 aarch64_class_max_nregs (reg_class_t regclass
, machine_mode mode
)
8920 /* ??? Logically we should only need to provide a value when
8921 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
8922 can hold MODE, but at the moment we need to handle all modes.
8923 Just ignore any runtime parts for registers that can't store them. */
8924 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
8928 case TAILCALL_ADDR_REGS
:
8932 case POINTER_AND_FP_REGS
:
8936 if (aarch64_sve_data_mode_p (mode
)
8937 && constant_multiple_p (GET_MODE_SIZE (mode
),
8938 BYTES_PER_SVE_VECTOR
, &nregs
))
8940 return (aarch64_vector_data_mode_p (mode
)
8941 ? CEIL (lowest_size
, UNITS_PER_VREG
)
8942 : CEIL (lowest_size
, UNITS_PER_WORD
));
8959 aarch64_preferred_reload_class (rtx x
, reg_class_t regclass
)
8961 if (regclass
== POINTER_REGS
)
8962 return GENERAL_REGS
;
8964 if (regclass
== STACK_REG
)
8967 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x
)), POINTER_REGS
))
8973 /* Register eliminiation can result in a request for
8974 SP+constant->FP_REGS. We cannot support such operations which
8975 use SP as source and an FP_REG as destination, so reject out
8977 if (! reg_class_subset_p (regclass
, GENERAL_REGS
) && GET_CODE (x
) == PLUS
)
8979 rtx lhs
= XEXP (x
, 0);
8981 /* Look through a possible SUBREG introduced by ILP32. */
8982 if (GET_CODE (lhs
) == SUBREG
)
8983 lhs
= SUBREG_REG (lhs
);
8985 gcc_assert (REG_P (lhs
));
8986 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs
)),
/* Implement ASM_OUTPUT_LABELREF: print NAME to F with the
   user-label prefix applied.  */
void
aarch64_asm_output_labelref (FILE* f, const char *name)
{
  asm_fprintf (f, "%U%s", name);
}
9001 aarch64_elf_asm_constructor (rtx symbol
, int priority
)
9003 if (priority
== DEFAULT_INIT_PRIORITY
)
9004 default_ctor_section_asm_out_constructor (symbol
, priority
);
9008 /* While priority is known to be in range [0, 65535], so 18 bytes
9009 would be enough, the compiler might not know that. To avoid
9010 -Wformat-truncation false positive, use a larger size. */
9012 snprintf (buf
, sizeof (buf
), ".init_array.%.5u", priority
);
9013 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
9014 switch_to_section (s
);
9015 assemble_align (POINTER_SIZE
);
9016 assemble_aligned_integer (POINTER_BYTES
, symbol
);
9021 aarch64_elf_asm_destructor (rtx symbol
, int priority
)
9023 if (priority
== DEFAULT_INIT_PRIORITY
)
9024 default_dtor_section_asm_out_destructor (symbol
, priority
);
9028 /* While priority is known to be in range [0, 65535], so 18 bytes
9029 would be enough, the compiler might not know that. To avoid
9030 -Wformat-truncation false positive, use a larger size. */
9032 snprintf (buf
, sizeof (buf
), ".fini_array.%.5u", priority
);
9033 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
9034 switch_to_section (s
);
9035 assemble_align (POINTER_SIZE
);
9036 assemble_aligned_integer (POINTER_BYTES
, symbol
);
9041 aarch64_output_casesi (rtx
*operands
)
9045 rtx diff_vec
= PATTERN (NEXT_INSN (as_a
<rtx_insn
*> (operands
[2])));
9047 static const char *const patterns
[4][2] =
9050 "ldrb\t%w3, [%0,%w1,uxtw]",
9051 "add\t%3, %4, %w3, sxtb #2"
9054 "ldrh\t%w3, [%0,%w1,uxtw #1]",
9055 "add\t%3, %4, %w3, sxth #2"
9058 "ldr\t%w3, [%0,%w1,uxtw #2]",
9059 "add\t%3, %4, %w3, sxtw #2"
9061 /* We assume that DImode is only generated when not optimizing and
9062 that we don't really need 64-bit address offsets. That would
9063 imply an object file with 8GB of code in a single function! */
9065 "ldr\t%w3, [%0,%w1,uxtw #2]",
9066 "add\t%3, %4, %w3, sxtw #2"
9070 gcc_assert (GET_CODE (diff_vec
) == ADDR_DIFF_VEC
);
9072 scalar_int_mode mode
= as_a
<scalar_int_mode
> (GET_MODE (diff_vec
));
9073 index
= exact_log2 (GET_MODE_SIZE (mode
));
9075 gcc_assert (index
>= 0 && index
<= 3);
9077 /* Need to implement table size reduction, by chaning the code below. */
9078 output_asm_insn (patterns
[index
][0], operands
);
9079 ASM_GENERATE_INTERNAL_LABEL (label
, "Lrtx", CODE_LABEL_NUMBER (operands
[2]));
9080 snprintf (buf
, sizeof (buf
),
9081 "adr\t%%4, %s", targetm
.strip_name_encoding (label
));
9082 output_asm_insn (buf
, operands
);
9083 output_asm_insn (patterns
[index
][1], operands
);
9084 output_asm_insn ("br\t%3", operands
);
9085 assemble_label (asm_out_file
, label
);
9090 /* Return size in bits of an arithmetic operand which is shifted/scaled and
9091 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
9095 aarch64_uxt_size (int shift
, HOST_WIDE_INT mask
)
9097 if (shift
>= 0 && shift
<= 3)
9100 for (size
= 8; size
<= 32; size
*= 2)
9102 HOST_WIDE_INT bits
= ((HOST_WIDE_INT
)1U << size
) - 1;
9103 if (mask
== bits
<< shift
)
9110 /* Constant pools are per function only when PC relative
9111 literal loads are true or we are in the large memory
9115 aarch64_can_use_per_function_literal_pools_p (void)
9117 return (aarch64_pcrelative_literal_loads
9118 || aarch64_cmodel
== AARCH64_CMODEL_LARGE
);
/* Return whether constants may be grouped into blocks -- presumably the
   TARGET_USE_BLOCKS_FOR_CONSTANT_P hook (the hook registration is outside
   this chunk; confirm).  Both parameters are unused here.  */
9122 aarch64_use_blocks_for_constant_p (machine_mode
, const_rtx
)
9124 /* We can't use blocks for constants when we're using a per-function
9126 return !aarch64_can_use_per_function_literal_pools_p ();
9129 /* Select appropriate section for constants depending
9130 on where we place literal pools. */
/* NOTE(review): the `rtx x' parameter line (original 9134) appears to have
   been lost in extraction -- `x' is used below.  */
9133 aarch64_select_rtx_section (machine_mode mode
,
9135 unsigned HOST_WIDE_INT align
)
/* Per-function pools live in the function's own code section.  */
9137 if (aarch64_can_use_per_function_literal_pools_p ())
9138 return function_section (current_function_decl
);
/* Otherwise defer to the generic ELF constant-section selection.  */
9140 return default_elf_select_rtx_section (mode
, x
, align
);
9143 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
9145 aarch64_asm_output_pool_epilogue (FILE *f
, const char *, tree
,
9146 HOST_WIDE_INT offset
)
9148 /* When using per-function literal pools, we must ensure that any code
9149 section is aligned to the minimal instruction length, lest we get
9150 errors from the assembler re "unaligned instructions". */
/* OFFSET & 3 detects a pool size that is not a multiple of 4; realign the
   output to 2^2 = 4 bytes in that case.  */
9151 if ((offset
& 3) && aarch64_can_use_per_function_literal_pools_p ())
9152 ASM_OUTPUT_ALIGN (f
, 2);
9157 /* Helper function for rtx cost calculation. Strip a shift expression
9158 from X. Returns the inner operand if successful, or the original
9159 expression on failure. */
9161 aarch64_strip_shift (rtx x
)
/* NOTE(review): the local `op' initialisation and the fall-through
   `return x;' implied by the gaps in the original numbering (9162-9164,
   9179-9180) are missing from this chunk.  */
9165 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
9166 we can convert both to ROR during final output. */
9167 if ((GET_CODE (op
) == ASHIFT
9168 || GET_CODE (op
) == ASHIFTRT
9169 || GET_CODE (op
) == LSHIFTRT
9170 || GET_CODE (op
) == ROTATERT
9171 || GET_CODE (op
) == ROTATE
)
9172 && CONST_INT_P (XEXP (op
, 1)))
9173 return XEXP (op
, 0);
/* A multiply by an exact power of two (below 2^64) is a shift in
   disguise; strip it too.  */
9175 if (GET_CODE (op
) == MULT
9176 && CONST_INT_P (XEXP (op
, 1))
9177 && ((unsigned) exact_log2 (INTVAL (XEXP (op
, 1)))) < 64)
9178 return XEXP (op
, 0);
9183 /* Helper function for rtx cost calculation. Strip an extend
9184 expression from X. Returns the inner operand if successful, or the
9185 original expression on failure. We deal with a number of possible
9186 canonicalization variations here. If STRIP_SHIFT is true, then
9187 we can strip off a shift also. */
9189 aarch64_strip_extend (rtx x
, bool strip_shift
)
/* NOTE(review): the `op' initialisation, early return, and several body
   lines implied by gaps in the original numbering (9190, 9192-9196, 9202,
   9217, 9221-9231) are missing from this chunk.  */
9191 scalar_int_mode mode
;
/* Only scalar integer modes have the extend forms handled below.  */
9194 if (!is_a
<scalar_int_mode
> (GET_MODE (op
), &mode
))
9197 /* Zero and sign extraction of a widened value. */
9198 if ((GET_CODE (op
) == ZERO_EXTRACT
|| GET_CODE (op
) == SIGN_EXTRACT
)
9199 && XEXP (op
, 2) == const0_rtx
9200 && GET_CODE (XEXP (op
, 0)) == MULT
9201 && aarch64_is_extend_from_extract (mode
, XEXP (XEXP (op
, 0), 1),
9203 return XEXP (XEXP (op
, 0), 0);
9205 /* It can also be represented (for zero-extend) as an AND with an
9207 if (GET_CODE (op
) == AND
9208 && GET_CODE (XEXP (op
, 0)) == MULT
9209 && CONST_INT_P (XEXP (XEXP (op
, 0), 1))
9210 && CONST_INT_P (XEXP (op
, 1))
/* aarch64_uxt_size validates the multiply/mask pair as a UXT[BHW].  */
9211 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op
, 0), 1))),
9212 INTVAL (XEXP (op
, 1))) != 0)
9213 return XEXP (XEXP (op
, 0), 0);
9215 /* Now handle extended register, as this may also have an optional
9216 left shift by 1..4. */
9218 && GET_CODE (op
) == ASHIFT
9219 && CONST_INT_P (XEXP (op
, 1))
9220 && ((unsigned HOST_WIDE_INT
) INTVAL (XEXP (op
, 1))) <= 4)
9223 if (GET_CODE (op
) == ZERO_EXTEND
9224 || GET_CODE (op
) == SIGN_EXTEND
)
9233 /* Return true iff CODE is a shift supported in combination
9234 with arithmetic instructions. */
9237 aarch64_shift_p (enum rtx_code code
)
/* Only plain left/right shifts qualify; rotates are not in the list.  */
9239 return code
== ASHIFT
|| code
== ASHIFTRT
|| code
== LSHIFTRT
;
9243 /* Return true iff X is a cheap shift without a sign extend. */
9246 aarch64_cheap_mult_shift_p (rtx x
)
/* NOTE(review): the op0/op1 initialisations implied by the gaps in the
   original numbering (9247-9252) are missing from this chunk.  */
/* The tuning must explicitly declare shift+extend as cheap.  */
9253 if (!(aarch64_tune_params
.extra_tuning_flags
9254 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
))
/* A sign-extended first operand disqualifies the cheap case.  */
9257 if (GET_CODE (op0
) == SIGN_EXTEND
)
/* A left shift by a constant of at most 4 is cheap.  */
9260 if (GET_CODE (x
) == ASHIFT
&& CONST_INT_P (op1
)
9261 && UINTVAL (op1
) <= 4)
9264 if (GET_CODE (x
) != MULT
|| !CONST_INT_P (op1
))
/* A multiply by 2, 4, 8 or 16 is likewise a cheap shift.  */
9267 HOST_WIDE_INT l2
= exact_log2 (INTVAL (op1
))
;
9269 if (l2
> 0 && l2
<= 4)
9275 /* Helper function for rtx cost calculation. Calculate the cost of
9276 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
9277 Return the calculated cost of the expression, recursing manually in to
9278 operands where needed. */
/* NOTE(review): lossy extraction -- the `cost'/op0/op1 declarations,
   braces, else-branches and returns implied by the many gaps in the
   original numbering (e.g. 9282-9283, 9286, 9291-9294, 9305-9340) are
   missing from this chunk; treat the visible control flow as partial.  */
9281 aarch64_rtx_mult_cost (rtx x
, enum rtx_code code
, int outer
, bool speed
)
9284 const struct cpu_cost_table
*extra_cost
9285 = aarch64_tune_params
.insn_extra_cost
;
/* COMPOUND_P: this MULT/shift is nested in a PLUS/MINUS (MADD/MSUB-like).  */
9287 bool compound_p
= (outer
== PLUS
|| outer
== MINUS
);
9288 machine_mode mode
= GET_MODE (x
);
9290 gcc_checking_assert (code
== MULT
);
/* For vectors, cost via the element mode.  */
9295 if (VECTOR_MODE_P (mode
))
9296 mode
= GET_MODE_INNER (mode
);
9298 /* Integer multiply/fma. */
9299 if (GET_MODE_CLASS (mode
) == MODE_INT
)
9301 /* The multiply will be canonicalized as a shift, cost it as such. */
9302 if (aarch64_shift_p (GET_CODE (x
))
9303 || (CONST_INT_P (op1
)
9304 && exact_log2 (INTVAL (op1
)) > 0))
9306 bool is_extend
= GET_CODE (op0
) == ZERO_EXTEND
9307 || GET_CODE (op0
) == SIGN_EXTEND
;
9312 /* If the shift is considered cheap,
9313 then don't add any cost. */
9314 if (aarch64_cheap_mult_shift_p (x
))
9316 else if (REG_P (op1
))
9317 /* ARITH + shift-by-register. */
9318 cost
+= extra_cost
->alu
.arith_shift_reg
;
9320 /* ARITH + extended register. We don't have a cost field
9321 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
9322 cost
+= extra_cost
->alu
.extend_arith
;
9324 /* ARITH + shift-by-immediate. */
9325 cost
+= extra_cost
->alu
.arith_shift
;
9328 /* LSL (immediate). */
9329 cost
+= extra_cost
->alu
.shift
;
9332 /* Strip extends as we will have costed them in the case above. */
9334 op0
= aarch64_strip_extend (op0
, true);
9336 cost
+= rtx_cost (op0
, VOIDmode
, code
, 0, speed
);
9341 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
9342 compound and let the below cases handle it. After all, MNEG is a
9343 special-case alias of MSUB. */
9344 if (GET_CODE (op0
) == NEG
)
9346 op0
= XEXP (op0
, 0);
9350 /* Integer multiplies or FMAs have zero/sign extending variants. */
9351 if ((GET_CODE (op0
) == ZERO_EXTEND
9352 && GET_CODE (op1
) == ZERO_EXTEND
)
9353 || (GET_CODE (op0
) == SIGN_EXTEND
9354 && GET_CODE (op1
) == SIGN_EXTEND
))
9356 cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, MULT
, 0, speed
);
9357 cost
+= rtx_cost (XEXP (op1
, 0), VOIDmode
, MULT
, 1, speed
);
9362 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
9363 cost
+= extra_cost
->mult
[0].extend_add
;
9365 /* MUL/SMULL/UMULL. */
9366 cost
+= extra_cost
->mult
[0].extend
;
9372 /* This is either an integer multiply or a MADD. In both cases
9373 we want to recurse and cost the operands. */
9374 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
9375 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
/* mult[1] is the DImode table entry, mult[0] the SImode one.  */
9381 cost
+= extra_cost
->mult
[mode
== DImode
].add
;
9384 cost
+= extra_cost
->mult
[mode
== DImode
].simple
;
9393 /* Floating-point FMA/FMUL can also support negations of the
9394 operands, unless the rounding mode is upward or downward in
9395 which case FNMUL is different than FMUL with operand negation. */
9396 bool neg0
= GET_CODE (op0
) == NEG
;
9397 bool neg1
= GET_CODE (op1
) == NEG
;
9398 if (compound_p
|| !flag_rounding_math
|| (neg0
&& neg1
))
9401 op0
= XEXP (op0
, 0);
9403 op1
= XEXP (op1
, 0);
9407 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9408 cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
9411 cost
+= extra_cost
->fp
[mode
== DFmode
].mult
;
9414 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
9415 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
/* Compute the cost of address X in MODE for the current tuning: classify
   the address and sum the matching cpu_addrcost_table fields.
   NOTE(review): lossy extraction -- the machine_mode/speed parameters,
   the `cost' declaration, switch head, break statements and closing
   return implied by the gaps in the original numbering are missing from
   this chunk.  */
9421 aarch64_address_cost (rtx x
,
9423 addr_space_t as ATTRIBUTE_UNUSED
,
9426 enum rtx_code c
= GET_CODE (x
);
9427 const struct cpu_addrcost_table
*addr_cost
= aarch64_tune_params
.addr_cost
;
9428 struct aarch64_address_info info
;
/* Addresses that fail classification are costed specially below.  */
9432 if (!aarch64_classify_address (&info
, x
, mode
, false))
9434 if (GET_CODE (x
) == CONST
|| GET_CODE (x
) == SYMBOL_REF
)
9436 /* This is a CONST or SYMBOL ref which will be split
9437 in a different way depending on the code model in use.
9438 Cost it through the generic infrastructure. */
9439 int cost_symbol_ref
= rtx_cost (x
, Pmode
, MEM
, 1, speed
);
9440 /* Divide through by the cost of one instruction to
9441 bring it to the same units as the address costs. */
9442 cost_symbol_ref
/= COSTS_N_INSNS (1);
9443 /* The cost is then the cost of preparing the address,
9444 followed by an immediate (possibly 0) offset. */
9445 return cost_symbol_ref
+ addr_cost
->imm_offset
;
9449 /* This is most likely a jump table from a case
9451 return addr_cost
->register_offset
;
/* Classified addresses: dispatch on the address form.  */
9457 case ADDRESS_LO_SUM
:
9458 case ADDRESS_SYMBOLIC
:
9459 case ADDRESS_REG_IMM
:
9460 cost
+= addr_cost
->imm_offset
;
9463 case ADDRESS_REG_WB
:
9464 if (c
== PRE_INC
|| c
== PRE_DEC
|| c
== PRE_MODIFY
)
9465 cost
+= addr_cost
->pre_modify
;
9466 else if (c
== POST_INC
|| c
== POST_DEC
|| c
== POST_MODIFY
)
9467 cost
+= addr_cost
->post_modify
;
9473 case ADDRESS_REG_REG
:
9474 cost
+= addr_cost
->register_offset
;
9477 case ADDRESS_REG_SXTW
:
9478 cost
+= addr_cost
->register_sextend
;
9481 case ADDRESS_REG_UXTW
:
9482 cost
+= addr_cost
->register_zextend
;
9492 /* For the sake of calculating the cost of the shifted register
9493 component, we can treat same sized modes in the same way. */
9494 if (known_eq (GET_MODE_BITSIZE (mode
), 16))
9495 cost
+= addr_cost
->addr_scale_costs
.hi
;
9496 else if (known_eq (GET_MODE_BITSIZE (mode
), 32))
9497 cost
+= addr_cost
->addr_scale_costs
.si
;
9498 else if (known_eq (GET_MODE_BITSIZE (mode
), 64))
9499 cost
+= addr_cost
->addr_scale_costs
.di
;
9501 /* We can't tell, or this is a 128-bit vector. */
9502 cost
+= addr_cost
->addr_scale_costs
.ti
;
9508 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9509 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9513 aarch64_branch_cost (bool speed_p
, bool predictable_p
)
9515 /* When optimizing for speed, use the cost of unpredictable branches. */
9516 const struct cpu_branch_cost
*branch_costs
=
9517 aarch64_tune_params
.branch_costs
;
/* Predictable branches -- and all branches when not optimizing for
   speed -- use the cheaper figure.  */
9519 if (!speed_p
|| predictable_p
)
9520 return branch_costs
->predictable
;
9522 return branch_costs
->unpredictable
;
9525 /* Return true if the RTX X in mode MODE is a zero or sign extract
9526 usable in an ADD or SUB (extended register) instruction. */
/* NOTE(review): lossy extraction -- braces, the trailing argument of
   aarch64_is_extend_from_extract (original 9544-9545) and the final
   `return false' are missing from this chunk.  */
9528 aarch64_rtx_arith_op_extract_p (rtx x
, scalar_int_mode mode
)
9530 /* Catch add with a sign extract.
9531 This is add_<optab><mode>_multp2. */
9532 if (GET_CODE (x
) == SIGN_EXTRACT
9533 || GET_CODE (x
) == ZERO_EXTRACT
)
9535 rtx op0
= XEXP (x
, 0);
9536 rtx op1
= XEXP (x
, 1);
9537 rtx op2
= XEXP (x
, 2);
/* Extract-of-multiply at bit position 0 with constant operands is the
   canonical form of an extended-register operand.  */
9539 if (GET_CODE (op0
) == MULT
9540 && CONST_INT_P (op1
)
9541 && op2
== const0_rtx
9542 && CONST_INT_P (XEXP (op0
, 1))
9543 && aarch64_is_extend_from_extract (mode
,
9550 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
9552 else if (GET_CODE (x
) == SIGN_EXTEND
9553 || GET_CODE (x
) == ZERO_EXTEND
)
9554 return REG_P (XEXP (x
, 0));
/* NOTE(review): only the signature of this predicate survived extraction;
   its doc comment and body (original lines around 9556-9576) are missing
   from this chunk -- recover them from upstream aarch64.c.  */
9560 aarch64_frint_unspec_p (unsigned int u
)
9578 /* Return true iff X is an rtx that will match an extr instruction
9579 i.e. as described in the *extr<mode>5_insn family of patterns.
9580 OP0 and OP1 will be set to the operands of the shifts involved
9581 on success and will be NULL_RTX otherwise. */
9584 aarch64_extr_rtx_p (rtx x
, rtx
*res_op0
, rtx
*res_op1
)
9587 scalar_int_mode mode
;
9588 if (!is_a
<scalar_int_mode
> (GET_MODE (x
), &mode
))
9591 *res_op0
= NULL_RTX
;
9592 *res_op1
= NULL_RTX
;
9594 if (GET_CODE (x
) != IOR
)
9600 if ((GET_CODE (op0
) == ASHIFT
&& GET_CODE (op1
) == LSHIFTRT
)
9601 || (GET_CODE (op1
) == ASHIFT
&& GET_CODE (op0
) == LSHIFTRT
))
9603 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
9604 if (GET_CODE (op1
) == ASHIFT
)
9605 std::swap (op0
, op1
);
9607 if (!CONST_INT_P (XEXP (op0
, 1)) || !CONST_INT_P (XEXP (op1
, 1)))
9610 unsigned HOST_WIDE_INT shft_amnt_0
= UINTVAL (XEXP (op0
, 1));
9611 unsigned HOST_WIDE_INT shft_amnt_1
= UINTVAL (XEXP (op1
, 1));
9613 if (shft_amnt_0
< GET_MODE_BITSIZE (mode
)
9614 && shft_amnt_0
+ shft_amnt_1
== GET_MODE_BITSIZE (mode
))
9616 *res_op0
= XEXP (op0
, 0);
9617 *res_op1
= XEXP (op1
, 0);
9625 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
9626 storing it in *COST. Result is true if the total cost of the operation
9627 has now been calculated. */
/* NOTE(review): lossy extraction -- the `inner'/`comparator' declarations,
   braces, else-branches and `return true/false' statements implied by the
   many gaps in the original numbering are missing from this chunk; treat
   the visible control flow as partial.  */
9629 aarch64_if_then_else_costs (rtx op0
, rtx op1
, rtx op2
, int *cost
, bool speed
)
9633 enum rtx_code cmpcode
;
/* Split a comparison condition into its pieces; otherwise compare
   against zero.  */
9635 if (COMPARISON_P (op0
))
9637 inner
= XEXP (op0
, 0);
9638 comparator
= XEXP (op0
, 1);
9639 cmpcode
= GET_CODE (op0
);
9644 comparator
= const0_rtx
;
9648 if (GET_CODE (op1
) == PC
|| GET_CODE (op2
) == PC
)
9650 /* Conditional branch. */
9651 if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
9655 if (cmpcode
== NE
|| cmpcode
== EQ
)
9657 if (comparator
== const0_rtx
)
9659 /* TBZ/TBNZ/CBZ/CBNZ. */
9660 if (GET_CODE (inner
) == ZERO_EXTRACT
)
9662 *cost
+= rtx_cost (XEXP (inner
, 0), VOIDmode
,
9663 ZERO_EXTRACT
, 0, speed
);
9666 *cost
+= rtx_cost (inner
, VOIDmode
, cmpcode
, 0, speed
);
9671 else if (cmpcode
== LT
|| cmpcode
== GE
)
9674 if (comparator
== const0_rtx
)
9679 else if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
9682 if (GET_CODE (op1
) == COMPARE
)
9684 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
9685 if (XEXP (op1
, 1) == const0_rtx
)
9689 machine_mode mode
= GET_MODE (XEXP (op1
, 0));
9690 const struct cpu_cost_table
*extra_cost
9691 = aarch64_tune_params
.insn_extra_cost
;
9693 if (GET_MODE_CLASS (mode
) == MODE_INT
)
9694 *cost
+= extra_cost
->alu
.arith
;
9696 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
9701 /* It's a conditional operation based on the status flags,
9702 so it must be some flavor of CSEL. */
9704 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
9705 if (GET_CODE (op1
) == NEG
9706 || GET_CODE (op1
) == NOT
9707 || (GET_CODE (op1
) == PLUS
&& XEXP (op1
, 1) == const1_rtx
))
9708 op1
= XEXP (op1
, 0);
9709 else if (GET_CODE (op1
) == ZERO_EXTEND
&& GET_CODE (op2
) == ZERO_EXTEND
)
9711 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
9712 op1
= XEXP (op1
, 0);
9713 op2
= XEXP (op2
, 0);
9716 *cost
+= rtx_cost (op1
, VOIDmode
, IF_THEN_ELSE
, 1, speed
);
9717 *cost
+= rtx_cost (op2
, VOIDmode
, IF_THEN_ELSE
, 2, speed
);
9721 /* We don't know what this is, cost all operands. */
9725 /* Check whether X is a bitfield operation of the form shift + extend that
9726 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
9727 operand to which the bitfield operation is applied. Otherwise return
/* NOTE(review): lossy extraction -- the return type, the `op'
   initialisation, the switch on inner_code whose cases these three
   identical-looking if-blocks belong to, and the final return are all
   missing from this chunk (gaps at original 9728-9732, 9743-9747,
   9751-9757, 9761-9767).  */
9731 aarch64_extend_bitfield_pattern_p (rtx x
)
9733 rtx_code outer_code
= GET_CODE (x
);
9734 machine_mode outer_mode
= GET_MODE (x
);
/* Only extends producing SImode/DImode values are candidates.  */
9736 if (outer_code
!= ZERO_EXTEND
&& outer_code
!= SIGN_EXTEND
9737 && outer_mode
!= SImode
&& outer_mode
!= DImode
)
9740 rtx inner
= XEXP (x
, 0);
9741 rtx_code inner_code
= GET_CODE (inner
);
9742 machine_mode inner_mode
= GET_MODE (inner
);
/* Inner operation: constant shift amount on a QImode/HImode value.  */
9748 if (CONST_INT_P (XEXP (inner
, 1))
9749 && (inner_mode
== QImode
|| inner_mode
== HImode
))
9750 op
= XEXP (inner
, 0);
9753 if (outer_code
== ZERO_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
9754 && (inner_mode
== QImode
|| inner_mode
== HImode
))
9755 op
= XEXP (inner
, 0);
9758 if (outer_code
== SIGN_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
9759 && (inner_mode
== QImode
|| inner_mode
== HImode
))
9760 op
= XEXP (inner
, 0);
9769 /* Return true if the mask and a shift amount from an RTX of the form
9770 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
9771 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
/* NOTE(review): the `rtx shft_amnt' parameter line and the opening of the
   return expression's final conjunct (original 9775-9776, 9780) are
   missing from this chunk.  */
9774 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode
, rtx mask
,
/* MASK shifted right by SHFT_AMNT must be 2^k - 1 (a contiguous run of
   low bits), and no mask bit may lie below the shift amount.  */
9777 return CONST_INT_P (mask
) && CONST_INT_P (shft_amnt
)
9778 && INTVAL (shft_amnt
) < GET_MODE_BITSIZE (mode
)
9779 && exact_log2 ((INTVAL (mask
) >> INTVAL (shft_amnt
)) + 1) >= 0
9781 & ((HOST_WIDE_INT_1U
<< INTVAL (shft_amnt
)) - 1)) == 0;
9784 /* Return true if the masks and a shift amount from an RTX of the form
9785 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
9786 a BFI instruction of mode MODE. See *arch64_bfi patterns. */
9789 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode
,
9790 unsigned HOST_WIDE_INT mask1
,
9791 unsigned HOST_WIDE_INT shft_amnt
,
9792 unsigned HOST_WIDE_INT mask2
)
9794 unsigned HOST_WIDE_INT t
;
9796 /* Verify that there is no overlap in what bits are set in the two masks. */
/* MASK1 must be exactly the complement of MASK2.  */
9797 if (mask1
!= ~mask2
)
9800 /* Verify that mask2 is not all zeros or ones. */
9801 if (mask2
== 0 || mask2
== HOST_WIDE_INT_M1U
)
9804 /* The shift amount should always be less than the mode size. */
9805 gcc_assert (shft_amnt
< GET_MODE_BITSIZE (mode
));
9807 /* Verify that the mask being shifted is contiguous and would be in the
9808 least significant bits after shifting by shft_amnt. */
/* t == t & -t holds iff t has at most one bit set, i.e. MASK2 plus the
   shifted-in low bit carries out cleanly.  */
9809 t
= mask2
+ (HOST_WIDE_INT_1U
<< shft_amnt
);
9810 return (t
== (t
& -t
));
9813 /* Calculate the cost of calculating X, storing it in *COST. Result
9814 is true if the total cost of the operation has now been calculated. */
9816 aarch64_rtx_costs (rtx x
, machine_mode mode
, int outer ATTRIBUTE_UNUSED
,
9817 int param ATTRIBUTE_UNUSED
, int *cost
, bool speed
)
9820 const struct cpu_cost_table
*extra_cost
9821 = aarch64_tune_params
.insn_extra_cost
;
9822 int code
= GET_CODE (x
);
9823 scalar_int_mode int_mode
;
9825 /* By default, assume that everything has equivalent cost to the
9826 cheapest instruction. Any additional costs are applied as a delta
9827 above this default. */
9828 *cost
= COSTS_N_INSNS (1);
9833 /* The cost depends entirely on the operands to SET. */
9838 switch (GET_CODE (op0
))
9843 rtx address
= XEXP (op0
, 0);
9844 if (VECTOR_MODE_P (mode
))
9845 *cost
+= extra_cost
->ldst
.storev
;
9846 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
9847 *cost
+= extra_cost
->ldst
.store
;
9848 else if (mode
== SFmode
)
9849 *cost
+= extra_cost
->ldst
.storef
;
9850 else if (mode
== DFmode
)
9851 *cost
+= extra_cost
->ldst
.stored
;
9854 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
9858 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
9862 if (! REG_P (SUBREG_REG (op0
)))
9863 *cost
+= rtx_cost (SUBREG_REG (op0
), VOIDmode
, SET
, 0, speed
);
9867 /* The cost is one per vector-register copied. */
9868 if (VECTOR_MODE_P (GET_MODE (op0
)) && REG_P (op1
))
9870 int nregs
= aarch64_hard_regno_nregs (V0_REGNUM
, GET_MODE (op0
));
9871 *cost
= COSTS_N_INSNS (nregs
);
9873 /* const0_rtx is in general free, but we will use an
9874 instruction to set a register to 0. */
9875 else if (REG_P (op1
) || op1
== const0_rtx
)
9877 /* The cost is 1 per register copied. */
9878 int nregs
= aarch64_hard_regno_nregs (R0_REGNUM
, GET_MODE (op0
));
9879 *cost
= COSTS_N_INSNS (nregs
);
9882 /* Cost is just the cost of the RHS of the set. */
9883 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
9888 /* Bit-field insertion. Strip any redundant widening of
9889 the RHS to meet the width of the target. */
9890 if (GET_CODE (op1
) == SUBREG
)
9891 op1
= SUBREG_REG (op1
);
9892 if ((GET_CODE (op1
) == ZERO_EXTEND
9893 || GET_CODE (op1
) == SIGN_EXTEND
)
9894 && CONST_INT_P (XEXP (op0
, 1))
9895 && is_a
<scalar_int_mode
> (GET_MODE (XEXP (op1
, 0)), &int_mode
)
9896 && GET_MODE_BITSIZE (int_mode
) >= INTVAL (XEXP (op0
, 1)))
9897 op1
= XEXP (op1
, 0);
9899 if (CONST_INT_P (op1
))
9901 /* MOV immediate is assumed to always be cheap. */
9902 *cost
= COSTS_N_INSNS (1);
9908 *cost
+= extra_cost
->alu
.bfi
;
9909 *cost
+= rtx_cost (op1
, VOIDmode
, (enum rtx_code
) code
, 1, speed
);
9915 /* We can't make sense of this, assume default cost. */
9916 *cost
= COSTS_N_INSNS (1);
9922 /* If an instruction can incorporate a constant within the
9923 instruction, the instruction's expression avoids calling
9924 rtx_cost() on the constant. If rtx_cost() is called on a
9925 constant, then it is usually because the constant must be
9926 moved into a register by one or more instructions.
9928 The exception is constant 0, which can be expressed
9929 as XZR/WZR and is therefore free. The exception to this is
9930 if we have (set (reg) (const0_rtx)) in which case we must cost
9931 the move. However, we can catch that when we cost the SET, so
9932 we don't need to consider that here. */
9933 if (x
== const0_rtx
)
9937 /* To an approximation, building any other constant is
9938 proportionally expensive to the number of instructions
9939 required to build that constant. This is true whether we
9940 are compiling for SPEED or otherwise. */
9941 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
9942 int_mode
= word_mode
;
9943 *cost
= COSTS_N_INSNS (aarch64_internal_mov_immediate
9944 (NULL_RTX
, x
, false, int_mode
));
9950 /* First determine number of instructions to do the move
9951 as an integer constant. */
9952 if (!aarch64_float_const_representable_p (x
)
9953 && !aarch64_can_const_movi_rtx_p (x
, mode
)
9954 && aarch64_float_const_rtx_p (x
))
9956 unsigned HOST_WIDE_INT ival
;
9957 bool succeed
= aarch64_reinterpret_float_as_int (x
, &ival
);
9958 gcc_assert (succeed
);
9960 scalar_int_mode imode
= (mode
== HFmode
9962 : int_mode_for_mode (mode
).require ());
9963 int ncost
= aarch64_internal_mov_immediate
9964 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
9965 *cost
+= COSTS_N_INSNS (ncost
);
9971 /* mov[df,sf]_aarch64. */
9972 if (aarch64_float_const_representable_p (x
))
9973 /* FMOV (scalar immediate). */
9974 *cost
+= extra_cost
->fp
[mode
== DFmode
].fpconst
;
9975 else if (!aarch64_float_const_zero_rtx_p (x
))
9977 /* This will be a load from memory. */
9979 *cost
+= extra_cost
->ldst
.loadd
;
9981 *cost
+= extra_cost
->ldst
.loadf
;
9984 /* Otherwise this is +0.0. We get this using MOVI d0, #0
9985 or MOV v0.s[0], wzr - neither of which are modeled by the
9986 cost tables. Just use the default cost. */
9996 /* For loads we want the base cost of a load, plus an
9997 approximation for the additional cost of the addressing
9999 rtx address
= XEXP (x
, 0);
10000 if (VECTOR_MODE_P (mode
))
10001 *cost
+= extra_cost
->ldst
.loadv
;
10002 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10003 *cost
+= extra_cost
->ldst
.load
;
10004 else if (mode
== SFmode
)
10005 *cost
+= extra_cost
->ldst
.loadf
;
10006 else if (mode
== DFmode
)
10007 *cost
+= extra_cost
->ldst
.loadd
;
10010 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
10019 if (VECTOR_MODE_P (mode
))
10024 *cost
+= extra_cost
->vect
.alu
;
10029 if (GET_MODE_CLASS (mode
) == MODE_INT
)
10031 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
10032 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
10035 *cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, NEG
, 0, speed
);
10039 /* Cost this as SUB wzr, X. */
10040 op0
= CONST0_RTX (mode
);
10045 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
10047 /* Support (neg(fma...)) as a single instruction only if
10048 sign of zeros is unimportant. This matches the decision
10049 making in aarch64.md. */
10050 if (GET_CODE (op0
) == FMA
&& !HONOR_SIGNED_ZEROS (GET_MODE (op0
)))
10053 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
10056 if (GET_CODE (op0
) == MULT
)
10059 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
10064 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
10074 if (VECTOR_MODE_P (mode
))
10075 *cost
+= extra_cost
->vect
.alu
;
10077 *cost
+= extra_cost
->alu
.clz
;
10086 if (op1
== const0_rtx
10087 && GET_CODE (op0
) == AND
)
10090 mode
= GET_MODE (op0
);
10094 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
)
10096 /* TODO: A write to the CC flags possibly costs extra, this
10097 needs encoding in the cost tables. */
10099 mode
= GET_MODE (op0
);
10101 if (GET_CODE (op0
) == AND
)
10107 if (GET_CODE (op0
) == PLUS
)
10109 /* ADDS (and CMN alias). */
10114 if (GET_CODE (op0
) == MINUS
)
10121 if (GET_CODE (op0
) == ZERO_EXTRACT
&& op1
== const0_rtx
10122 && GET_MODE (x
) == CC_NZmode
&& CONST_INT_P (XEXP (op0
, 1))
10123 && CONST_INT_P (XEXP (op0
, 2)))
10125 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
10126 Handle it here directly rather than going to cost_logic
10127 since we know the immediate generated for the TST is valid
10128 so we can avoid creating an intermediate rtx for it only
10129 for costing purposes. */
10131 *cost
+= extra_cost
->alu
.logical
;
10133 *cost
+= rtx_cost (XEXP (op0
, 0), GET_MODE (op0
),
10134 ZERO_EXTRACT
, 0, speed
);
10138 if (GET_CODE (op1
) == NEG
)
10142 *cost
+= extra_cost
->alu
.arith
;
10144 *cost
+= rtx_cost (op0
, mode
, COMPARE
, 0, speed
);
10145 *cost
+= rtx_cost (XEXP (op1
, 0), mode
, NEG
, 1, speed
);
10151 Compare can freely swap the order of operands, and
10152 canonicalization puts the more complex operation first.
10153 But the integer MINUS logic expects the shift/extend
10154 operation in op1. */
10156 || (GET_CODE (op0
) == SUBREG
&& REG_P (SUBREG_REG (op0
)))))
10164 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_FLOAT
)
10168 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
10170 if (CONST_DOUBLE_P (op1
) && aarch64_float_const_zero_rtx_p (op1
))
10172 *cost
+= rtx_cost (op0
, VOIDmode
, COMPARE
, 0, speed
);
10173 /* FCMP supports constant 0.0 for no extra cost. */
10179 if (VECTOR_MODE_P (mode
))
10181 /* Vector compare. */
10183 *cost
+= extra_cost
->vect
.alu
;
10185 if (aarch64_float_const_zero_rtx_p (op1
))
10187 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
10201 *cost
+= rtx_cost (op0
, mode
, MINUS
, 0, speed
);
10203 /* Detect valid immediates. */
10204 if ((GET_MODE_CLASS (mode
) == MODE_INT
10205 || (GET_MODE_CLASS (mode
) == MODE_CC
10206 && GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
))
10207 && CONST_INT_P (op1
)
10208 && aarch64_uimm12_shift (INTVAL (op1
)))
10211 /* SUB(S) (immediate). */
10212 *cost
+= extra_cost
->alu
.arith
;
10216 /* Look for SUB (extended register). */
10217 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
10218 && aarch64_rtx_arith_op_extract_p (op1
, int_mode
))
10221 *cost
+= extra_cost
->alu
.extend_arith
;
10223 op1
= aarch64_strip_extend (op1
, true);
10224 *cost
+= rtx_cost (op1
, VOIDmode
,
10225 (enum rtx_code
) GET_CODE (op1
), 0, speed
);
10229 rtx new_op1
= aarch64_strip_extend (op1
, false);
10231 /* Cost this as an FMA-alike operation. */
10232 if ((GET_CODE (new_op1
) == MULT
10233 || aarch64_shift_p (GET_CODE (new_op1
)))
10234 && code
!= COMPARE
)
10236 *cost
+= aarch64_rtx_mult_cost (new_op1
, MULT
,
10237 (enum rtx_code
) code
,
10242 *cost
+= rtx_cost (new_op1
, VOIDmode
, MINUS
, 1, speed
);
10246 if (VECTOR_MODE_P (mode
))
10249 *cost
+= extra_cost
->vect
.alu
;
10251 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10254 *cost
+= extra_cost
->alu
.arith
;
10256 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
10259 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
10273 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
10274 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
10277 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, PLUS
, 0, speed
);
10278 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
10282 if (GET_MODE_CLASS (mode
) == MODE_INT
10283 && ((CONST_INT_P (op1
) && aarch64_uimm12_shift (INTVAL (op1
)))
10284 || aarch64_sve_addvl_addpl_immediate (op1
, mode
)))
10286 *cost
+= rtx_cost (op0
, mode
, PLUS
, 0, speed
);
10289 /* ADD (immediate). */
10290 *cost
+= extra_cost
->alu
.arith
;
10294 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
10296 /* Look for ADD (extended register). */
10297 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
10298 && aarch64_rtx_arith_op_extract_p (op0
, int_mode
))
10301 *cost
+= extra_cost
->alu
.extend_arith
;
10303 op0
= aarch64_strip_extend (op0
, true);
10304 *cost
+= rtx_cost (op0
, VOIDmode
,
10305 (enum rtx_code
) GET_CODE (op0
), 0, speed
);
10309 /* Strip any extend, leave shifts behind as we will
10310 cost them through mult_cost. */
10311 new_op0
= aarch64_strip_extend (op0
, false);
10313 if (GET_CODE (new_op0
) == MULT
10314 || aarch64_shift_p (GET_CODE (new_op0
)))
10316 *cost
+= aarch64_rtx_mult_cost (new_op0
, MULT
, PLUS
,
10321 *cost
+= rtx_cost (new_op0
, VOIDmode
, PLUS
, 0, speed
);
10325 if (VECTOR_MODE_P (mode
))
10328 *cost
+= extra_cost
->vect
.alu
;
10330 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10333 *cost
+= extra_cost
->alu
.arith
;
10335 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
10338 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
10345 *cost
= COSTS_N_INSNS (1);
10349 if (VECTOR_MODE_P (mode
))
10350 *cost
+= extra_cost
->vect
.alu
;
10352 *cost
+= extra_cost
->alu
.rev
;
10357 if (aarch_rev16_p (x
))
10359 *cost
= COSTS_N_INSNS (1);
10363 if (VECTOR_MODE_P (mode
))
10364 *cost
+= extra_cost
->vect
.alu
;
10366 *cost
+= extra_cost
->alu
.rev
;
10371 if (aarch64_extr_rtx_p (x
, &op0
, &op1
))
10373 *cost
+= rtx_cost (op0
, mode
, IOR
, 0, speed
);
10374 *cost
+= rtx_cost (op1
, mode
, IOR
, 1, speed
);
10376 *cost
+= extra_cost
->alu
.shift
;
10380 /* Fall through. */
10387 if (VECTOR_MODE_P (mode
))
10390 *cost
+= extra_cost
->vect
.alu
;
10395 && GET_CODE (op0
) == MULT
10396 && CONST_INT_P (XEXP (op0
, 1))
10397 && CONST_INT_P (op1
)
10398 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0
, 1))),
10399 INTVAL (op1
)) != 0)
10401 /* This is a UBFM/SBFM. */
10402 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, ZERO_EXTRACT
, 0, speed
);
10404 *cost
+= extra_cost
->alu
.bfx
;
10408 if (is_int_mode (mode
, &int_mode
))
10410 if (CONST_INT_P (op1
))
10412 /* We have a mask + shift version of a UBFIZ
10413 i.e. the *andim_ashift<mode>_bfiz pattern. */
10414 if (GET_CODE (op0
) == ASHIFT
10415 && aarch64_mask_and_shift_for_ubfiz_p (int_mode
, op1
,
10418 *cost
+= rtx_cost (XEXP (op0
, 0), int_mode
,
10419 (enum rtx_code
) code
, 0, speed
);
10421 *cost
+= extra_cost
->alu
.bfx
;
10425 else if (aarch64_bitmask_imm (INTVAL (op1
), int_mode
))
10427 /* We possibly get the immediate for free, this is not
10429 *cost
+= rtx_cost (op0
, int_mode
,
10430 (enum rtx_code
) code
, 0, speed
);
10432 *cost
+= extra_cost
->alu
.logical
;
10441 /* Handle ORN, EON, or BIC. */
10442 if (GET_CODE (op0
) == NOT
)
10443 op0
= XEXP (op0
, 0);
10445 new_op0
= aarch64_strip_shift (op0
);
10447 /* If we had a shift on op0 then this is a logical-shift-
10448 by-register/immediate operation. Otherwise, this is just
10449 a logical operation. */
10452 if (new_op0
!= op0
)
10454 /* Shift by immediate. */
10455 if (CONST_INT_P (XEXP (op0
, 1)))
10456 *cost
+= extra_cost
->alu
.log_shift
;
10458 *cost
+= extra_cost
->alu
.log_shift_reg
;
10461 *cost
+= extra_cost
->alu
.logical
;
10464 /* In both cases we want to cost both operands. */
10465 *cost
+= rtx_cost (new_op0
, int_mode
, (enum rtx_code
) code
,
10467 *cost
+= rtx_cost (op1
, int_mode
, (enum rtx_code
) code
,
10477 op0
= aarch64_strip_shift (x
);
10479 if (VECTOR_MODE_P (mode
))
10482 *cost
+= extra_cost
->vect
.alu
;
10486 /* MVN-shifted-reg. */
10489 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
10492 *cost
+= extra_cost
->alu
.log_shift
;
10496 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10497 Handle the second form here taking care that 'a' in the above can
10499 else if (GET_CODE (op0
) == XOR
)
10501 rtx newop0
= XEXP (op0
, 0);
10502 rtx newop1
= XEXP (op0
, 1);
10503 rtx op0_stripped
= aarch64_strip_shift (newop0
);
10505 *cost
+= rtx_cost (newop1
, mode
, (enum rtx_code
) code
, 1, speed
);
10506 *cost
+= rtx_cost (op0_stripped
, mode
, XOR
, 0, speed
);
10510 if (op0_stripped
!= newop0
)
10511 *cost
+= extra_cost
->alu
.log_shift
;
10513 *cost
+= extra_cost
->alu
.logical
;
10520 *cost
+= extra_cost
->alu
.logical
;
10527 /* If a value is written in SI mode, then zero extended to DI
10528 mode, the operation will in general be free as a write to
10529 a 'w' register implicitly zeroes the upper bits of an 'x'
10530 register. However, if this is
10532 (set (reg) (zero_extend (reg)))
10534 we must cost the explicit register move. */
10536 && GET_MODE (op0
) == SImode
10539 int op_cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, 0, speed
);
10541 /* If OP_COST is non-zero, then the cost of the zero extend
10542 is effectively the cost of the inner operation. Otherwise
10543 we have a MOV instruction and we take the cost from the MOV
10544 itself. This is true independently of whether we are
10545 optimizing for space or time. */
10551 else if (MEM_P (op0
))
10553 /* All loads can zero extend to any size for free. */
10554 *cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, param
, speed
);
10558 op0
= aarch64_extend_bitfield_pattern_p (x
);
10561 *cost
+= rtx_cost (op0
, mode
, ZERO_EXTEND
, 0, speed
);
10563 *cost
+= extra_cost
->alu
.bfx
;
10569 if (VECTOR_MODE_P (mode
))
10572 *cost
+= extra_cost
->vect
.alu
;
10576 /* We generate an AND instead of UXTB/UXTH. */
10577 *cost
+= extra_cost
->alu
.logical
;
10583 if (MEM_P (XEXP (x
, 0)))
10588 rtx address
= XEXP (XEXP (x
, 0), 0);
10589 *cost
+= extra_cost
->ldst
.load_sign_extend
;
10592 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
10598 op0
= aarch64_extend_bitfield_pattern_p (x
);
10601 *cost
+= rtx_cost (op0
, mode
, SIGN_EXTEND
, 0, speed
);
10603 *cost
+= extra_cost
->alu
.bfx
;
10609 if (VECTOR_MODE_P (mode
))
10610 *cost
+= extra_cost
->vect
.alu
;
10612 *cost
+= extra_cost
->alu
.extend
;
10620 if (CONST_INT_P (op1
))
10624 if (VECTOR_MODE_P (mode
))
10626 /* Vector shift (immediate). */
10627 *cost
+= extra_cost
->vect
.alu
;
10631 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
10633 *cost
+= extra_cost
->alu
.shift
;
10637 /* We can incorporate zero/sign extend for free. */
10638 if (GET_CODE (op0
) == ZERO_EXTEND
10639 || GET_CODE (op0
) == SIGN_EXTEND
)
10640 op0
= XEXP (op0
, 0);
10642 *cost
+= rtx_cost (op0
, VOIDmode
, ASHIFT
, 0, speed
);
10647 if (VECTOR_MODE_P (mode
))
10650 /* Vector shift (register). */
10651 *cost
+= extra_cost
->vect
.alu
;
10657 *cost
+= extra_cost
->alu
.shift_reg
;
10659 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
10660 && CONST_INT_P (XEXP (op1
, 1))
10661 && known_eq (INTVAL (XEXP (op1
, 1)),
10662 GET_MODE_BITSIZE (mode
) - 1))
10664 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
10665 /* We already demanded XEXP (op1, 0) to be REG_P, so
10666 don't recurse into it. */
10670 return false; /* All arguments need to be in registers. */
10680 if (CONST_INT_P (op1
))
10682 /* ASR (immediate) and friends. */
10685 if (VECTOR_MODE_P (mode
))
10686 *cost
+= extra_cost
->vect
.alu
;
10688 *cost
+= extra_cost
->alu
.shift
;
10691 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
10696 if (VECTOR_MODE_P (mode
))
10699 /* Vector shift (register). */
10700 *cost
+= extra_cost
->vect
.alu
;
10705 /* ASR (register) and friends. */
10706 *cost
+= extra_cost
->alu
.shift_reg
;
10708 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
10709 && CONST_INT_P (XEXP (op1
, 1))
10710 && known_eq (INTVAL (XEXP (op1
, 1)),
10711 GET_MODE_BITSIZE (mode
) - 1))
10713 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
10714 /* We already demanded XEXP (op1, 0) to be REG_P, so
10715 don't recurse into it. */
10719 return false; /* All arguments need to be in registers. */
10724 if (aarch64_cmodel
== AARCH64_CMODEL_LARGE
10725 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
)
10729 *cost
+= extra_cost
->ldst
.load
;
10731 else if (aarch64_cmodel
== AARCH64_CMODEL_SMALL
10732 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_PIC
)
10734 /* ADRP, followed by ADD. */
10735 *cost
+= COSTS_N_INSNS (1);
10737 *cost
+= 2 * extra_cost
->alu
.arith
;
10739 else if (aarch64_cmodel
== AARCH64_CMODEL_TINY
10740 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
10744 *cost
+= extra_cost
->alu
.arith
;
10749 /* One extra load instruction, after accessing the GOT. */
10750 *cost
+= COSTS_N_INSNS (1);
10752 *cost
+= extra_cost
->ldst
.load
;
10758 /* ADRP/ADD (immediate). */
10760 *cost
+= extra_cost
->alu
.arith
;
10768 if (VECTOR_MODE_P (mode
))
10769 *cost
+= extra_cost
->vect
.alu
;
10771 *cost
+= extra_cost
->alu
.bfx
;
10774 /* We can trust that the immediates used will be correct (there
10775 are no by-register forms), so we need only cost op0. */
10776 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (enum rtx_code
) code
, 0, speed
);
10780 *cost
+= aarch64_rtx_mult_cost (x
, MULT
, 0, speed
);
10781 /* aarch64_rtx_mult_cost always handles recursion to its
10786 /* We can expand signed mod by power of 2 using a NEGS, two parallel
10787 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
10788 an unconditional negate. This case should only ever be reached through
10789 the set_smod_pow2_cheap check in expmed.c. */
10790 if (CONST_INT_P (XEXP (x
, 1))
10791 && exact_log2 (INTVAL (XEXP (x
, 1))) > 0
10792 && (mode
== SImode
|| mode
== DImode
))
10794 /* We expand to 4 instructions. Reset the baseline. */
10795 *cost
= COSTS_N_INSNS (4);
10798 *cost
+= 2 * extra_cost
->alu
.logical
10799 + 2 * extra_cost
->alu
.arith
;
10804 /* Fall-through. */
10808 /* Slighly prefer UMOD over SMOD. */
10809 if (VECTOR_MODE_P (mode
))
10810 *cost
+= extra_cost
->vect
.alu
;
10811 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10812 *cost
+= (extra_cost
->mult
[mode
== DImode
].add
10813 + extra_cost
->mult
[mode
== DImode
].idiv
10814 + (code
== MOD
? 1 : 0));
10816 return false; /* All arguments need to be in registers. */
10823 if (VECTOR_MODE_P (mode
))
10824 *cost
+= extra_cost
->vect
.alu
;
10825 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10826 /* There is no integer SQRT, so only DIV and UDIV can get
10828 *cost
+= (extra_cost
->mult
[mode
== DImode
].idiv
10829 /* Slighly prefer UDIV over SDIV. */
10830 + (code
== DIV
? 1 : 0));
10832 *cost
+= extra_cost
->fp
[mode
== DFmode
].div
;
10834 return false; /* All arguments need to be in registers. */
10837 return aarch64_if_then_else_costs (XEXP (x
, 0), XEXP (x
, 1),
10838 XEXP (x
, 2), cost
, speed
);
10851 return false; /* All arguments must be in registers. */
10860 if (VECTOR_MODE_P (mode
))
10861 *cost
+= extra_cost
->vect
.alu
;
10863 *cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
10866 /* FMSUB, FNMADD, and FNMSUB are free. */
10867 if (GET_CODE (op0
) == NEG
)
10868 op0
= XEXP (op0
, 0);
10870 if (GET_CODE (op2
) == NEG
)
10871 op2
= XEXP (op2
, 0);
10873 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
10874 and the by-element operand as operand 0. */
10875 if (GET_CODE (op1
) == NEG
)
10876 op1
= XEXP (op1
, 0);
10878 /* Catch vector-by-element operations. The by-element operand can
10879 either be (vec_duplicate (vec_select (x))) or just
10880 (vec_select (x)), depending on whether we are multiplying by
10881 a vector or a scalar.
10883 Canonicalization is not very good in these cases, FMA4 will put the
10884 by-element operand as operand 0, FNMA4 will have it as operand 1. */
10885 if (GET_CODE (op0
) == VEC_DUPLICATE
)
10886 op0
= XEXP (op0
, 0);
10887 else if (GET_CODE (op1
) == VEC_DUPLICATE
)
10888 op1
= XEXP (op1
, 0);
10890 if (GET_CODE (op0
) == VEC_SELECT
)
10891 op0
= XEXP (op0
, 0);
10892 else if (GET_CODE (op1
) == VEC_SELECT
)
10893 op1
= XEXP (op1
, 0);
10895 /* If the remaining parameters are not registers,
10896 get the cost to put them into registers. */
10897 *cost
+= rtx_cost (op0
, mode
, FMA
, 0, speed
);
10898 *cost
+= rtx_cost (op1
, mode
, FMA
, 1, speed
);
10899 *cost
+= rtx_cost (op2
, mode
, FMA
, 2, speed
);
10903 case UNSIGNED_FLOAT
:
10905 *cost
+= extra_cost
->fp
[mode
== DFmode
].fromint
;
10911 if (VECTOR_MODE_P (mode
))
10913 /*Vector truncate. */
10914 *cost
+= extra_cost
->vect
.alu
;
10917 *cost
+= extra_cost
->fp
[mode
== DFmode
].widen
;
10921 case FLOAT_TRUNCATE
:
10924 if (VECTOR_MODE_P (mode
))
10926 /*Vector conversion. */
10927 *cost
+= extra_cost
->vect
.alu
;
10930 *cost
+= extra_cost
->fp
[mode
== DFmode
].narrow
;
10937 /* Strip the rounding part. They will all be implemented
10938 by the fcvt* family of instructions anyway. */
10939 if (GET_CODE (x
) == UNSPEC
)
10941 unsigned int uns_code
= XINT (x
, 1);
10943 if (uns_code
== UNSPEC_FRINTA
10944 || uns_code
== UNSPEC_FRINTM
10945 || uns_code
== UNSPEC_FRINTN
10946 || uns_code
== UNSPEC_FRINTP
10947 || uns_code
== UNSPEC_FRINTZ
)
10948 x
= XVECEXP (x
, 0, 0);
10953 if (VECTOR_MODE_P (mode
))
10954 *cost
+= extra_cost
->vect
.alu
;
10956 *cost
+= extra_cost
->fp
[GET_MODE (x
) == DFmode
].toint
;
10959 /* We can combine fmul by a power of 2 followed by a fcvt into a single
10960 fixed-point fcvt. */
10961 if (GET_CODE (x
) == MULT
10962 && ((VECTOR_MODE_P (mode
)
10963 && aarch64_vec_fpconst_pow_of_2 (XEXP (x
, 1)) > 0)
10964 || aarch64_fpconst_pow_of_2 (XEXP (x
, 1)) > 0))
10966 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (rtx_code
) code
,
10971 *cost
+= rtx_cost (x
, VOIDmode
, (enum rtx_code
) code
, 0, speed
);
10975 if (VECTOR_MODE_P (mode
))
10977 /* ABS (vector). */
10979 *cost
+= extra_cost
->vect
.alu
;
10981 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
10985 /* FABD, which is analogous to FADD. */
10986 if (GET_CODE (op0
) == MINUS
)
10988 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, MINUS
, 0, speed
);
10989 *cost
+= rtx_cost (XEXP (op0
, 1), mode
, MINUS
, 1, speed
);
10991 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
10995 /* Simple FABS is analogous to FNEG. */
10997 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
11001 /* Integer ABS will either be split to
11002 two arithmetic instructions, or will be an ABS
11003 (scalar), which we don't model. */
11004 *cost
= COSTS_N_INSNS (2);
11006 *cost
+= 2 * extra_cost
->alu
.arith
;
11014 if (VECTOR_MODE_P (mode
))
11015 *cost
+= extra_cost
->vect
.alu
;
11018 /* FMAXNM/FMINNM/FMAX/FMIN.
11019 TODO: This may not be accurate for all implementations, but
11020 we do not model this in the cost tables. */
11021 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
11027 /* The floating point round to integer frint* instructions. */
11028 if (aarch64_frint_unspec_p (XINT (x
, 1)))
11031 *cost
+= extra_cost
->fp
[mode
== DFmode
].roundint
;
11036 if (XINT (x
, 1) == UNSPEC_RBIT
)
11039 *cost
+= extra_cost
->alu
.rev
;
11047 /* Decompose <su>muldi3_highpart. */
11048 if (/* (truncate:DI */
11051 && GET_MODE (XEXP (x
, 0)) == TImode
11052 && GET_CODE (XEXP (x
, 0)) == LSHIFTRT
11054 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == MULT
11055 /* (ANY_EXTEND:TI (reg:DI))
11056 (ANY_EXTEND:TI (reg:DI))) */
11057 && ((GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == ZERO_EXTEND
11058 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == ZERO_EXTEND
)
11059 || (GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == SIGN_EXTEND
11060 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == SIGN_EXTEND
))
11061 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0)) == DImode
11062 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0)) == DImode
11063 /* (const_int 64) */
11064 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
11065 && UINTVAL (XEXP (XEXP (x
, 0), 1)) == 64)
11069 *cost
+= extra_cost
->mult
[mode
== DImode
].extend
;
11070 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0),
11071 mode
, MULT
, 0, speed
);
11072 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0),
11073 mode
, MULT
, 1, speed
);
11077 /* Fall through. */
11083 && flag_aarch64_verbose_cost
)
11084 fprintf (dump_file
,
11085 "\nFailed to cost RTX. Assuming default cost.\n");
11090 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
11091 calculated for X. This cost is stored in *COST. Returns true
11092 if the total cost of X was calculated. */
11094 aarch64_rtx_costs_wrapper (rtx x
, machine_mode mode
, int outer
,
11095 int param
, int *cost
, bool speed
)
/* RESULT is true when aarch64_rtx_costs computed the whole cost of X,
   false when only a partial cost was accumulated into *COST.  */
11097 bool result
= aarch64_rtx_costs (x
, mode
, outer
, param
, cost
, speed
);
/* Dumping is gated on -moverride verbose_cost.  */
11100 && flag_aarch64_verbose_cost
)
11102 print_rtl_single (dump_file
, x
);
/* "Hot" means costed for speed, "Cold" for size; "final"/"partial"
   mirrors RESULT above.  */
11103 fprintf (dump_file
, "\n%s cost: %d (%s)\n",
11104 speed
? "Hot" : "Cold",
11105 *cost
, result
? "final" : "partial");
/* Implement TARGET_REGISTER_MOVE_COST.  Cost of moving a value of MODE
   between register classes FROM_I and TO_I, looked up in the tuned
   regmove_cost table (GP2GP/GP2FP/FP2GP/FP2FP).  */
11112 aarch64_register_move_cost (machine_mode mode
,
11113 reg_class_t from_i
, reg_class_t to_i
)
11115 enum reg_class from
= (enum reg_class
) from_i
;
11116 enum reg_class to
= (enum reg_class
) to_i
;
11117 const struct cpu_regmove_cost
*regmove_cost
11118 = aarch64_tune_params
.regmove_cost
;
11120 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
11121 if (to
== TAILCALL_ADDR_REGS
|| to
== POINTER_REGS
)
11124 if (from
== TAILCALL_ADDR_REGS
|| from
== POINTER_REGS
)
11125 from
= GENERAL_REGS
;
11127 /* Moving between GPR and stack cost is the same as GP2GP. */
11128 if ((from
== GENERAL_REGS
&& to
== STACK_REG
)
11129 || (to
== GENERAL_REGS
&& from
== STACK_REG
))
11130 return regmove_cost
->GP2GP
;
11132 /* To/From the stack register, we move via the gprs. */
/* Recursion terminates because GENERAL_REGS is handled by the
   GP2GP case above.  */
11133 if (to
== STACK_REG
|| from
== STACK_REG
)
11134 return aarch64_register_move_cost (mode
, from
, GENERAL_REGS
)
11135 + aarch64_register_move_cost (mode
, GENERAL_REGS
, to
);
/* 128-bit (e.g. TImode / Q-register sized) values need extra moves.  */
11137 if (known_eq (GET_MODE_SIZE (mode
), 16))
11139 /* 128-bit operations on general registers require 2 instructions. */
11140 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
11141 return regmove_cost
->GP2GP
* 2;
11142 else if (from
== GENERAL_REGS
)
11143 return regmove_cost
->GP2FP
* 2;
11144 else if (to
== GENERAL_REGS
)
11145 return regmove_cost
->FP2GP
* 2;
11147 /* When AdvSIMD instructions are disabled it is not possible to move
11148 a 128-bit value directly between Q registers. This is handled in
11149 secondary reload. A general register is used as a scratch to move
11150 the upper DI value and the lower DI value is moved directly,
11151 hence the cost is the sum of three moves. */
11153 return regmove_cost
->GP2FP
+ regmove_cost
->FP2GP
+ regmove_cost
->FP2FP
;
11155 return regmove_cost
->FP2FP
;
/* Sub-128-bit moves: straight table lookup.  */
11158 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
11159 return regmove_cost
->GP2GP
;
11160 else if (from
== GENERAL_REGS
)
11161 return regmove_cost
->GP2FP
;
11162 else if (to
== GENERAL_REGS
)
11163 return regmove_cost
->FP2GP
;
11165 return regmove_cost
->FP2FP
;
/* Implement TARGET_MEMORY_MOVE_COST.  A single tuned constant is used
   regardless of mode, register class, or load/store direction.  */
11169 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED
,
11170 reg_class_t rclass ATTRIBUTE_UNUSED
,
11171 bool in ATTRIBUTE_UNUSED
)
11173 return aarch64_tune_params
.memmov_cost
;
11176 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
11177 to optimize 1.0/sqrt. */
11180 use_rsqrt_p (machine_mode mode
)
/* Requires -fno-trapping-math and -funsafe-math-optimizations, plus
   either the tuning structure approving MODE for reciprocal-sqrt
   approximation or -mlow-precision-recip-sqrt on the command line.  */
11182 return (!flag_trapping_math
11183 && flag_unsafe_math_optimizations
11184 && ((aarch64_tune_params
.approx_modes
->recip_sqrt
11185 & AARCH64_APPROX_MODE (mode
))
11186 || flag_mrecip_low_precision_sqrt
));
11189 /* Function to decide when to use the approximate reciprocal square root
/* Returns the target-specific rsqrt builtin for FNDECL's mode, or a
   null tree when use_rsqrt_p rejects the mode.  */
11193 aarch64_builtin_reciprocal (tree fndecl
)
11195 machine_mode mode
= TYPE_MODE (TREE_TYPE (fndecl
));
11197 if (!use_rsqrt_p (mode
))
11199 return aarch64_builtin_rsqrt (DECL_MD_FUNCTION_CODE (fndecl
));
11202 /* Emit instruction sequence to compute either the approximate square root
11203 or its approximate reciprocal, depending on the flag RECP, and return
11204 whether the sequence was emitted or not. */
/* Newton-Raphson refinement built around the FRSQRTE estimate and
   FRSQRTS step instructions; DST and SRC share MODE.  */
11207 aarch64_emit_approx_sqrt (rtx dst
, rtx src
, bool recp
)
11209 machine_mode mode
= GET_MODE (dst
);
11211 if (GET_MODE_INNER (mode
) == HFmode
)
11213 gcc_assert (!recp
);
/* For plain sqrt, bail out unless either the low-precision flag or the
   per-CPU tuning structure enables the approximation for MODE.  */
11219 if (!(flag_mlow_precision_sqrt
11220 || (aarch64_tune_params
.approx_modes
->sqrt
11221 & AARCH64_APPROX_MODE (mode
))))
11224 if (flag_finite_math_only
11225 || flag_trapping_math
11226 || !flag_unsafe_math_optimizations
11227 || optimize_function_for_size_p (cfun
))
11231 /* Caller assumes we cannot fail. */
11232 gcc_assert (use_rsqrt_p (mode
));
/* MMSK is the same-sized integer (vector) mode used for masking.  */
11234 machine_mode mmsk
= mode_for_int_vector (mode
).require ();
11235 rtx xmsk
= gen_reg_rtx (mmsk
);
11237 /* When calculating the approximate square root, compare the
11238 argument with 0.0 and create a mask. */
11239 emit_insn (gen_rtx_SET (xmsk
,
11241 gen_rtx_EQ (mmsk
, src
,
11242 CONST0_RTX (mode
)))));
11244 /* Estimate the approximate reciprocal square root. */
11245 rtx xdst
= gen_reg_rtx (mode
);
11246 emit_insn (gen_aarch64_rsqrte (mode
, xdst
, src
));
11248 /* Iterate over the series twice for SF and thrice for DF. */
11249 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
11251 /* Optionally iterate over the series once less for faster performance
11252 while sacrificing the accuracy. */
11253 if ((recp
&& flag_mrecip_low_precision_sqrt
)
11254 || (!recp
&& flag_mlow_precision_sqrt
))
11257 /* Iterate over the series to calculate the approximate reciprocal square
11259 rtx x1
= gen_reg_rtx (mode
);
11260 while (iterations
--)
11262 rtx x2
= gen_reg_rtx (mode
);
/* One N-R step: x1 = rsqrts (src, xdst * xdst); refine xdst by x1.  */
11263 emit_set_insn (x2
, gen_rtx_MULT (mode
, xdst
, xdst
));
11265 emit_insn (gen_aarch64_rsqrts (mode
, x1
, src
, x2
));
11267 if (iterations
> 0)
11268 emit_set_insn (xdst
, gen_rtx_MULT (mode
, xdst
, x1
));
11273 /* Qualify the approximate reciprocal square root when the argument is
11274 0.0 by squashing the intermediary result to 0.0. */
11275 rtx xtmp
= gen_reg_rtx (mmsk
);
11276 emit_set_insn (xtmp
, gen_rtx_AND (mmsk
, gen_rtx_NOT (mmsk
, xmsk
),
11277 gen_rtx_SUBREG (mmsk
, xdst
, 0)));
11278 emit_move_insn (xdst
, gen_rtx_SUBREG (mode
, xtmp
, 0));
11280 /* Calculate the approximate square root. */
11281 emit_set_insn (xdst
, gen_rtx_MULT (mode
, xdst
, src
));
11284 /* Finalize the approximation. */
11285 emit_set_insn (dst
, gen_rtx_MULT (mode
, xdst
, x1
));
11290 /* Emit the instruction sequence to compute the approximation for the division
11291 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
/* Newton-Raphson refinement built around the FRECPE estimate and
   FRECPS step instructions.  */
11294 aarch64_emit_approx_div (rtx quo
, rtx num
, rtx den
)
11296 machine_mode mode
= GET_MODE (quo
);
11298 if (GET_MODE_INNER (mode
) == HFmode
)
/* Enabled by -mlow-precision-div or by the per-CPU tuning structure
   approving MODE for division approximation.  */
11301 bool use_approx_division_p
= (flag_mlow_precision_div
11302 || (aarch64_tune_params
.approx_modes
->division
11303 & AARCH64_APPROX_MODE (mode
)));
11305 if (!flag_finite_math_only
11306 || flag_trapping_math
11307 || !flag_unsafe_math_optimizations
11308 || optimize_function_for_size_p (cfun
)
11309 || !use_approx_division_p
)
11312 if (!TARGET_SIMD
&& VECTOR_MODE_P (mode
))
11315 /* Estimate the approximate reciprocal. */
11316 rtx xrcp
= gen_reg_rtx (mode
);
11317 emit_insn (gen_aarch64_frecpe (mode
, xrcp
, den
));
11319 /* Iterate over the series twice for SF and thrice for DF. */
11320 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
11322 /* Optionally iterate over the series once less for faster performance,
11323 while sacrificing the accuracy. */
11324 if (flag_mlow_precision_div
)
11327 /* Iterate over the series to calculate the approximate reciprocal. */
11328 rtx xtmp
= gen_reg_rtx (mode
);
11329 while (iterations
--)
11331 emit_insn (gen_aarch64_frecps (mode
, xtmp
, xrcp
, den
));
11333 if (iterations
> 0)
11334 emit_set_insn (xrcp
, gen_rtx_MULT (mode
, xrcp
, xtmp
));
11337 if (num
!= CONST1_RTX (mode
))
11339 /* As the approximate reciprocal of DEN is already calculated, only
11340 calculate the approximate division when NUM is not 1.0. */
11341 rtx xnum
= force_reg (mode
, num
);
11342 emit_set_insn (xrcp
, gen_rtx_MULT (mode
, xrcp
, xnum
));
11345 /* Finalize the approximation. */
11346 emit_set_insn (quo
, gen_rtx_MULT (mode
, xrcp
, xtmp
));
11350 /* Return the number of instructions that can be issued per cycle. */
/* Implement TARGET_SCHED_ISSUE_RATE from the active tuning struct.  */
11352 aarch64_sched_issue_rate (void)
11354 return aarch64_tune_params
.issue_rate
;
/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD.  Use the
   tuned issue rate as the lookahead depth; disable lookahead (return 0)
   for single-issue cores or when scheduling for instruction fusion.  */
11358 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
11360 int issue_rate
= aarch64_sched_issue_rate ();
11362 return issue_rate
> 1 && !sched_fusion
? issue_rate
: 0;
11366 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
11367 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
11368 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
11371 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn
*insn
,
/* Straight delegation to the generic autoprefetch heuristic.  */
11374 return autopref_multipass_dfa_lookahead_guard (insn
, ready_index
);
11378 /* Vectorizer cost model target hooks. */
11380 /* Implement targetm.vectorize.builtin_vectorization_cost. */
11382 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost
,
11384 int misalign ATTRIBUTE_UNUSED
)
11387 const cpu_vector_cost
*costs
= aarch64_tune_params
.vec_costs
;
/* FP selects the floating-point variant of each stmt cost below.  */
11390 if (vectype
!= NULL
)
11391 fp
= FLOAT_TYPE_P (vectype
);
11393 switch (type_of_cost
)
11396 return fp
? costs
->scalar_fp_stmt_cost
: costs
->scalar_int_stmt_cost
;
11399 return costs
->scalar_load_cost
;
11402 return costs
->scalar_store_cost
;
11405 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
11408 return costs
->vec_align_load_cost
;
11411 return costs
->vec_store_cost
;
11413 case vec_to_scalar
:
11414 return costs
->vec_to_scalar_cost
;
11416 case scalar_to_vec
:
11417 return costs
->scalar_to_vec_cost
;
11419 case unaligned_load
:
11420 case vector_gather_load
:
11421 return costs
->vec_unalign_load_cost
;
11423 case unaligned_store
:
11424 case vector_scatter_store
:
11425 return costs
->vec_unalign_store_cost
;
11427 case cond_branch_taken
:
11428 return costs
->cond_taken_branch_cost
;
11430 case cond_branch_not_taken
:
11431 return costs
->cond_not_taken_branch_cost
;
11434 return costs
->vec_permute_cost
;
11436 case vec_promote_demote
:
11437 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
11439 case vec_construct
:
/* Building a vector from scalars: roughly one insn per pair of
   elements plus one; element count of a variable-length (SVE)
   vector is estimated via estimated_poly_value.  */
11440 elements
= estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype
));
11441 return elements
/ 2 + 1;
11444 gcc_unreachable ();
11448 /* Implement targetm.vectorize.add_stmt_cost. */
/* DATA is the accumulator array indexed by vect_cost_model_location;
   the weighted cost of COUNT copies of KIND is added to COST[WHERE]
   and also returned.  */
11450 aarch64_add_stmt_cost (void *data
, int count
, enum vect_cost_for_stmt kind
,
11451 struct _stmt_vec_info
*stmt_info
, int misalign
,
11452 enum vect_cost_model_location where
)
11454 unsigned *cost
= (unsigned *) data
;
11455 unsigned retval
= 0;
11457 if (flag_vect_cost_model
)
11459 tree vectype
= stmt_info
? stmt_vectype (stmt_info
) : NULL_TREE
;
11461 aarch64_builtin_vectorization_cost (kind
, vectype
, misalign
);
11463 /* Statements in an inner loop relative to the loop being
11464 vectorized are weighted more heavily. The value here is
11465 arbitrary and could potentially be improved with analysis. */
11466 if (where
== vect_body
&& stmt_info
&& stmt_in_inner_loop_p (stmt_info
))
11467 count
*= 50; /* FIXME */
11469 retval
= (unsigned) (count
* stmt_cost
);
11470 cost
[where
] += retval
;
/* Forward declaration; defined later in this file.  */
11476 static void initialize_aarch64_code_model (struct gcc_options
*);
11478 /* Parse the TO_PARSE string and put the architecture struct that it
11479 selects into RES and the architectural features into ISA_FLAGS.
11480 Return an aarch64_parse_opt_result describing the parse result.
11481 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11482 When the TO_PARSE string contains an invalid extension,
11483 a copy of the string is created and stored to INVALID_EXTENSION. */
11485 static enum aarch64_parse_opt_result
11486 aarch64_parse_arch (const char *to_parse
, const struct processor
**res
,
11487 uint64_t *isa_flags
, std::string
*invalid_extension
)
11490 const struct processor
*arch
;
/* EXT points at the first '+'-separated extension list (or is null);
   LEN is the length of the bare architecture name before it.  */
11493 ext
= strchr (to_parse
, '+');
11496 len
= ext
- to_parse
;
11498 len
= strlen (to_parse
);
11501 return AARCH64_PARSE_MISSING_ARG
;
11504 /* Loop through the list of supported ARCHes to find a match. */
11505 for (arch
= all_architectures
; arch
->name
!= NULL
; arch
++)
11507 if (strlen (arch
->name
) == len
11508 && strncmp (arch
->name
, to_parse
, len
) == 0)
/* Start from the architecture's base ISA flags, then layer on any
   "+ext" modifiers before committing results to the caller.  */
11510 uint64_t isa_temp
= arch
->flags
;
11514 /* TO_PARSE string contains at least one extension. */
11515 enum aarch64_parse_opt_result ext_res
11516 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
11518 if (ext_res
!= AARCH64_PARSE_OK
)
11521 /* Extension parsing was successful. Confirm the result
11522 arch and ISA flags. */
11524 *isa_flags
= isa_temp
;
11525 return AARCH64_PARSE_OK
;
11529 /* ARCH name not found in list. */
11530 return AARCH64_PARSE_INVALID_ARG
;
11533 /* Parse the TO_PARSE string and put the result tuning in RES and the
11534 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
11535 describing the parse result. If there is an error parsing, RES and
11536 ISA_FLAGS are left unchanged.
11537 When the TO_PARSE string contains an invalid extension,
11538 a copy of the string is created and stored to INVALID_EXTENSION. */
11540 static enum aarch64_parse_opt_result
11541 aarch64_parse_cpu (const char *to_parse
, const struct processor
**res
,
11542 uint64_t *isa_flags
, std::string
*invalid_extension
)
11545 const struct processor
*cpu
;
/* EXT points at the first '+'-separated extension list (or is null);
   LEN is the length of the bare CPU name before it.  */
11548 ext
= strchr (to_parse
, '+');
11551 len
= ext
- to_parse
;
11553 len
= strlen (to_parse
);
11556 return AARCH64_PARSE_MISSING_ARG
;
11559 /* Loop through the list of supported CPUs to find a match. */
11560 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
11562 if (strlen (cpu
->name
) == len
&& strncmp (cpu
->name
, to_parse
, len
) == 0)
/* Start from the core's base ISA flags, then layer on "+ext"
   modifiers before committing results to the caller.  */
11564 uint64_t isa_temp
= cpu
->flags
;
11569 /* TO_PARSE string contains at least one extension. */
11570 enum aarch64_parse_opt_result ext_res
11571 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
11573 if (ext_res
!= AARCH64_PARSE_OK
)
11576 /* Extension parsing was successful. Confirm the result
11577 cpu and ISA flags. */
11579 *isa_flags
= isa_temp
;
11580 return AARCH64_PARSE_OK
;
11584 /* CPU name not found in list. */
11585 return AARCH64_PARSE_INVALID_ARG
;
11588 /* Parse the TO_PARSE string and put the cpu it selects into RES.
11589 Return an aarch64_parse_opt_result describing the parse result.
11590 If the parsing fails the RES does not change. */
11592 static enum aarch64_parse_opt_result
11593 aarch64_parse_tune (const char *to_parse
, const struct processor
**res
)
11595 const struct processor
*cpu
;
11597 /* Loop through the list of supported CPUs to find a match. */
/* Exact full-string match (strcmp): unlike aarch64_parse_cpu, no
   "+ext" extension suffixes are recognised here.  */
11598 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
11600 if (strcmp (cpu
->name
, to_parse
) == 0)
11603 return AARCH64_PARSE_OK
;
11607 /* CPU name not found in list. */
11608 return AARCH64_PARSE_INVALID_ARG
;
11611 /* Parse TOKEN, which has length LENGTH to see if it is an option
11612 described in FLAG. If it is, return the index bit for that fusion type.
11613 If not, error (printing OPTION_NAME) and return zero. */
11615 static unsigned int
11616 aarch64_parse_one_option_token (const char *token
,
11618 const struct aarch64_flag_desc
*flag
,
11619 const char *option_name
)
/* Linear scan of the NULL-terminated FLAG descriptor table; TOKEN is
   not NUL-terminated, hence the explicit length comparison.  */
11621 for (; flag
->name
!= NULL
; flag
++)
11623 if (length
== strlen (flag
->name
)
11624 && !strncmp (flag
->name
, token
, length
))
11628 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name
, token
);
11632 /* Parse OPTION which is a comma-separated list of flags to enable.
11633 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
11634 default state we inherit from the CPU tuning structures. OPTION_NAME
11635 gives the top-level option we are parsing in the -moverride string,
11636 for use in error messages. */
11638 static unsigned int
11639 aarch64_parse_boolean_options (const char *option
,
11640 const struct aarch64_flag_desc
*flags
,
11641 unsigned int initial_state
,
11642 const char *option_name
)
/* Tokens are actually separated by '.' (see SEPARATOR below), e.g.
   "adrp+add.cmp+branch".  */
11644 const char separator
= '.';
11645 const char* specs
= option
;
11646 const char* ntoken
= option
;
11647 unsigned int found_flags
= initial_state
;
/* Process every separator-delimited token except the last.  */
11649 while ((ntoken
= strchr (specs
, separator
)))
11651 size_t token_length
= ntoken
- specs
;
11652 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
11656 /* If we find "none" (or, for simplicity's sake, an error) anywhere
11657 in the token stream, reset the supported operations. So:
11659 adrp+add.cmp+branch.none.adrp+add
11661 would have the result of turning on only adrp+add fusion. */
11665 found_flags
|= token_ops
;
11669 /* We ended with a trailing separator; the string is ill-formed.  */
11672 error ("%s string ill-formed\n", option_name
)
;
11676 /* We still have one more token to parse. */
11677 size_t token_length
= strlen (specs
);
11678 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
11685 found_flags
|= token_ops
;
11686 return found_flags
;
11689 /* Support for overriding instruction fusion. */
/* Handler for the "fuse=" -moverride token: rewrite TUNE->fusible_ops
   via the generic boolean-option parser over the fusible-pair table.  */
11692 aarch64_parse_fuse_string (const char *fuse_string
,
11693 struct tune_params
*tune
)
11695 tune
->fusible_ops
= aarch64_parse_boolean_options (fuse_string
,
11696 aarch64_fusible_pairs
,
11701 /* Support for overriding other tuning flags. */
/* Handler for the "tune=" -moverride token: rewrite
   TUNE->extra_tuning_flags via the generic boolean-option parser over
   the tuning-flag table, seeded with the current flags.  */
11704 aarch64_parse_tune_string (const char *tune_string
,
11705 struct tune_params
*tune
)
11707 tune
->extra_tuning_flags
11708 = aarch64_parse_boolean_options (tune_string
,
11709 aarch64_tuning_flags
,
11710 tune
->extra_tuning_flags
,
11714 /* Parse the sve_width tuning moverride string in TUNE_STRING.
11715 Accept the valid SVE vector widths allowed by
11716 aarch64_sve_vector_bits_enum and use it to override sve_width
11720 aarch64_parse_sve_width_string (const char *tune_string
,
11721 struct tune_params
*tune
)
/* Read WIDTH as a decimal integer; reject strings that do not scan,
   then reject values outside the allowed SVE width set (validation
   between these two diagnostics is not shown in this fragment).  */
11725 int n
= sscanf (tune_string
, "%d", &width
);
11728 error ("invalid format for sve_width");
11740 error ("invalid sve_width value: %d", width
);
11742 tune
->sve_width
= (enum aarch64_sve_vector_bits_enum
) width
;
11745 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
11746 we understand. If it is, extract the option string and handoff to
11747 the appropriate function. */
11750 aarch64_parse_one_override_token (const char* token
,
11752 struct tune_params
*tune
)
/* FN walks the NULL-terminated table of "name=" handlers.  */
11754 const struct aarch64_tuning_override_function
*fn
11755 = aarch64_tuning_override_functions
;
/* Each token must be of the form name=value; OPTION_PART locates
   the '='.  */
11757 const char *option_part
= strchr (token
, '=');
11760 error ("tuning string missing in option (%s)", token
);
11764 /* Get the length of the option name. */
11765 length
= option_part
- token
;
11766 /* Skip the '=' to get to the option string. */
11769 for (; fn
->name
!= NULL
; fn
++)
11771 if (!strncmp (fn
->name
, token
, length
))
/* Matched: hand the value part to the registered handler.  */
11773 fn
->parse_override (option_part
, tune
);
11778 error ("unknown tuning option (%s)",token
);
11782 /* A checking mechanism for the implementation of the tls size. */
/* Clamp aarch64_tls_size to the maximum TLS offset addressable under
   the selected code model; 0 means "unset" and defaults to 24 bits.  */
11785 initialize_aarch64_tls_size (struct gcc_options
*opts
)
11787 if (aarch64_tls_size
== 0)
11788 aarch64_tls_size
= 24;
11790 switch (opts
->x_aarch64_cmodel_var
)
11792 case AARCH64_CMODEL_TINY
:
11793 /* Both the default and maximum TLS size allowed under tiny is 1M which
11794 needs two instructions to address, so we clamp the size to 24. */
11795 if (aarch64_tls_size
> 24)
11796 aarch64_tls_size
= 24;
11798 case AARCH64_CMODEL_SMALL
:
11799 /* The maximum TLS size allowed under small is 4G. */
11800 if (aarch64_tls_size
> 32)
11801 aarch64_tls_size
= 32;
11803 case AARCH64_CMODEL_LARGE
:
11804 /* The maximum TLS size allowed under large is 16E.
11805 FIXME: 16E should be 64bit, we only support 48bit offset now. */
11806 if (aarch64_tls_size
> 48)
11807 aarch64_tls_size
= 48;
11810 gcc_unreachable ();
11816 /* Parse STRING looking for options in the format:
11817 string :: option:string
11818 option :: name=substring
11820 substring :: defined by option. */
11823 aarch64_parse_override_string (const char* input_string
,
11824 struct tune_params
* tune
)
11826 const char separator
= ':';
/* Work on a heap-allocated writable copy of INPUT_STRING —
   presumably so tokens can be NUL-terminated in place (see the
   "Make this substring look like a string" step below).  */
11827 size_t string_length
= strlen (input_string
) + 1;
11828 char *string_root
= (char *) xmalloc (sizeof (*string_root
) * string_length
);
11829 char *string
= string_root
;
11830 strncpy (string
, input_string
, string_length
);
/* Guarantee NUL termination even if strncpy did not copy one.  */
11831 string
[string_length
- 1] = '\0';
11833 char* ntoken
= string
;
/* Process every ':'-separated option except the last.  */
11835 while ((ntoken
= strchr (string
, separator
)))
11837 size_t token_length
= ntoken
- string
;
11838 /* Make this substring look like a string. */
11840 aarch64_parse_one_override_token (string
, token_length
, tune
);
11844 /* One last option to parse. */
11845 aarch64_parse_one_override_token (string
, strlen (string
), tune
);
11846 free (string_root
);
/* Re-apply option-dependent global state for OPTS.  Per PR 70044 below,
   this can run more than once per function, so every change made here
   must be repeatable.  */
11851 aarch64_override_options_after_change_1 (struct gcc_options
*opts
)
/* Re-install the previously validated -mbranch-protection string.  */
11853 if (accepted_branch_protection_string
)
11855 opts
->x_aarch64_branch_protection_string
11856 = xstrdup (accepted_branch_protection_string
);
11859 /* PR 70044: We have to be careful about being called multiple times for the
11860 same function. This means all changes should be repeatable. */
11862 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
11863 Disable the frame pointer flag so the mid-end will not use a frame
11864 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
11865 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
11866 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
11867 aarch64_use_frame_pointer
= opts
->x_flag_omit_frame_pointer
!= 1;
11868 if (opts
->x_flag_omit_frame_pointer
== 0)
11869 opts
->x_flag_omit_frame_pointer
= 2;
11871 /* If not optimizing for size, set the default
11872 alignment to what the target wants. */
/* Only fill in alignments the user has not set explicitly (the
   x_str_align_* strings are null when unset).  */
11873 if (!opts
->x_optimize_size
)
11875 if (opts
->x_flag_align_loops
&& !opts
->x_str_align_loops
)
11876 opts
->x_str_align_loops
= aarch64_tune_params
.loop_align
;
11877 if (opts
->x_flag_align_jumps
&& !opts
->x_str_align_jumps
)
11878 opts
->x_str_align_jumps
= aarch64_tune_params
.jump_align
;
11879 if (opts
->x_flag_align_functions
&& !opts
->x_str_align_functions
)
11880 opts
->x_str_align_functions
= aarch64_tune_params
.function_align
;
11883 /* We default to no pc-relative literal loads. */
11885 aarch64_pcrelative_literal_loads
= false;
11887 /* If -mpc-relative-literal-loads is set on the command line, this
11888 implies that the user asked for PC relative literal loads. */
11889 if (opts
->x_pcrelative_literal_loads
== 1)
11890 aarch64_pcrelative_literal_loads
= true;
11892 /* In the tiny memory model it makes no sense to disallow PC relative
11893 literal pool loads. */
11894 if (aarch64_cmodel
== AARCH64_CMODEL_TINY
11895 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
11896 aarch64_pcrelative_literal_loads
= true;
11898 /* When enabling the lower precision Newton series for the square root, also
11899 enable it for the reciprocal square root, since the latter is an
11900 intermediary step for the former. */
11901 if (flag_mlow_precision_sqrt
)
11902 flag_mrecip_low_precision_sqrt
= true;
11905 /* 'Unpack' up the internal tuning structs and update the options
11906 in OPTS. The caller must have set up selected_tune and selected_arch
11907 as all the other target-specific codegen decisions are
11908 derived from them. */
11911 aarch64_override_options_internal (struct gcc_options
*opts
)
11913 aarch64_tune_flags
= selected_tune
->flags
;
11914 aarch64_tune
= selected_tune
->sched_core
;
11915 /* Make a copy of the tuning parameters attached to the core, which
11916 we may later overwrite. */
11917 aarch64_tune_params
= *(selected_tune
->tune
);
11918 aarch64_architecture_version
= selected_arch
->architecture_version
;
11920 if (opts
->x_aarch64_override_tune_string
)
11921 aarch64_parse_override_string (opts
->x_aarch64_override_tune_string
,
11922 &aarch64_tune_params
);
11924 /* This target defaults to strict volatile bitfields. */
11925 if (opts
->x_flag_strict_volatile_bitfields
< 0 && abi_version_at_least (2))
11926 opts
->x_flag_strict_volatile_bitfields
= 1;
11928 if (aarch64_stack_protector_guard
== SSP_GLOBAL
11929 && opts
->x_aarch64_stack_protector_guard_offset_str
)
11931 error ("incompatible options %<-mstack-protector-guard=global%> and "
11932 "%<-mstack-protector-guard-offset=%s%>",
11933 aarch64_stack_protector_guard_offset_str
);
11936 if (aarch64_stack_protector_guard
== SSP_SYSREG
11937 && !(opts
->x_aarch64_stack_protector_guard_offset_str
11938 && opts
->x_aarch64_stack_protector_guard_reg_str
))
11940 error ("both %<-mstack-protector-guard-offset%> and "
11941 "%<-mstack-protector-guard-reg%> must be used "
11942 "with %<-mstack-protector-guard=sysreg%>");
11945 if (opts
->x_aarch64_stack_protector_guard_reg_str
)
11947 if (strlen (opts
->x_aarch64_stack_protector_guard_reg_str
) > 100)
11948 error ("specify a system register with a small string length.");
11951 if (opts
->x_aarch64_stack_protector_guard_offset_str
)
11954 const char *str
= aarch64_stack_protector_guard_offset_str
;
11956 long offs
= strtol (aarch64_stack_protector_guard_offset_str
, &end
, 0);
11957 if (!*str
|| *end
|| errno
)
11958 error ("%qs is not a valid offset in %qs", str
,
11959 "-mstack-protector-guard-offset=");
11960 aarch64_stack_protector_guard_offset
= offs
;
11963 initialize_aarch64_code_model (opts
);
11964 initialize_aarch64_tls_size (opts
);
11966 int queue_depth
= 0;
11967 switch (aarch64_tune_params
.autoprefetcher_model
)
11969 case tune_params::AUTOPREFETCHER_OFF
:
11972 case tune_params::AUTOPREFETCHER_WEAK
:
11975 case tune_params::AUTOPREFETCHER_STRONG
:
11976 queue_depth
= max_insn_queue_index
+ 1;
11979 gcc_unreachable ();
11982 /* We don't mind passing in global_options_set here as we don't use
11983 the *options_set structs anyway. */
11984 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH
,
11986 opts
->x_param_values
,
11987 global_options_set
.x_param_values
);
11989 /* Set up parameters to be used in prefetching algorithm. Do not
11990 override the defaults unless we are tuning for a core we have
11991 researched values for. */
11992 if (aarch64_tune_params
.prefetch
->num_slots
> 0)
11993 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES
,
11994 aarch64_tune_params
.prefetch
->num_slots
,
11995 opts
->x_param_values
,
11996 global_options_set
.x_param_values
);
11997 if (aarch64_tune_params
.prefetch
->l1_cache_size
>= 0)
11998 maybe_set_param_value (PARAM_L1_CACHE_SIZE
,
11999 aarch64_tune_params
.prefetch
->l1_cache_size
,
12000 opts
->x_param_values
,
12001 global_options_set
.x_param_values
);
12002 if (aarch64_tune_params
.prefetch
->l1_cache_line_size
>= 0)
12003 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE
,
12004 aarch64_tune_params
.prefetch
->l1_cache_line_size
,
12005 opts
->x_param_values
,
12006 global_options_set
.x_param_values
);
12007 if (aarch64_tune_params
.prefetch
->l2_cache_size
>= 0)
12008 maybe_set_param_value (PARAM_L2_CACHE_SIZE
,
12009 aarch64_tune_params
.prefetch
->l2_cache_size
,
12010 opts
->x_param_values
,
12011 global_options_set
.x_param_values
);
12012 if (!aarch64_tune_params
.prefetch
->prefetch_dynamic_strides
)
12013 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES
,
12015 opts
->x_param_values
,
12016 global_options_set
.x_param_values
);
12017 if (aarch64_tune_params
.prefetch
->minimum_stride
>= 0)
12018 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE
,
12019 aarch64_tune_params
.prefetch
->minimum_stride
,
12020 opts
->x_param_values
,
12021 global_options_set
.x_param_values
);
12023 /* Use the alternative scheduling-pressure algorithm by default. */
12024 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM
, SCHED_PRESSURE_MODEL
,
12025 opts
->x_param_values
,
12026 global_options_set
.x_param_values
);
12028 /* If the user hasn't changed it via configure then set the default to 64 KB
12029 for the backend. */
12030 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
,
12031 DEFAULT_STK_CLASH_GUARD_SIZE
== 0
12032 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE
,
12033 opts
->x_param_values
,
12034 global_options_set
.x_param_values
);
12036 /* Validate the guard size. */
12037 int guard_size
= PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
);
12039 /* Enforce that interval is the same size as size so the mid-end does the
12041 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL
,
12043 opts
->x_param_values
,
12044 global_options_set
.x_param_values
);
12046 /* The maybe_set calls won't update the value if the user has explicitly set
12047 one. Which means we need to validate that probing interval and guard size
12050 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL
);
12051 if (guard_size
!= probe_interval
)
12052 error ("stack clash guard size %<%d%> must be equal to probing interval "
12053 "%<%d%>", guard_size
, probe_interval
);
12055 /* Enable sw prefetching at specified optimization level for
12056 CPUS that have prefetch. Lower optimization level threshold by 1
12057 when profiling is enabled. */
12058 if (opts
->x_flag_prefetch_loop_arrays
< 0
12059 && !opts
->x_optimize_size
12060 && aarch64_tune_params
.prefetch
->default_opt_level
>= 0
12061 && opts
->x_optimize
>= aarch64_tune_params
.prefetch
->default_opt_level
)
12062 opts
->x_flag_prefetch_loop_arrays
= 1;
12064 if (opts
->x_aarch64_arch_string
== NULL
)
12065 opts
->x_aarch64_arch_string
= selected_arch
->name
;
12066 if (opts
->x_aarch64_cpu_string
== NULL
)
12067 opts
->x_aarch64_cpu_string
= selected_cpu
->name
;
12068 if (opts
->x_aarch64_tune_string
== NULL
)
12069 opts
->x_aarch64_tune_string
= selected_tune
->name
;
12071 aarch64_override_options_after_change_1 (opts
);
12074 /* Print a hint with a suggestion for a core or architecture name that
12075 most closely resembles what the user passed in STR. ARCH is true if
12076 the user is asking for an architecture name. ARCH is false if the user
12077 is asking for a core name. */
12080 aarch64_print_hint_for_core_or_arch (const char *str
, bool arch
)
12082 auto_vec
<const char *> candidates
;
12083 const struct processor
*entry
= arch
? all_architectures
: all_cores
;
12084 for (; entry
->name
!= NULL
; entry
++)
12085 candidates
.safe_push (entry
->name
);
12087 #ifdef HAVE_LOCAL_CPU_DETECT
12088 /* Add also "native" as possible value. */
12090 candidates
.safe_push ("native");
12094 const char *hint
= candidates_list_and_hint (str
, s
, candidates
);
12096 inform (input_location
, "valid arguments are: %s;"
12097 " did you mean %qs?", s
, hint
);
12099 inform (input_location
, "valid arguments are: %s", s
);
12104 /* Print a hint with a suggestion for a core name that most closely resembles
12105 what the user passed in STR. */
12108 aarch64_print_hint_for_core (const char *str
)
12110 aarch64_print_hint_for_core_or_arch (str
, false);
12113 /* Print a hint with a suggestion for an architecture name that most closely
12114 resembles what the user passed in STR. */
12117 aarch64_print_hint_for_arch (const char *str
)
12119 aarch64_print_hint_for_core_or_arch (str
, true);
12123 /* Print a hint with a suggestion for an extension name
12124 that most closely resembles what the user passed in STR. */
12127 aarch64_print_hint_for_extensions (const std::string
&str
)
12129 auto_vec
<const char *> candidates
;
12130 aarch64_get_all_extension_candidates (&candidates
);
12132 const char *hint
= candidates_list_and_hint (str
.c_str (), s
, candidates
);
12134 inform (input_location
, "valid arguments are: %s;"
12135 " did you mean %qs?", s
, hint
);
12137 inform (input_location
, "valid arguments are: %s;", s
);
12142 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
12143 specified in STR and throw errors if appropriate. Put the results if
12144 they are valid in RES and ISA_FLAGS. Return whether the option is
12148 aarch64_validate_mcpu (const char *str
, const struct processor
**res
,
12149 uint64_t *isa_flags
)
12151 std::string invalid_extension
;
12152 enum aarch64_parse_opt_result parse_res
12153 = aarch64_parse_cpu (str
, res
, isa_flags
, &invalid_extension
);
12155 if (parse_res
== AARCH64_PARSE_OK
)
12160 case AARCH64_PARSE_MISSING_ARG
:
12161 error ("missing cpu name in %<-mcpu=%s%>", str
);
12163 case AARCH64_PARSE_INVALID_ARG
:
12164 error ("unknown value %qs for %<-mcpu%>", str
);
12165 aarch64_print_hint_for_core (str
);
12167 case AARCH64_PARSE_INVALID_FEATURE
:
12168 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
12169 invalid_extension
.c_str (), str
);
12170 aarch64_print_hint_for_extensions (invalid_extension
);
12173 gcc_unreachable ();
12179 /* Parses CONST_STR for branch protection features specified in
12180 aarch64_branch_protect_types, and set any global variables required. Returns
12181 the parsing result and assigns LAST_STR to the last processed token from
12182 CONST_STR so that it can be used for error reporting. */
12185 aarch64_parse_opt_result
aarch64_parse_branch_protection (const char *const_str
,
12188 char *str_root
= xstrdup (const_str
);
12189 char* token_save
= NULL
;
12190 char *str
= strtok_r (str_root
, "+", &token_save
);
12191 enum aarch64_parse_opt_result res
= AARCH64_PARSE_OK
;
12193 res
= AARCH64_PARSE_MISSING_ARG
;
12196 char *next_str
= strtok_r (NULL
, "+", &token_save
);
12197 /* Reset the branch protection features to their defaults. */
12198 aarch64_handle_no_branch_protection (NULL
, NULL
);
12200 while (str
&& res
== AARCH64_PARSE_OK
)
12202 const aarch64_branch_protect_type
* type
= aarch64_branch_protect_types
;
12203 bool found
= false;
12204 /* Search for this type. */
12205 while (type
&& type
->name
&& !found
&& res
== AARCH64_PARSE_OK
)
12207 if (strcmp (str
, type
->name
) == 0)
12210 res
= type
->handler (str
, next_str
);
12212 next_str
= strtok_r (NULL
, "+", &token_save
);
12217 if (found
&& res
== AARCH64_PARSE_OK
)
12219 bool found_subtype
= true;
12220 /* Loop through each token until we find one that isn't a
12222 while (found_subtype
)
12224 found_subtype
= false;
12225 const aarch64_branch_protect_type
*subtype
= type
->subtypes
;
12226 /* Search for the subtype. */
12227 while (str
&& subtype
&& subtype
->name
&& !found_subtype
12228 && res
== AARCH64_PARSE_OK
)
12230 if (strcmp (str
, subtype
->name
) == 0)
12232 found_subtype
= true;
12233 res
= subtype
->handler (str
, next_str
);
12235 next_str
= strtok_r (NULL
, "+", &token_save
);
12243 res
= AARCH64_PARSE_INVALID_ARG
;
12246 /* Copy the last processed token into the argument to pass it back.
12247 Used by option and attribute validation to print the offending token. */
12250 if (str
) strcpy (*last_str
, str
);
12251 else *last_str
= NULL
;
12253 if (res
== AARCH64_PARSE_OK
)
12255 /* If needed, alloc the accepted string then copy in const_str.
12256 Used by override_option_after_change_1. */
12257 if (!accepted_branch_protection_string
)
12258 accepted_branch_protection_string
= (char *) xmalloc (
12259 BRANCH_PROTECT_STR_MAX
12261 strncpy (accepted_branch_protection_string
, const_str
,
12262 BRANCH_PROTECT_STR_MAX
+ 1);
12263 /* Forcibly null-terminate. */
12264 accepted_branch_protection_string
[BRANCH_PROTECT_STR_MAX
] = '\0';
12270 aarch64_validate_mbranch_protection (const char *const_str
)
12272 char *str
= (char *) xmalloc (strlen (const_str
));
12273 enum aarch64_parse_opt_result res
=
12274 aarch64_parse_branch_protection (const_str
, &str
);
12275 if (res
== AARCH64_PARSE_INVALID_ARG
)
12276 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str
);
12277 else if (res
== AARCH64_PARSE_MISSING_ARG
)
12278 error ("missing argument for %<-mbranch-protection=%>");
12280 return res
== AARCH64_PARSE_OK
;
12283 /* Validate a command-line -march option. Parse the arch and extensions
12284 (if any) specified in STR and throw errors if appropriate. Put the
12285 results, if they are valid, in RES and ISA_FLAGS. Return whether the
12286 option is valid. */
12289 aarch64_validate_march (const char *str
, const struct processor
**res
,
12290 uint64_t *isa_flags
)
12292 std::string invalid_extension
;
12293 enum aarch64_parse_opt_result parse_res
12294 = aarch64_parse_arch (str
, res
, isa_flags
, &invalid_extension
);
12296 if (parse_res
== AARCH64_PARSE_OK
)
12301 case AARCH64_PARSE_MISSING_ARG
:
12302 error ("missing arch name in %<-march=%s%>", str
);
12304 case AARCH64_PARSE_INVALID_ARG
:
12305 error ("unknown value %qs for %<-march%>", str
);
12306 aarch64_print_hint_for_arch (str
);
12308 case AARCH64_PARSE_INVALID_FEATURE
:
12309 error ("invalid feature modifier %qs in %<-march=%s%>",
12310 invalid_extension
.c_str (), str
);
12311 aarch64_print_hint_for_extensions (invalid_extension
);
12314 gcc_unreachable ();
12320 /* Validate a command-line -mtune option. Parse the cpu
12321 specified in STR and throw errors if appropriate. Put the
12322 result, if it is valid, in RES. Return whether the option is
12326 aarch64_validate_mtune (const char *str
, const struct processor
**res
)
12328 enum aarch64_parse_opt_result parse_res
12329 = aarch64_parse_tune (str
, res
);
12331 if (parse_res
== AARCH64_PARSE_OK
)
12336 case AARCH64_PARSE_MISSING_ARG
:
12337 error ("missing cpu name in %<-mtune=%s%>", str
);
12339 case AARCH64_PARSE_INVALID_ARG
:
12340 error ("unknown value %qs for %<-mtune%>", str
);
12341 aarch64_print_hint_for_core (str
);
12344 gcc_unreachable ();
12349 /* Return the CPU corresponding to the enum CPU.
12350 If it doesn't specify a cpu, return the default. */
12352 static const struct processor
*
12353 aarch64_get_tune_cpu (enum aarch64_processor cpu
)
12355 if (cpu
!= aarch64_none
)
12356 return &all_cores
[cpu
];
12358 /* The & 0x3f is to extract the bottom 6 bits that encode the
12359 default cpu as selected by the --with-cpu GCC configure option
12361 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
12362 flags mechanism should be reworked to make it more sane. */
12363 return &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
12366 /* Return the architecture corresponding to the enum ARCH.
12367 If it doesn't specify a valid architecture, return the default. */
12369 static const struct processor
*
12370 aarch64_get_arch (enum aarch64_arch arch
)
12372 if (arch
!= aarch64_no_arch
)
12373 return &all_architectures
[arch
];
12375 const struct processor
*cpu
= &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
12377 return &all_architectures
[cpu
->arch
];
12380 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
12383 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value
)
12385 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
12386 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
12387 deciding which .md file patterns to use and when deciding whether
12388 something is a legitimate address or constant. */
12389 if (value
== SVE_SCALABLE
|| value
== SVE_128
)
12390 return poly_uint16 (2, 2);
12392 return (int) value
/ 64;
12395 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
12396 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
12397 tuning structs. In particular it must set selected_tune and
12398 aarch64_isa_flags that define the available ISA features and tuning
12399 decisions. It must also set selected_arch as this will be used to
12400 output the .arch asm tags for each function. */
12403 aarch64_override_options (void)
12405 uint64_t cpu_isa
= 0;
12406 uint64_t arch_isa
= 0;
12407 aarch64_isa_flags
= 0;
12409 bool valid_cpu
= true;
12410 bool valid_tune
= true;
12411 bool valid_arch
= true;
12413 selected_cpu
= NULL
;
12414 selected_arch
= NULL
;
12415 selected_tune
= NULL
;
12417 if (aarch64_branch_protection_string
)
12418 aarch64_validate_mbranch_protection (aarch64_branch_protection_string
);
12420 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
12421 If either of -march or -mtune is given, they override their
12422 respective component of -mcpu. */
12423 if (aarch64_cpu_string
)
12424 valid_cpu
= aarch64_validate_mcpu (aarch64_cpu_string
, &selected_cpu
,
12427 if (aarch64_arch_string
)
12428 valid_arch
= aarch64_validate_march (aarch64_arch_string
, &selected_arch
,
12431 if (aarch64_tune_string
)
12432 valid_tune
= aarch64_validate_mtune (aarch64_tune_string
, &selected_tune
);
12434 #ifdef SUBTARGET_OVERRIDE_OPTIONS
12435 SUBTARGET_OVERRIDE_OPTIONS
;
12438 /* If the user did not specify a processor, choose the default
12439 one for them. This will be the CPU set during configuration using
12440 --with-cpu, otherwise it is "generic". */
12445 selected_cpu
= &all_cores
[selected_arch
->ident
];
12446 aarch64_isa_flags
= arch_isa
;
12447 explicit_arch
= selected_arch
->arch
;
12451 /* Get default configure-time CPU. */
12452 selected_cpu
= aarch64_get_tune_cpu (aarch64_none
);
12453 aarch64_isa_flags
= TARGET_CPU_DEFAULT
>> 6;
12457 explicit_tune_core
= selected_tune
->ident
;
12459 /* If both -mcpu and -march are specified check that they are architecturally
12460 compatible, warn if they're not and prefer the -march ISA flags. */
12461 else if (selected_arch
)
12463 if (selected_arch
->arch
!= selected_cpu
->arch
)
12465 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
12466 all_architectures
[selected_cpu
->arch
].name
,
12467 selected_arch
->name
);
12469 aarch64_isa_flags
= arch_isa
;
12470 explicit_arch
= selected_arch
->arch
;
12471 explicit_tune_core
= selected_tune
? selected_tune
->ident
12472 : selected_cpu
->ident
;
12476 /* -mcpu but no -march. */
12477 aarch64_isa_flags
= cpu_isa
;
12478 explicit_tune_core
= selected_tune
? selected_tune
->ident
12479 : selected_cpu
->ident
;
12480 gcc_assert (selected_cpu
);
12481 selected_arch
= &all_architectures
[selected_cpu
->arch
];
12482 explicit_arch
= selected_arch
->arch
;
12485 /* Set the arch as well as we will need it when outputing
12486 the .arch directive in assembly. */
12487 if (!selected_arch
)
12489 gcc_assert (selected_cpu
);
12490 selected_arch
= &all_architectures
[selected_cpu
->arch
];
12493 if (!selected_tune
)
12494 selected_tune
= selected_cpu
;
12496 if (aarch64_enable_bti
== 2)
12498 #ifdef TARGET_ENABLE_BTI
12499 aarch64_enable_bti
= 1;
12501 aarch64_enable_bti
= 0;
12505 /* Return address signing is currently not supported for ILP32 targets. For
12506 LP64 targets use the configured option in the absence of a command-line
12507 option for -mbranch-protection. */
12508 if (!TARGET_ILP32
&& accepted_branch_protection_string
== NULL
)
12510 #ifdef TARGET_ENABLE_PAC_RET
12511 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NON_LEAF
;
12513 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NONE
;
12517 #ifndef HAVE_AS_MABI_OPTION
12518 /* The compiler may have been configured with 2.23.* binutils, which does
12519 not have support for ILP32. */
12521 error ("assembler does not support %<-mabi=ilp32%>");
12524 /* Convert -msve-vector-bits to a VG count. */
12525 aarch64_sve_vg
= aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits
);
12527 if (aarch64_ra_sign_scope
!= AARCH64_FUNCTION_NONE
&& TARGET_ILP32
)
12528 sorry ("return address signing is only supported for %<-mabi=lp64%>");
12530 /* Make sure we properly set up the explicit options. */
12531 if ((aarch64_cpu_string
&& valid_cpu
)
12532 || (aarch64_tune_string
&& valid_tune
))
12533 gcc_assert (explicit_tune_core
!= aarch64_none
);
12535 if ((aarch64_cpu_string
&& valid_cpu
)
12536 || (aarch64_arch_string
&& valid_arch
))
12537 gcc_assert (explicit_arch
!= aarch64_no_arch
);
12539 /* The pass to insert speculation tracking runs before
12540 shrink-wrapping and the latter does not know how to update the
12541 tracking status. So disable it in this case. */
12542 if (aarch64_track_speculation
)
12543 flag_shrink_wrap
= 0;
12545 aarch64_override_options_internal (&global_options
);
12547 /* Save these options as the default ones in case we push and pop them later
12548 while processing functions with potential target attributes. */
12549 target_option_default_node
= target_option_current_node
12550 = build_target_option_node (&global_options
);
12553 /* Implement targetm.override_options_after_change. */
12556 aarch64_override_options_after_change (void)
12558 aarch64_override_options_after_change_1 (&global_options
);
12561 static struct machine_function
*
12562 aarch64_init_machine_status (void)
12564 struct machine_function
*machine
;
12565 machine
= ggc_cleared_alloc
<machine_function
> ();
12570 aarch64_init_expanders (void)
12572 init_machine_status
= aarch64_init_machine_status
;
12575 /* A checking mechanism for the implementation of the various code models. */
12577 initialize_aarch64_code_model (struct gcc_options
*opts
)
12579 if (opts
->x_flag_pic
)
12581 switch (opts
->x_aarch64_cmodel_var
)
12583 case AARCH64_CMODEL_TINY
:
12584 aarch64_cmodel
= AARCH64_CMODEL_TINY_PIC
;
12586 case AARCH64_CMODEL_SMALL
:
12587 #ifdef HAVE_AS_SMALL_PIC_RELOCS
12588 aarch64_cmodel
= (flag_pic
== 2
12589 ? AARCH64_CMODEL_SMALL_PIC
12590 : AARCH64_CMODEL_SMALL_SPIC
);
12592 aarch64_cmodel
= AARCH64_CMODEL_SMALL_PIC
;
12595 case AARCH64_CMODEL_LARGE
:
12596 sorry ("code model %qs with %<-f%s%>", "large",
12597 opts
->x_flag_pic
> 1 ? "PIC" : "pic");
12600 gcc_unreachable ();
12604 aarch64_cmodel
= opts
->x_aarch64_cmodel_var
;
12607 /* Implement TARGET_OPTION_SAVE. */
12610 aarch64_option_save (struct cl_target_option
*ptr
, struct gcc_options
*opts
)
12612 ptr
->x_aarch64_override_tune_string
= opts
->x_aarch64_override_tune_string
;
12613 ptr
->x_aarch64_branch_protection_string
12614 = opts
->x_aarch64_branch_protection_string
;
12617 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
12618 using the information saved in PTR. */
12621 aarch64_option_restore (struct gcc_options
*opts
, struct cl_target_option
*ptr
)
12623 opts
->x_explicit_tune_core
= ptr
->x_explicit_tune_core
;
12624 selected_tune
= aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
12625 opts
->x_explicit_arch
= ptr
->x_explicit_arch
;
12626 selected_arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
12627 opts
->x_aarch64_override_tune_string
= ptr
->x_aarch64_override_tune_string
;
12628 opts
->x_aarch64_branch_protection_string
12629 = ptr
->x_aarch64_branch_protection_string
;
12630 if (opts
->x_aarch64_branch_protection_string
)
12632 aarch64_parse_branch_protection (opts
->x_aarch64_branch_protection_string
,
12636 aarch64_override_options_internal (opts
);
12639 /* Implement TARGET_OPTION_PRINT. */
12642 aarch64_option_print (FILE *file
, int indent
, struct cl_target_option
*ptr
)
12644 const struct processor
*cpu
12645 = aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
12646 uint64_t isa_flags
= ptr
->x_aarch64_isa_flags
;
12647 const struct processor
*arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
12648 std::string extension
12649 = aarch64_get_extension_string_for_isa_flags (isa_flags
, arch
->flags
);
12651 fprintf (file
, "%*sselected tune = %s\n", indent
, "", cpu
->name
);
12652 fprintf (file
, "%*sselected arch = %s%s\n", indent
, "",
12653 arch
->name
, extension
.c_str ());
12656 static GTY(()) tree aarch64_previous_fndecl
;
12659 aarch64_reset_previous_fndecl (void)
12661 aarch64_previous_fndecl
= NULL
;
12664 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
12665 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
12666 make sure optab availability predicates are recomputed when necessary. */
12669 aarch64_save_restore_target_globals (tree new_tree
)
12671 if (TREE_TARGET_GLOBALS (new_tree
))
12672 restore_target_globals (TREE_TARGET_GLOBALS (new_tree
));
12673 else if (new_tree
== target_option_default_node
)
12674 restore_target_globals (&default_target_globals
);
12676 TREE_TARGET_GLOBALS (new_tree
) = save_target_globals_default_opts ();
12679 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
12680 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
12681 of the function, if such exists. This function may be called multiple
12682 times on a single function so use aarch64_previous_fndecl to avoid
12683 setting up identical state. */
12686 aarch64_set_current_function (tree fndecl
)
12688 if (!fndecl
|| fndecl
== aarch64_previous_fndecl
)
12691 tree old_tree
= (aarch64_previous_fndecl
12692 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl
)
12695 tree new_tree
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
12697 /* If current function has no attributes but the previous one did,
12698 use the default node. */
12699 if (!new_tree
&& old_tree
)
12700 new_tree
= target_option_default_node
;
12702 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
12703 the default have been handled by aarch64_save_restore_target_globals from
12704 aarch64_pragma_target_parse. */
12705 if (old_tree
== new_tree
)
12708 aarch64_previous_fndecl
= fndecl
;
12710 /* First set the target options. */
12711 cl_target_option_restore (&global_options
, TREE_TARGET_OPTION (new_tree
));
12713 aarch64_save_restore_target_globals (new_tree
);
/* Enum describing the various ways we can handle attributes.
   In many cases we can reuse the generic option handling machinery.  */

enum aarch64_attr_opt_type
{
  aarch64_attr_mask,	/* Attribute should set a bit in target_flags.  */
  aarch64_attr_bool,	/* Attribute sets or unsets a boolean variable.  */
  aarch64_attr_enum,	/* Attribute sets an enum variable.  */
  aarch64_attr_custom	/* Attribute requires a custom handling function.  */
};
12727 /* All the information needed to handle a target attribute.
12728 NAME is the name of the attribute.
12729 ATTR_TYPE specifies the type of behavior of the attribute as described
12730 in the definition of enum aarch64_attr_opt_type.
12731 ALLOW_NEG is true if the attribute supports a "no-" form.
12732 HANDLER is the function that takes the attribute string as an argument
12733 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
12734 OPT_NUM is the enum specifying the option that the attribute modifies.
12735 This is needed for attributes that mirror the behavior of a command-line
12736 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
12737 aarch64_attr_enum. */
12739 struct aarch64_attribute_info
12742 enum aarch64_attr_opt_type attr_type
;
12744 bool (*handler
) (const char *);
12745 enum opt_code opt_num
;
12748 /* Handle the ARCH_STR argument to the arch= target attribute. */
12751 aarch64_handle_attr_arch (const char *str
)
12753 const struct processor
*tmp_arch
= NULL
;
12754 std::string invalid_extension
;
12755 enum aarch64_parse_opt_result parse_res
12756 = aarch64_parse_arch (str
, &tmp_arch
, &aarch64_isa_flags
, &invalid_extension
);
12758 if (parse_res
== AARCH64_PARSE_OK
)
12760 gcc_assert (tmp_arch
);
12761 selected_arch
= tmp_arch
;
12762 explicit_arch
= selected_arch
->arch
;
12768 case AARCH64_PARSE_MISSING_ARG
:
12769 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
12771 case AARCH64_PARSE_INVALID_ARG
:
12772 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str
);
12773 aarch64_print_hint_for_arch (str
);
12775 case AARCH64_PARSE_INVALID_FEATURE
:
12776 error ("invalid feature modifier %s of value (\"%s\") in "
12777 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
12778 aarch64_print_hint_for_extensions (invalid_extension
);
12781 gcc_unreachable ();
12787 /* Handle the argument CPU_STR to the cpu= target attribute. */
12790 aarch64_handle_attr_cpu (const char *str
)
12792 const struct processor
*tmp_cpu
= NULL
;
12793 std::string invalid_extension
;
12794 enum aarch64_parse_opt_result parse_res
12795 = aarch64_parse_cpu (str
, &tmp_cpu
, &aarch64_isa_flags
, &invalid_extension
);
12797 if (parse_res
== AARCH64_PARSE_OK
)
12799 gcc_assert (tmp_cpu
);
12800 selected_tune
= tmp_cpu
;
12801 explicit_tune_core
= selected_tune
->ident
;
12803 selected_arch
= &all_architectures
[tmp_cpu
->arch
];
12804 explicit_arch
= selected_arch
->arch
;
12810 case AARCH64_PARSE_MISSING_ARG
:
12811 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
12813 case AARCH64_PARSE_INVALID_ARG
:
12814 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str
);
12815 aarch64_print_hint_for_core (str
);
12817 case AARCH64_PARSE_INVALID_FEATURE
:
12818 error ("invalid feature modifier %s of value (\"%s\") in "
12819 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
12820 aarch64_print_hint_for_extensions (invalid_extension
);
12823 gcc_unreachable ();
12829 /* Handle the argument STR to the branch-protection= attribute. */
12832 aarch64_handle_attr_branch_protection (const char* str
)
12834 char *err_str
= (char *) xmalloc (strlen (str
));
12835 enum aarch64_parse_opt_result res
= aarch64_parse_branch_protection (str
,
12837 bool success
= false;
12840 case AARCH64_PARSE_MISSING_ARG
:
12841 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
12844 case AARCH64_PARSE_INVALID_ARG
:
12845 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
12846 "=\")%> pragma or attribute", err_str
);
12848 case AARCH64_PARSE_OK
:
12850 /* Fall through. */
12851 case AARCH64_PARSE_INVALID_FEATURE
:
12854 gcc_unreachable ();
12860 /* Handle the argument STR to the tune= target attribute. */
12863 aarch64_handle_attr_tune (const char *str
)
12865 const struct processor
*tmp_tune
= NULL
;
12866 enum aarch64_parse_opt_result parse_res
12867 = aarch64_parse_tune (str
, &tmp_tune
);
12869 if (parse_res
== AARCH64_PARSE_OK
)
12871 gcc_assert (tmp_tune
);
12872 selected_tune
= tmp_tune
;
12873 explicit_tune_core
= selected_tune
->ident
;
12879 case AARCH64_PARSE_INVALID_ARG
:
12880 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str
);
12881 aarch64_print_hint_for_core (str
);
12884 gcc_unreachable ();
12890 /* Parse an architecture extensions target attribute string specified in STR.
12891 For example "+fp+nosimd". Show any errors if needed. Return TRUE
12892 if successful. Update aarch64_isa_flags to reflect the ISA features
12896 aarch64_handle_attr_isa_flags (char *str
)
12898 enum aarch64_parse_opt_result parse_res
;
12899 uint64_t isa_flags
= aarch64_isa_flags
;
12901 /* We allow "+nothing" in the beginning to clear out all architectural
12902 features if the user wants to handpick specific features. */
12903 if (strncmp ("+nothing", str
, 8) == 0)
12909 std::string invalid_extension
;
12910 parse_res
= aarch64_parse_extension (str
, &isa_flags
, &invalid_extension
);
12912 if (parse_res
== AARCH64_PARSE_OK
)
12914 aarch64_isa_flags
= isa_flags
;
12920 case AARCH64_PARSE_MISSING_ARG
:
12921 error ("missing value in %<target()%> pragma or attribute");
12924 case AARCH64_PARSE_INVALID_FEATURE
:
12925 error ("invalid feature modifier %s of value (\"%s\") in "
12926 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
12930 gcc_unreachable ();
12936 /* The target attributes that we support. On top of these we also support just
12937 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
12938 handled explicitly in aarch64_process_one_target_attr. */
12940 static const struct aarch64_attribute_info aarch64_attributes
[] =
12942 { "general-regs-only", aarch64_attr_mask
, false, NULL
,
12943 OPT_mgeneral_regs_only
},
12944 { "fix-cortex-a53-835769", aarch64_attr_bool
, true, NULL
,
12945 OPT_mfix_cortex_a53_835769
},
12946 { "fix-cortex-a53-843419", aarch64_attr_bool
, true, NULL
,
12947 OPT_mfix_cortex_a53_843419
},
12948 { "cmodel", aarch64_attr_enum
, false, NULL
, OPT_mcmodel_
},
12949 { "strict-align", aarch64_attr_mask
, true, NULL
, OPT_mstrict_align
},
12950 { "omit-leaf-frame-pointer", aarch64_attr_bool
, true, NULL
,
12951 OPT_momit_leaf_frame_pointer
},
12952 { "tls-dialect", aarch64_attr_enum
, false, NULL
, OPT_mtls_dialect_
},
12953 { "arch", aarch64_attr_custom
, false, aarch64_handle_attr_arch
,
12955 { "cpu", aarch64_attr_custom
, false, aarch64_handle_attr_cpu
, OPT_mcpu_
},
12956 { "tune", aarch64_attr_custom
, false, aarch64_handle_attr_tune
,
12958 { "branch-protection", aarch64_attr_custom
, false,
12959 aarch64_handle_attr_branch_protection
, OPT_mbranch_protection_
},
12960 { "sign-return-address", aarch64_attr_enum
, false, NULL
,
12961 OPT_msign_return_address_
},
12962 { NULL
, aarch64_attr_custom
, false, NULL
, OPT____
}
12965 /* Parse ARG_STR which contains the definition of one target attribute.
12966 Show appropriate errors if any or return true if the attribute is valid. */
12969 aarch64_process_one_target_attr (char *arg_str
)
12971 bool invert
= false;
12973 size_t len
= strlen (arg_str
);
12977 error ("malformed %<target()%> pragma or attribute");
12981 char *str_to_check
= (char *) alloca (len
+ 1);
12982 strcpy (str_to_check
, arg_str
);
12984 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
12985 It is easier to detect and handle it explicitly here rather than going
12986 through the machinery for the rest of the target attributes in this
12988 if (*str_to_check
== '+')
12989 return aarch64_handle_attr_isa_flags (str_to_check
);
12991 if (len
> 3 && strncmp (str_to_check
, "no-", 3) == 0)
12996 char *arg
= strchr (str_to_check
, '=');
12998 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
12999 and point ARG to "foo". */
13005 const struct aarch64_attribute_info
*p_attr
;
13006 bool found
= false;
13007 for (p_attr
= aarch64_attributes
; p_attr
->name
; p_attr
++)
13009 /* If the names don't match up, or the user has given an argument
13010 to an attribute that doesn't accept one, or didn't give an argument
13011 to an attribute that expects one, fail to match. */
13012 if (strcmp (str_to_check
, p_attr
->name
) != 0)
13016 bool attr_need_arg_p
= p_attr
->attr_type
== aarch64_attr_custom
13017 || p_attr
->attr_type
== aarch64_attr_enum
;
13019 if (attr_need_arg_p
^ (arg
!= NULL
))
13021 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check
);
13025 /* If the name matches but the attribute does not allow "no-" versions
13026 then we can't match. */
13027 if (invert
&& !p_attr
->allow_neg
)
13029 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check
);
13033 switch (p_attr
->attr_type
)
13035 /* Has a custom handler registered.
13036 For example, cpu=, arch=, tune=. */
13037 case aarch64_attr_custom
:
13038 gcc_assert (p_attr
->handler
);
13039 if (!p_attr
->handler (arg
))
13043 /* Either set or unset a boolean option. */
13044 case aarch64_attr_bool
:
13046 struct cl_decoded_option decoded
;
13048 generate_option (p_attr
->opt_num
, NULL
, !invert
,
13049 CL_TARGET
, &decoded
);
13050 aarch64_handle_option (&global_options
, &global_options_set
,
13051 &decoded
, input_location
);
13054 /* Set or unset a bit in the target_flags. aarch64_handle_option
13055 should know what mask to apply given the option number. */
13056 case aarch64_attr_mask
:
13058 struct cl_decoded_option decoded
;
13059 /* We only need to specify the option number.
13060 aarch64_handle_option will know which mask to apply. */
13061 decoded
.opt_index
= p_attr
->opt_num
;
13062 decoded
.value
= !invert
;
13063 aarch64_handle_option (&global_options
, &global_options_set
,
13064 &decoded
, input_location
);
13067 /* Use the option setting machinery to set an option to an enum. */
13068 case aarch64_attr_enum
:
13073 valid
= opt_enum_arg_to_value (p_attr
->opt_num
, arg
,
13074 &value
, CL_TARGET
);
13077 set_option (&global_options
, NULL
, p_attr
->opt_num
, value
,
13078 NULL
, DK_UNSPECIFIED
, input_location
,
13083 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check
, arg
);
13088 gcc_unreachable ();
13092 /* If we reached here we either have found an attribute and validated
13093 it or didn't match any. If we matched an attribute but its arguments
13094 were malformed we will have returned false already. */
/* Count how many times the character C appears in
   NULL-terminated string STR.  (Name keeps the historical
   "occurences" spelling so existing callers are unaffected.)  */

static unsigned int
num_occurences_in_str (char c, char *str)
{
  unsigned int res = 0;
  while (*str != '\0')
    {
      if (*str == c)
	res++;

      str++;
    }

  return res;
}
13116 /* Parse the tree in ARGS that contains the target attribute information
13117 and update the global target options space. */
13120 aarch64_process_target_attr (tree args
)
13122 if (TREE_CODE (args
) == TREE_LIST
)
13126 tree head
= TREE_VALUE (args
);
13129 if (!aarch64_process_target_attr (head
))
13132 args
= TREE_CHAIN (args
);
13138 if (TREE_CODE (args
) != STRING_CST
)
13140 error ("attribute %<target%> argument not a string");
13144 size_t len
= strlen (TREE_STRING_POINTER (args
));
13145 char *str_to_check
= (char *) alloca (len
+ 1);
13146 strcpy (str_to_check
, TREE_STRING_POINTER (args
));
13150 error ("malformed %<target()%> pragma or attribute");
13154 /* Used to catch empty spaces between commas i.e.
13155 attribute ((target ("attr1,,attr2"))). */
13156 unsigned int num_commas
= num_occurences_in_str (',', str_to_check
);
13158 /* Handle multiple target attributes separated by ','. */
13159 char *token
= strtok_r (str_to_check
, ",", &str_to_check
);
13161 unsigned int num_attrs
= 0;
13165 if (!aarch64_process_one_target_attr (token
))
13167 error ("pragma or attribute %<target(\"%s\")%> is not valid", token
);
13171 token
= strtok_r (NULL
, ",", &str_to_check
);
13174 if (num_attrs
!= num_commas
+ 1)
13176 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args
));
13183 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
13184 process attribute ((target ("..."))). */
13187 aarch64_option_valid_attribute_p (tree fndecl
, tree
, tree args
, int)
13189 struct cl_target_option cur_target
;
13192 tree new_target
, new_optimize
;
13193 tree existing_target
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
13195 /* If what we're processing is the current pragma string then the
13196 target option node is already stored in target_option_current_node
13197 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
13198 having to re-parse the string. This is especially useful to keep
13199 arm_neon.h compile times down since that header contains a lot
13200 of intrinsics enclosed in pragmas. */
13201 if (!existing_target
&& args
== current_target_pragma
)
13203 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = target_option_current_node
;
13206 tree func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
13208 old_optimize
= build_optimization_node (&global_options
);
13209 func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
13211 /* If the function changed the optimization levels as well as setting
13212 target options, start with the optimizations specified. */
13213 if (func_optimize
&& func_optimize
!= old_optimize
)
13214 cl_optimization_restore (&global_options
,
13215 TREE_OPTIMIZATION (func_optimize
));
13217 /* Save the current target options to restore at the end. */
13218 cl_target_option_save (&cur_target
, &global_options
);
13220 /* If fndecl already has some target attributes applied to it, unpack
13221 them so that we add this attribute on top of them, rather than
13222 overwriting them. */
13223 if (existing_target
)
13225 struct cl_target_option
*existing_options
13226 = TREE_TARGET_OPTION (existing_target
);
13228 if (existing_options
)
13229 cl_target_option_restore (&global_options
, existing_options
);
13232 cl_target_option_restore (&global_options
,
13233 TREE_TARGET_OPTION (target_option_current_node
));
13235 ret
= aarch64_process_target_attr (args
);
13237 /* Set up any additional state. */
13240 aarch64_override_options_internal (&global_options
);
13241 /* Initialize SIMD builtins if we haven't already.
13242 Set current_target_pragma to NULL for the duration so that
13243 the builtin initialization code doesn't try to tag the functions
13244 being built with the attributes specified by any current pragma, thus
13245 going into an infinite recursion. */
13248 tree saved_current_target_pragma
= current_target_pragma
;
13249 current_target_pragma
= NULL
;
13250 aarch64_init_simd_builtins ();
13251 current_target_pragma
= saved_current_target_pragma
;
13253 new_target
= build_target_option_node (&global_options
);
13258 new_optimize
= build_optimization_node (&global_options
);
13262 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = new_target
;
13264 if (old_optimize
!= new_optimize
)
13265 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
) = new_optimize
;
13268 cl_target_option_restore (&global_options
, &cur_target
);
13270 if (old_optimize
!= new_optimize
)
13271 cl_optimization_restore (&global_options
,
13272 TREE_OPTIMIZATION (old_optimize
));
/* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
   tri-bool options (yes, no, don't care) and the default value is
   DEF, determine whether to reject inlining.  Returns true when
   inlining is acceptable.  */

static bool
aarch64_tribools_ok_for_inlining_p (int caller, int callee,
				     int dont_care, int def)
{
  /* If the callee doesn't care, always allow inlining.  */
  if (callee == dont_care)
    return true;

  /* If the caller doesn't care, always allow inlining.  */
  if (caller == dont_care)
    return true;

  /* Otherwise, allow inlining if either the callee and caller values
     agree, or if the callee is using the default value.  */
  return (callee == caller || callee == def);
}
13297 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
13298 to inline CALLEE into CALLER based on target-specific info.
13299 Make sure that the caller and callee have compatible architectural
13300 features. Then go through the other possible target attributes
13301 and see if they can block inlining. Try not to reject always_inline
13302 callees unless they are incompatible architecturally. */
13305 aarch64_can_inline_p (tree caller
, tree callee
)
13307 tree caller_tree
= DECL_FUNCTION_SPECIFIC_TARGET (caller
);
13308 tree callee_tree
= DECL_FUNCTION_SPECIFIC_TARGET (callee
);
13310 struct cl_target_option
*caller_opts
13311 = TREE_TARGET_OPTION (caller_tree
? caller_tree
13312 : target_option_default_node
);
13314 struct cl_target_option
*callee_opts
13315 = TREE_TARGET_OPTION (callee_tree
? callee_tree
13316 : target_option_default_node
);
13318 /* Callee's ISA flags should be a subset of the caller's. */
13319 if ((caller_opts
->x_aarch64_isa_flags
& callee_opts
->x_aarch64_isa_flags
)
13320 != callee_opts
->x_aarch64_isa_flags
)
13323 /* Allow non-strict aligned functions inlining into strict
13325 if ((TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)
13326 != TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
))
13327 && !(!TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
)
13328 && TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)))
13331 bool always_inline
= lookup_attribute ("always_inline",
13332 DECL_ATTRIBUTES (callee
));
13334 /* If the architectural features match up and the callee is always_inline
13335 then the other attributes don't matter. */
13339 if (caller_opts
->x_aarch64_cmodel_var
13340 != callee_opts
->x_aarch64_cmodel_var
)
13343 if (caller_opts
->x_aarch64_tls_dialect
13344 != callee_opts
->x_aarch64_tls_dialect
)
13347 /* Honour explicit requests to workaround errata. */
13348 if (!aarch64_tribools_ok_for_inlining_p (
13349 caller_opts
->x_aarch64_fix_a53_err835769
,
13350 callee_opts
->x_aarch64_fix_a53_err835769
,
13351 2, TARGET_FIX_ERR_A53_835769_DEFAULT
))
13354 if (!aarch64_tribools_ok_for_inlining_p (
13355 caller_opts
->x_aarch64_fix_a53_err843419
,
13356 callee_opts
->x_aarch64_fix_a53_err843419
,
13357 2, TARGET_FIX_ERR_A53_843419
))
13360 /* If the user explicitly specified -momit-leaf-frame-pointer for the
13361 caller and calle and they don't match up, reject inlining. */
13362 if (!aarch64_tribools_ok_for_inlining_p (
13363 caller_opts
->x_flag_omit_leaf_frame_pointer
,
13364 callee_opts
->x_flag_omit_leaf_frame_pointer
,
13368 /* If the callee has specific tuning overrides, respect them. */
13369 if (callee_opts
->x_aarch64_override_tune_string
!= NULL
13370 && caller_opts
->x_aarch64_override_tune_string
== NULL
)
13373 /* If the user specified tuning override strings for the
13374 caller and callee and they don't match up, reject inlining.
13375 We just do a string compare here, we don't analyze the meaning
13376 of the string, as it would be too costly for little gain. */
13377 if (callee_opts
->x_aarch64_override_tune_string
13378 && caller_opts
->x_aarch64_override_tune_string
13379 && (strcmp (callee_opts
->x_aarch64_override_tune_string
,
13380 caller_opts
->x_aarch64_override_tune_string
) != 0))
13386 /* Return true if SYMBOL_REF X binds locally. */
13389 aarch64_symbol_binds_local_p (const_rtx x
)
13391 return (SYMBOL_REF_DECL (x
)
13392 ? targetm
.binds_local_p (SYMBOL_REF_DECL (x
))
13393 : SYMBOL_REF_LOCAL_P (x
));
13396 /* Return true if SYMBOL_REF X is thread local */
13398 aarch64_tls_symbol_p (rtx x
)
13400 if (! TARGET_HAVE_TLS
)
13403 if (GET_CODE (x
) != SYMBOL_REF
)
13406 return SYMBOL_REF_TLS_MODEL (x
) != 0;
13409 /* Classify a TLS symbol into one of the TLS kinds. */
13410 enum aarch64_symbol_type
13411 aarch64_classify_tls_symbol (rtx x
)
13413 enum tls_model tls_kind
= tls_symbolic_operand_type (x
);
13417 case TLS_MODEL_GLOBAL_DYNAMIC
:
13418 case TLS_MODEL_LOCAL_DYNAMIC
:
13419 return TARGET_TLS_DESC
? SYMBOL_SMALL_TLSDESC
: SYMBOL_SMALL_TLSGD
;
13421 case TLS_MODEL_INITIAL_EXEC
:
13422 switch (aarch64_cmodel
)
13424 case AARCH64_CMODEL_TINY
:
13425 case AARCH64_CMODEL_TINY_PIC
:
13426 return SYMBOL_TINY_TLSIE
;
13428 return SYMBOL_SMALL_TLSIE
;
13431 case TLS_MODEL_LOCAL_EXEC
:
13432 if (aarch64_tls_size
== 12)
13433 return SYMBOL_TLSLE12
;
13434 else if (aarch64_tls_size
== 24)
13435 return SYMBOL_TLSLE24
;
13436 else if (aarch64_tls_size
== 32)
13437 return SYMBOL_TLSLE32
;
13438 else if (aarch64_tls_size
== 48)
13439 return SYMBOL_TLSLE48
;
13441 gcc_unreachable ();
13443 case TLS_MODEL_EMULATED
:
13444 case TLS_MODEL_NONE
:
13445 return SYMBOL_FORCE_TO_MEM
;
13448 gcc_unreachable ();
13452 /* Return the correct method for accessing X + OFFSET, where X is either
13453 a SYMBOL_REF or LABEL_REF. */
13455 enum aarch64_symbol_type
13456 aarch64_classify_symbol (rtx x
, HOST_WIDE_INT offset
)
13458 if (GET_CODE (x
) == LABEL_REF
)
13460 switch (aarch64_cmodel
)
13462 case AARCH64_CMODEL_LARGE
:
13463 return SYMBOL_FORCE_TO_MEM
;
13465 case AARCH64_CMODEL_TINY_PIC
:
13466 case AARCH64_CMODEL_TINY
:
13467 return SYMBOL_TINY_ABSOLUTE
;
13469 case AARCH64_CMODEL_SMALL_SPIC
:
13470 case AARCH64_CMODEL_SMALL_PIC
:
13471 case AARCH64_CMODEL_SMALL
:
13472 return SYMBOL_SMALL_ABSOLUTE
;
13475 gcc_unreachable ();
13479 if (GET_CODE (x
) == SYMBOL_REF
)
13481 if (aarch64_tls_symbol_p (x
))
13482 return aarch64_classify_tls_symbol (x
);
13484 switch (aarch64_cmodel
)
13486 case AARCH64_CMODEL_TINY
:
13487 /* When we retrieve symbol + offset address, we have to make sure
13488 the offset does not cause overflow of the final address. But
13489 we have no way of knowing the address of symbol at compile time
13490 so we can't accurately say if the distance between the PC and
13491 symbol + offset is outside the addressible range of +/-1M in the
13492 TINY code model. So we rely on images not being greater than
13493 1M and cap the offset at 1M and anything beyond 1M will have to
13494 be loaded using an alternative mechanism. Furthermore if the
13495 symbol is a weak reference to something that isn't known to
13496 resolve to a symbol in this module, then force to memory. */
13497 if ((SYMBOL_REF_WEAK (x
)
13498 && !aarch64_symbol_binds_local_p (x
))
13499 || !IN_RANGE (offset
, -1048575, 1048575))
13500 return SYMBOL_FORCE_TO_MEM
;
13501 return SYMBOL_TINY_ABSOLUTE
;
13503 case AARCH64_CMODEL_SMALL
:
13504 /* Same reasoning as the tiny code model, but the offset cap here is
13506 if ((SYMBOL_REF_WEAK (x
)
13507 && !aarch64_symbol_binds_local_p (x
))
13508 || !IN_RANGE (offset
, HOST_WIDE_INT_C (-4294967263),
13509 HOST_WIDE_INT_C (4294967264)))
13510 return SYMBOL_FORCE_TO_MEM
;
13511 return SYMBOL_SMALL_ABSOLUTE
;
13513 case AARCH64_CMODEL_TINY_PIC
:
13514 if (!aarch64_symbol_binds_local_p (x
))
13515 return SYMBOL_TINY_GOT
;
13516 return SYMBOL_TINY_ABSOLUTE
;
13518 case AARCH64_CMODEL_SMALL_SPIC
:
13519 case AARCH64_CMODEL_SMALL_PIC
:
13520 if (!aarch64_symbol_binds_local_p (x
))
13521 return (aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
13522 ? SYMBOL_SMALL_GOT_28K
: SYMBOL_SMALL_GOT_4G
);
13523 return SYMBOL_SMALL_ABSOLUTE
;
13525 case AARCH64_CMODEL_LARGE
:
13526 /* This is alright even in PIC code as the constant
13527 pool reference is always PC relative and within
13528 the same translation unit. */
13529 if (!aarch64_pcrelative_literal_loads
&& CONSTANT_POOL_ADDRESS_P (x
))
13530 return SYMBOL_SMALL_ABSOLUTE
;
13532 return SYMBOL_FORCE_TO_MEM
;
13535 gcc_unreachable ();
13539 /* By default push everything into the constant pool. */
13540 return SYMBOL_FORCE_TO_MEM
;
13544 aarch64_constant_address_p (rtx x
)
13546 return (CONSTANT_P (x
) && memory_address_p (DImode
, x
));
13550 aarch64_legitimate_pic_operand_p (rtx x
)
13552 if (GET_CODE (x
) == SYMBOL_REF
13553 || (GET_CODE (x
) == CONST
13554 && GET_CODE (XEXP (x
, 0)) == PLUS
13555 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
))
13561 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
13562 that should be rematerialized rather than spilled. */
13565 aarch64_legitimate_constant_p (machine_mode mode
, rtx x
)
13567 /* Support CSE and rematerialization of common constants. */
13568 if (CONST_INT_P (x
)
13569 || (CONST_DOUBLE_P (x
) && GET_MODE_CLASS (mode
) == MODE_FLOAT
)
13570 || GET_CODE (x
) == CONST_VECTOR
)
13573 /* Do not allow vector struct mode constants for Advanced SIMD.
13574 We could support 0 and -1 easily, but they need support in
13575 aarch64-simd.md. */
13576 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
13577 if (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
13580 /* Only accept variable-length vector constants if they can be
13583 ??? It would be possible to handle rematerialization of other
13584 constants via secondary reloads. */
13585 if (vec_flags
& VEC_ANY_SVE
)
13586 return aarch64_simd_valid_immediate (x
, NULL
);
13588 if (GET_CODE (x
) == HIGH
)
13591 /* Accept polynomial constants that can be calculated by using the
13592 destination of a move as the sole temporary. Constants that
13593 require a second temporary cannot be rematerialized (they can't be
13594 forced to memory and also aren't legitimate constants). */
13596 if (poly_int_rtx_p (x
, &offset
))
13597 return aarch64_offset_temporaries (false, offset
) <= 1;
13599 /* If an offset is being added to something else, we need to allow the
13600 base to be moved into the destination register, meaning that there
13601 are no free temporaries for the offset. */
13602 x
= strip_offset (x
, &offset
);
13603 if (!offset
.is_constant () && aarch64_offset_temporaries (true, offset
) > 0)
13606 /* Do not allow const (plus (anchor_symbol, const_int)). */
13607 if (maybe_ne (offset
, 0) && SYMBOL_REF_P (x
) && SYMBOL_REF_ANCHOR_P (x
))
13610 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
13611 so spilling them is better than rematerialization. */
13612 if (SYMBOL_REF_P (x
) && !SYMBOL_REF_TLS_MODEL (x
))
13615 /* Label references are always constant. */
13616 if (GET_CODE (x
) == LABEL_REF
)
13623 aarch64_load_tp (rtx target
)
13626 || GET_MODE (target
) != Pmode
13627 || !register_operand (target
, Pmode
))
13628 target
= gen_reg_rtx (Pmode
);
13630 /* Can return in any reg. */
13631 emit_insn (gen_aarch64_load_tp_hard (target
));
13635 /* On AAPCS systems, this is the "struct __va_list". */
13636 static GTY(()) tree va_list_type
;
13638 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
13639 Return the type to use as __builtin_va_list.
13641 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
13653 aarch64_build_builtin_va_list (void)
13656 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
13658 /* Create the type. */
13659 va_list_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
13660 /* Give it the required name. */
13661 va_list_name
= build_decl (BUILTINS_LOCATION
,
13663 get_identifier ("__va_list"),
13665 DECL_ARTIFICIAL (va_list_name
) = 1;
13666 TYPE_NAME (va_list_type
) = va_list_name
;
13667 TYPE_STUB_DECL (va_list_type
) = va_list_name
;
13669 /* Create the fields. */
13670 f_stack
= build_decl (BUILTINS_LOCATION
,
13671 FIELD_DECL
, get_identifier ("__stack"),
13673 f_grtop
= build_decl (BUILTINS_LOCATION
,
13674 FIELD_DECL
, get_identifier ("__gr_top"),
13676 f_vrtop
= build_decl (BUILTINS_LOCATION
,
13677 FIELD_DECL
, get_identifier ("__vr_top"),
13679 f_groff
= build_decl (BUILTINS_LOCATION
,
13680 FIELD_DECL
, get_identifier ("__gr_offs"),
13681 integer_type_node
);
13682 f_vroff
= build_decl (BUILTINS_LOCATION
,
13683 FIELD_DECL
, get_identifier ("__vr_offs"),
13684 integer_type_node
);
13686 /* Tell tree-stdarg pass about our internal offset fields.
13687 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
13688 purpose to identify whether the code is updating va_list internal
13689 offset fields through irregular way. */
13690 va_list_gpr_counter_field
= f_groff
;
13691 va_list_fpr_counter_field
= f_vroff
;
13693 DECL_ARTIFICIAL (f_stack
) = 1;
13694 DECL_ARTIFICIAL (f_grtop
) = 1;
13695 DECL_ARTIFICIAL (f_vrtop
) = 1;
13696 DECL_ARTIFICIAL (f_groff
) = 1;
13697 DECL_ARTIFICIAL (f_vroff
) = 1;
13699 DECL_FIELD_CONTEXT (f_stack
) = va_list_type
;
13700 DECL_FIELD_CONTEXT (f_grtop
) = va_list_type
;
13701 DECL_FIELD_CONTEXT (f_vrtop
) = va_list_type
;
13702 DECL_FIELD_CONTEXT (f_groff
) = va_list_type
;
13703 DECL_FIELD_CONTEXT (f_vroff
) = va_list_type
;
13705 TYPE_FIELDS (va_list_type
) = f_stack
;
13706 DECL_CHAIN (f_stack
) = f_grtop
;
13707 DECL_CHAIN (f_grtop
) = f_vrtop
;
13708 DECL_CHAIN (f_vrtop
) = f_groff
;
13709 DECL_CHAIN (f_groff
) = f_vroff
;
13711 /* Compute its layout. */
13712 layout_type (va_list_type
);
13714 return va_list_type
;
13717 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
13719 aarch64_expand_builtin_va_start (tree valist
, rtx nextarg ATTRIBUTE_UNUSED
)
13721 const CUMULATIVE_ARGS
*cum
;
13722 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
13723 tree stack
, grtop
, vrtop
, groff
, vroff
;
13725 int gr_save_area_size
= cfun
->va_list_gpr_size
;
13726 int vr_save_area_size
= cfun
->va_list_fpr_size
;
13729 cum
= &crtl
->args
.info
;
13730 if (cfun
->va_list_gpr_size
)
13731 gr_save_area_size
= MIN ((NUM_ARG_REGS
- cum
->aapcs_ncrn
) * UNITS_PER_WORD
,
13732 cfun
->va_list_gpr_size
);
13733 if (cfun
->va_list_fpr_size
)
13734 vr_save_area_size
= MIN ((NUM_FP_ARG_REGS
- cum
->aapcs_nvrn
)
13735 * UNITS_PER_VREG
, cfun
->va_list_fpr_size
);
13739 gcc_assert (cum
->aapcs_nvrn
== 0);
13740 vr_save_area_size
= 0;
13743 f_stack
= TYPE_FIELDS (va_list_type_node
);
13744 f_grtop
= DECL_CHAIN (f_stack
);
13745 f_vrtop
= DECL_CHAIN (f_grtop
);
13746 f_groff
= DECL_CHAIN (f_vrtop
);
13747 f_vroff
= DECL_CHAIN (f_groff
);
13749 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), valist
, f_stack
,
13751 grtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
), valist
, f_grtop
,
13753 vrtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
), valist
, f_vrtop
,
13755 groff
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
), valist
, f_groff
,
13757 vroff
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
), valist
, f_vroff
,
13760 /* Emit code to initialize STACK, which points to the next varargs stack
13761 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
13762 by named arguments. STACK is 8-byte aligned. */
13763 t
= make_tree (TREE_TYPE (stack
), virtual_incoming_args_rtx
);
13764 if (cum
->aapcs_stack_size
> 0)
13765 t
= fold_build_pointer_plus_hwi (t
, cum
->aapcs_stack_size
* UNITS_PER_WORD
);
13766 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), stack
, t
);
13767 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
13769 /* Emit code to initialize GRTOP, the top of the GR save area.
13770 virtual_incoming_args_rtx should have been 16 byte aligned. */
13771 t
= make_tree (TREE_TYPE (grtop
), virtual_incoming_args_rtx
);
13772 t
= build2 (MODIFY_EXPR
, TREE_TYPE (grtop
), grtop
, t
);
13773 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
13775 /* Emit code to initialize VRTOP, the top of the VR save area.
13776 This address is gr_save_area_bytes below GRTOP, rounded
13777 down to the next 16-byte boundary. */
13778 t
= make_tree (TREE_TYPE (vrtop
), virtual_incoming_args_rtx
);
13779 vr_offset
= ROUND_UP (gr_save_area_size
,
13780 STACK_BOUNDARY
/ BITS_PER_UNIT
);
13783 t
= fold_build_pointer_plus_hwi (t
, -vr_offset
);
13784 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vrtop
), vrtop
, t
);
13785 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
13787 /* Emit code to initialize GROFF, the offset from GRTOP of the
13788 next GPR argument. */
13789 t
= build2 (MODIFY_EXPR
, TREE_TYPE (groff
), groff
,
13790 build_int_cst (TREE_TYPE (groff
), -gr_save_area_size
));
13791 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
13793 /* Likewise emit code to initialize VROFF, the offset from FTOP
13794 of the next VR argument. */
13795 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vroff
), vroff
,
13796 build_int_cst (TREE_TYPE (vroff
), -vr_save_area_size
));
13797 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
13800 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
13803 aarch64_gimplify_va_arg_expr (tree valist
, tree type
, gimple_seq
*pre_p
,
13804 gimple_seq
*post_p ATTRIBUTE_UNUSED
)
13808 bool is_ha
; /* is HFA or HVA. */
13809 bool dw_align
; /* double-word align. */
13810 machine_mode ag_mode
= VOIDmode
;
13814 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
13815 tree stack
, f_top
, f_off
, off
, arg
, roundup
, on_stack
;
13816 HOST_WIDE_INT size
, rsize
, adjust
, align
;
13817 tree t
, u
, cond1
, cond2
;
13819 indirect_p
= pass_by_reference (NULL
, TYPE_MODE (type
), type
, false);
13821 type
= build_pointer_type (type
);
13823 mode
= TYPE_MODE (type
);
13825 f_stack
= TYPE_FIELDS (va_list_type_node
);
13826 f_grtop
= DECL_CHAIN (f_stack
);
13827 f_vrtop
= DECL_CHAIN (f_grtop
);
13828 f_groff
= DECL_CHAIN (f_vrtop
);
13829 f_vroff
= DECL_CHAIN (f_groff
);
13831 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), unshare_expr (valist
),
13832 f_stack
, NULL_TREE
);
13833 size
= int_size_in_bytes (type
);
13837 = aarch64_function_arg_alignment (mode
, type
, &abi_break
) / BITS_PER_UNIT
;
13841 if (aarch64_vfp_is_call_or_return_candidate (mode
,
13847 /* No frontends can create types with variable-sized modes, so we
13848 shouldn't be asked to pass or return them. */
13849 unsigned int ag_size
= GET_MODE_SIZE (ag_mode
).to_constant ();
13851 /* TYPE passed in fp/simd registers. */
13853 aarch64_err_no_fpadvsimd (mode
);
13855 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
),
13856 unshare_expr (valist
), f_vrtop
, NULL_TREE
);
13857 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
),
13858 unshare_expr (valist
), f_vroff
, NULL_TREE
);
13860 rsize
= nregs
* UNITS_PER_VREG
;
13864 if (BYTES_BIG_ENDIAN
&& ag_size
< UNITS_PER_VREG
)
13865 adjust
= UNITS_PER_VREG
- ag_size
;
13867 else if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
13868 && size
< UNITS_PER_VREG
)
13870 adjust
= UNITS_PER_VREG
- size
;
13875 /* TYPE passed in general registers. */
13876 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
),
13877 unshare_expr (valist
), f_grtop
, NULL_TREE
);
13878 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
),
13879 unshare_expr (valist
), f_groff
, NULL_TREE
);
13880 rsize
= ROUND_UP (size
, UNITS_PER_WORD
);
13881 nregs
= rsize
/ UNITS_PER_WORD
;
13885 if (abi_break
&& warn_psabi
)
13886 inform (input_location
, "parameter passing for argument of type "
13887 "%qT changed in GCC 9.1", type
);
13891 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
13892 && size
< UNITS_PER_WORD
)
13894 adjust
= UNITS_PER_WORD
- size
;
13898 /* Get a local temporary for the field value. */
13899 off
= get_initialized_tmp_var (f_off
, pre_p
, NULL
);
13901 /* Emit code to branch if off >= 0. */
13902 t
= build2 (GE_EXPR
, boolean_type_node
, off
,
13903 build_int_cst (TREE_TYPE (off
), 0));
13904 cond1
= build3 (COND_EXPR
, ptr_type_node
, t
, NULL_TREE
, NULL_TREE
);
13908 /* Emit: offs = (offs + 15) & -16. */
13909 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
13910 build_int_cst (TREE_TYPE (off
), 15));
13911 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (off
), t
,
13912 build_int_cst (TREE_TYPE (off
), -16));
13913 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (off
), off
, t
);
13918 /* Update ap.__[g|v]r_offs */
13919 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
13920 build_int_cst (TREE_TYPE (off
), rsize
));
13921 t
= build2 (MODIFY_EXPR
, TREE_TYPE (f_off
), unshare_expr (f_off
), t
);
13925 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
13927 /* [cond2] if (ap.__[g|v]r_offs > 0) */
13928 u
= build2 (GT_EXPR
, boolean_type_node
, unshare_expr (f_off
),
13929 build_int_cst (TREE_TYPE (f_off
), 0));
13930 cond2
= build3 (COND_EXPR
, ptr_type_node
, u
, NULL_TREE
, NULL_TREE
);
13932 /* String up: make sure the assignment happens before the use. */
13933 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (cond2
), t
, cond2
);
13934 COND_EXPR_ELSE (cond1
) = t
;
13936 /* Prepare the trees handling the argument that is passed on the stack;
13937 the top level node will store in ON_STACK. */
13938 arg
= get_initialized_tmp_var (stack
, pre_p
, NULL
);
13941 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
13942 t
= fold_build_pointer_plus_hwi (arg
, 15);
13943 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
13944 build_int_cst (TREE_TYPE (t
), -16));
13945 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (arg
), arg
, t
);
13949 /* Advance ap.__stack */
13950 t
= fold_build_pointer_plus_hwi (arg
, size
+ 7);
13951 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
13952 build_int_cst (TREE_TYPE (t
), -8));
13953 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), unshare_expr (stack
), t
);
13954 /* String up roundup and advance. */
13956 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
13957 /* String up with arg */
13958 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), t
, arg
);
13959 /* Big-endianness related address adjustment. */
13960 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
13961 && size
< UNITS_PER_WORD
)
13963 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (arg
), arg
,
13964 size_int (UNITS_PER_WORD
- size
));
13965 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), on_stack
, t
);
13968 COND_EXPR_THEN (cond1
) = unshare_expr (on_stack
);
13969 COND_EXPR_THEN (cond2
) = unshare_expr (on_stack
);
13971 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
13974 t
= build2 (PREINCREMENT_EXPR
, TREE_TYPE (off
), off
,
13975 build_int_cst (TREE_TYPE (off
), adjust
));
13977 t
= fold_convert (sizetype
, t
);
13978 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (f_top
), f_top
, t
);
13982 /* type ha; // treat as "struct {ftype field[n];}"
13983 ... [computing offs]
13984 for (i = 0; i <nregs; ++i, offs += 16)
13985 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
13988 tree tmp_ha
, field_t
, field_ptr_t
;
13990 /* Declare a local variable. */
13991 tmp_ha
= create_tmp_var_raw (type
, "ha");
13992 gimple_add_tmp_var (tmp_ha
);
13994 /* Establish the base type. */
13998 field_t
= float_type_node
;
13999 field_ptr_t
= float_ptr_type_node
;
14002 field_t
= double_type_node
;
14003 field_ptr_t
= double_ptr_type_node
;
14006 field_t
= long_double_type_node
;
14007 field_ptr_t
= long_double_ptr_type_node
;
14010 field_t
= aarch64_fp16_type_node
;
14011 field_ptr_t
= aarch64_fp16_ptr_type_node
;
14016 tree innertype
= make_signed_type (GET_MODE_PRECISION (SImode
));
14017 field_t
= build_vector_type_for_mode (innertype
, ag_mode
);
14018 field_ptr_t
= build_pointer_type (field_t
);
14025 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
14026 tmp_ha
= build1 (ADDR_EXPR
, field_ptr_t
, tmp_ha
);
14028 t
= fold_convert (field_ptr_t
, addr
);
14029 t
= build2 (MODIFY_EXPR
, field_t
,
14030 build1 (INDIRECT_REF
, field_t
, tmp_ha
),
14031 build1 (INDIRECT_REF
, field_t
, t
));
14033 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
14034 for (i
= 1; i
< nregs
; ++i
)
14036 addr
= fold_build_pointer_plus_hwi (addr
, UNITS_PER_VREG
);
14037 u
= fold_convert (field_ptr_t
, addr
);
14038 u
= build2 (MODIFY_EXPR
, field_t
,
14039 build2 (MEM_REF
, field_t
, tmp_ha
,
14040 build_int_cst (field_ptr_t
,
14042 int_size_in_bytes (field_t
)))),
14043 build1 (INDIRECT_REF
, field_t
, u
));
14044 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), t
, u
);
14047 u
= fold_convert (TREE_TYPE (f_top
), tmp_ha
);
14048 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (f_top
), t
, u
);
14051 COND_EXPR_ELSE (cond2
) = t
;
14052 addr
= fold_convert (build_pointer_type (type
), cond1
);
14053 addr
= build_va_arg_indirect_ref (addr
);
14056 addr
= build_va_arg_indirect_ref (addr
);
14061 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
/* NOTE(review): this chunk is a lossy extraction — original lines are split
   into fragments and some lines (braces, returns, TARGET_FLOAT guard) are
   missing.  Code below is kept byte-identical to the extract.  */
/* Saves the unnamed general-purpose and FP/SIMD argument registers into the
   varargs save area below virtual_incoming_args_rtx, honoring the register
   counts computed by tree-stdarg (cfun->va_list_{gpr,fpr}_size).  */
14064 aarch64_setup_incoming_varargs (cumulative_args_t cum_v
, machine_mode mode
,
14065 tree type
, int *pretend_size ATTRIBUTE_UNUSED
,
14068 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
14069 CUMULATIVE_ARGS local_cum
;
14070 int gr_saved
= cfun
->va_list_gpr_size
;
14071 int vr_saved
= cfun
->va_list_fpr_size
;
14073 /* The caller has advanced CUM up to, but not beyond, the last named
14074 argument. Advance a local copy of CUM past the last "real" named
14075 argument, to find out how many registers are left over. */
14077 aarch64_function_arg_advance (pack_cumulative_args(&local_cum
), mode
, type
, true);
14079 /* Found out how many registers we need to save.
14080 Honor tree-stdvar analysis results. */
14081 if (cfun
->va_list_gpr_size
)
14082 gr_saved
= MIN (NUM_ARG_REGS
- local_cum
.aapcs_ncrn
,
14083 cfun
->va_list_gpr_size
/ UNITS_PER_WORD
)
;
14084 if (cfun
->va_list_fpr_size
)
14085 vr_saved
= MIN (NUM_FP_ARG_REGS
- local_cum
.aapcs_nvrn
,
14086 cfun
->va_list_fpr_size
/ UNITS_PER_VREG
);
/* NOTE(review): presumably guarded by !TARGET_FLOAT in the missing lines —
   no FP arg regs may have been used when FP is disabled; confirm upstream.  */
14090 gcc_assert (local_cum
.aapcs_nvrn
== 0);
14100 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
14101 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
,
14102 - gr_saved
* UNITS_PER_WORD
);
14103 mem
= gen_frame_mem (BLKmode
, ptr
);
14104 set_mem_alias_set (mem
, get_varargs_alias_set ());
14106 move_block_from_reg (local_cum
.aapcs_ncrn
+ R0_REGNUM
,
14111 /* We can't use move_block_from_reg, because it will use
14112 the wrong mode, storing D regs only. */
14113 machine_mode mode
= TImode
;
14114 int off
, i
, vr_start
;
14116 /* Set OFF to the offset from virtual_incoming_args_rtx of
14117 the first vector register. The VR save area lies below
14118 the GR one, and is aligned to 16 bytes. */
14119 off
= -ROUND_UP (gr_saved
* UNITS_PER_WORD
,
14120 STACK_BOUNDARY
/ BITS_PER_UNIT
);
14121 off
-= vr_saved
* UNITS_PER_VREG
;
14123 vr_start
= V0_REGNUM
+ local_cum
.aapcs_nvrn
;
14124 for (i
= 0; i
< vr_saved
; ++i
)
14128 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
, off
);
14129 mem
= gen_frame_mem (mode
, ptr
);
14130 set_mem_alias_set (mem
, get_varargs_alias_set ());
14131 aarch64_emit_move (mem
, gen_rtx_REG (mode
, vr_start
+ i
));
14132 off
+= UNITS_PER_VREG
;
14137 /* We don't save the size into *PRETEND_SIZE because we want to avoid
14138 any complication of having crtl->args.pretend_args_size changed. */
14139 cfun
->machine
->frame
.saved_varargs_size
14140 = (ROUND_UP (gr_saved
* UNITS_PER_WORD
,
14141 STACK_BOUNDARY
/ BITS_PER_UNIT
)
14142 + vr_saved
* UNITS_PER_VREG
);
/* Implement TARGET_CONDITIONAL_REGISTER_USAGE.  Marks the FP/SIMD (V0-V31)
   and SVE predicate (P0-P15) registers as call-used (presumably fixed as
   well when the corresponding ISA is disabled — the guarding conditionals
   are among the lines lost by this fragmented extraction), and reserves two
   registers for speculation tracking when that is enabled.
   NOTE(review): code below kept byte-identical to the lossy extract.  */
14146 aarch64_conditional_register_usage (void)
14151 for (i
= V0_REGNUM
; i
<= V31_REGNUM
; i
++)
14154 call_used_regs
[i
] = 1;
14158 for (i
= P0_REGNUM
; i
<= P15_REGNUM
; i
++)
14161 call_used_regs
[i
] = 1;
14164 /* When tracking speculation, we need a couple of call-clobbered registers
14165 to track the speculation state. It would be nice to just use
14166 IP0 and IP1, but currently there are numerous places that just
14167 assume these registers are free for other uses (eg pointer
14168 authentication). */
14169 if (aarch64_track_speculation
)
14171 fixed_regs
[SPECULATION_TRACKER_REGNUM
] = 1;
14172 call_used_regs
[SPECULATION_TRACKER_REGNUM
] = 1;
14173 fixed_regs
[SPECULATION_SCRATCH_REGNUM
] = 1;
14174 call_used_regs
[SPECULATION_SCRATCH_REGNUM
] = 1;
14178 /* Walk down the type tree of TYPE counting consecutive base elements.
14179 If *MODEP is VOIDmode, then set it to the first valid floating point
14180 type. If a non-floating point type is found, or if a floating point
14181 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
14182 otherwise return the count in the sub-tree. */
/* NOTE(review): fragmented extract — the switch's case labels, returns and
   braces are among the missing lines; visible arms handle REAL_TYPE,
   COMPLEX_TYPE, VECTOR_TYPE, ARRAY_TYPE, RECORD_TYPE and UNION/QUAL_UNION.
   Code kept byte-identical.  */
14184 aapcs_vfp_sub_candidate (const_tree type
, machine_mode
*modep
)
14187 HOST_WIDE_INT size
;
14189 switch (TREE_CODE (type
))
/* Scalar float: only SF/DF/TF/HF modes qualify as base elements.  */
14192 mode
= TYPE_MODE (type
);
14193 if (mode
!= DFmode
&& mode
!= SFmode
14194 && mode
!= TFmode
&& mode
!= HFmode
)
14197 if (*modep
== VOIDmode
)
14200 if (*modep
== mode
)
/* Complex float: counts as two elements of the component mode.  */
14206 mode
= TYPE_MODE (TREE_TYPE (type
));
14207 if (mode
!= DFmode
&& mode
!= SFmode
14208 && mode
!= TFmode
&& mode
!= HFmode
)
14211 if (*modep
== VOIDmode
)
14214 if (*modep
== mode
)
14220 /* Use V2SImode and V4SImode as representatives of all 64-bit
14221 and 128-bit vector types. */
14222 size
= int_size_in_bytes (type
);
14235 if (*modep
== VOIDmode
)
14238 /* Vector modes are considered to be opaque: two vectors are
14239 equivalent for the purposes of being homogeneous aggregates
14240 if they are the same size. */
14241 if (*modep
== mode
)
/* Array: element count times the array's extent, with no padding.  */
14249 tree index
= TYPE_DOMAIN (type
);
14251 /* Can't handle incomplete types nor sizes that are not
14253 if (!COMPLETE_TYPE_P (type
)
14254 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
14257 count
= aapcs_vfp_sub_candidate (TREE_TYPE (type
), modep
);
14260 || !TYPE_MAX_VALUE (index
)
14261 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index
))
14262 || !TYPE_MIN_VALUE (index
)
14263 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index
))
14267 count
*= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index
))
14268 - tree_to_uhwi (TYPE_MIN_VALUE (index
)));
14270 /* There must be no padding. */
14271 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
14272 count
* GET_MODE_BITSIZE (*modep
)))
/* Record: sum of the field counts, with no padding.  */
14284 /* Can't handle incomplete types nor sizes that are not
14286 if (!COMPLETE_TYPE_P (type
)
14287 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
14290 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
14292 if (TREE_CODE (field
) != FIELD_DECL
)
14295 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
14298 count
+= sub_count
;
14301 /* There must be no padding. */
14302 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
14303 count
* GET_MODE_BITSIZE (*modep
)))
14310 case QUAL_UNION_TYPE
:
14312 /* These aren't very interesting except in a degenerate case. */
/* Union: the maximum field count, with no padding.  */
14317 /* Can't handle incomplete types nor sizes that are not
14319 if (!COMPLETE_TYPE_P (type
)
14320 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
14323 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
14325 if (TREE_CODE (field
) != FIELD_DECL
)
14328 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
14331 count
= count
> sub_count
? count
: sub_count
;
14334 /* There must be no padding. */
14335 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
14336 count
* GET_MODE_BITSIZE (*modep
)))
14349 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
14350 type as described in AAPCS64 \S 4.1.2.
14352 See the comment above aarch64_composite_type_p for the notes on MODE. */
/* NOTE(review): fragmented extract — kept byte-identical.  A short vector is
   exactly 8 or 16 bytes; the size comes from TYPE when it is a VECTOR_TYPE,
   otherwise from MODE when MODE is an integer or float vector mode.  */
14355 aarch64_short_vector_p (const_tree type
,
14358 poly_int64 size
= -1;
14360 if (type
&& TREE_CODE (type
) == VECTOR_TYPE
)
14361 size
= int_size_in_bytes (type
);
14362 else if (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
14363 || GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
)
14364 size
= GET_MODE_SIZE (mode
);
14366 return known_eq (size
, 8) || known_eq (size
, 16);
14369 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
14370 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
14371 array types. The C99 floating-point complex types are also considered
14372 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
14373 types, which are GCC extensions and out of the scope of AAPCS64, are
14374 treated as composite types here as well.
14376 Note that MODE itself is not sufficient in determining whether a type
14377 is such a composite type or not. This is because
14378 stor-layout.c:compute_record_mode may have already changed the MODE
14379 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
14380 structure with only one field may have its MODE set to the mode of the
14381 field. Also an integer mode whose size matches the size of the
14382 RECORD_TYPE type may be used to substitute the original mode
14383 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
14384 solely relied on. */
/* NOTE(review): fragmented extract — the return statements between the
   conditionals are among the missing lines; code kept byte-identical.  */
14387 aarch64_composite_type_p (const_tree type
,
/* Short vectors are explicitly NOT composite, checked first.  */
14390 if (aarch64_short_vector_p (type
, mode
))
14393 if (type
&& (AGGREGATE_TYPE_P (type
) || TREE_CODE (type
) == COMPLEX_TYPE
))
14396 if (mode
== BLKmode
14397 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
14398 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_INT
)
14404 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
14405 shall be passed or returned in simd/fp register(s) (providing these
14406 parameter passing registers are available).
14408 Upon successful return, *COUNT returns the number of needed registers,
14409 *BASE_MODE returns the mode of the individual register and when IS_HAF
14410 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
14411 floating-point aggregate or a homogeneous short-vector aggregate. */
/* NOTE(review): fragmented extract — the *count assignments and returns are
   among the missing lines; code kept byte-identical.  */
14414 aarch64_vfp_is_call_or_return_candidate (machine_mode mode
,
14416 machine_mode
*base_mode
,
14420 machine_mode new_mode
= VOIDmode
;
14421 bool composite_p
= aarch64_composite_type_p (type
, mode
);
14423 if (is_ha
!= NULL
) *is_ha
= false;
/* Case 1: scalar float or short vector — one register.  */
14425 if ((!composite_p
&& GET_MODE_CLASS (mode
) == MODE_FLOAT
)
14426 || aarch64_short_vector_p (type
, mode
))
/* Case 2: complex float — two registers of the inner mode, an HFA.  */
14431 else if (GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
)
14433 if (is_ha
!= NULL
) *is_ha
= true;
14435 new_mode
= GET_MODE_INNER (mode
);
/* Case 3: composite — candidate iff aapcs_vfp_sub_candidate says the
   aggregate is homogeneous with 1..HA_MAX_NUM_FLDS elements.  */
14437 else if (type
&& composite_p
)
14439 int ag_count
= aapcs_vfp_sub_candidate (type
, &new_mode
);
14441 if (ag_count
> 0 && ag_count
<= HA_MAX_NUM_FLDS
)
14443 if (is_ha
!= NULL
) *is_ha
= true;
14452 *base_mode
= new_mode
;
14456 /* Implement TARGET_STRUCT_VALUE_RTX. */
/* Returns the register used to pass the address of a returned aggregate
   (x8 on AArch64, via AARCH64_STRUCT_VALUE_REGNUM).  Kept byte-identical;
   braces/return type are among the lines lost by the extraction.  */
14459 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED
,
14460 int incoming ATTRIBUTE_UNUSED
)
14462 return gen_rtx_REG (Pmode
, AARCH64_STRUCT_VALUE_REGNUM
);
14465 /* Implements target hook vector_mode_supported_p. */
/* A vector mode is supported iff it classifies as some vector kind and is
   not a structure-of-vectors (VEC_STRUCT) mode.  Kept byte-identical.  */
14467 aarch64_vector_mode_supported_p (machine_mode mode
)
14469 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
14470 return vec_flags
!= 0 && (vec_flags
& VEC_STRUCT
) == 0;
14473 /* Return the full-width SVE vector mode for element mode MODE, if one
/* NOTE(review): almost the entire body (the switch over element modes) is
   missing from this extract; only the QImode result and the empty-opt
   fallthrough survive.  Kept byte-identical.  */
14476 aarch64_full_sve_mode (scalar_mode mode
)
14493 return VNx16QImode
;
14495 return opt_machine_mode ();
14499 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
/* NOTE(review): the body's switch over element modes is missing from this
   extract; only the empty-opt fallthrough survives.  Kept byte-identical.  */
14502 aarch64_vq_mode (scalar_mode mode
)
14521 return opt_machine_mode ();
14525 /* Return appropriate SIMD container
14526 for MODE within a vector of WIDTH bits. */
/* NOTE(review): fragmented extract; the 64-bit (!=128) branch is among the
   missing lines.  Falls back to word_mode when no vector mode exists.  */
14527 static machine_mode
14528 aarch64_simd_container_mode (scalar_mode mode
, poly_int64 width
)
14530 if (TARGET_SVE
&& known_eq (width
, BITS_PER_SVE_VECTOR
))
14531 return aarch64_full_sve_mode (mode
).else_mode (word_mode
);
14533 gcc_assert (known_eq (width
, 64) || known_eq (width
, 128));
14536 if (known_eq (width
, 128))
14537 return aarch64_vq_mode (mode
).else_mode (word_mode
);
14558 /* Return 128-bit container as the preferred SIMD mode for MODE. */
/* Implements TARGET_VECTORIZE_PREFERRED_SIMD_MODE: prefers the full SVE
   vector width when SVE is enabled, otherwise 128-bit Advanced SIMD.  */
14559 static machine_mode
14560 aarch64_preferred_simd_mode (scalar_mode mode
)
14562 poly_int64 bits
= TARGET_SVE
? BITS_PER_SVE_VECTOR
: 128;
14563 return aarch64_simd_container_mode (mode
, bits
);
14566 /* Return a list of possible vector sizes for the vectorizer
14567 to iterate over. */
/* NOTE(review): the SVE guard around the first push is among the missing
   lines — presumably `if (TARGET_SVE)`; confirm upstream.  The vectorizer
   then tries 16-byte and 8-byte Advanced SIMD.  Kept byte-identical.  */
14569 aarch64_autovectorize_vector_sizes (vector_sizes
*sizes
, bool)
14572 sizes
->safe_push (BYTES_PER_SVE_VECTOR
);
14573 sizes
->safe_push (16);
14574 sizes
->safe_push (8);
14577 /* Implement TARGET_MANGLE_TYPE. */
/* C++ name mangling for AArch64-specific types: va_list mangles as
   std::__va_list; __fp16 and other builtin vector types get their own
   mangling (the "Dh" return for 16-bit REAL_TYPE is among the lines this
   extract lost).  Returns NULL to request default mangling.  */
14579 static const char *
14580 aarch64_mangle_type (const_tree type
)
14582 /* The AArch64 ABI documents say that "__va_list" has to be
14583 mangled as if it is in the "std" namespace. */
14584 if (lang_hooks
.types_compatible_p (CONST_CAST_TREE (type
), va_list_type
))
14585 return "St9__va_list";
14587 /* Half-precision float. */
14588 if (TREE_CODE (type
) == REAL_TYPE
&& TYPE_PRECISION (type
) == 16)
14591 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
14593 if (TYPE_NAME (type
) != NULL
)
14594 return aarch64_mangle_builtin_type (type
);
14596 /* Use the default mangling. */
14600 /* Find the first rtx_insn before insn that will generate an assembly
/* Walks backwards with prev_real_insn until an insn recognized by
   recog_memoized is found (or NULL).  NOTE(review): the surrounding
   do/return lines are among those lost by the extraction.  */
14604 aarch64_prev_real_insn (rtx_insn
*insn
)
14611 insn
= prev_real_insn (insn
);
14613 while (insn
&& recog_memoized (insn
) < 0);
14619 is_madd_op (enum attr_type t1
)
14622 /* A number of these may be AArch32 only. */
14623 enum attr_type mlatypes
[] = {
14624 TYPE_MLA
, TYPE_MLAS
, TYPE_SMLAD
, TYPE_SMLADX
, TYPE_SMLAL
, TYPE_SMLALD
,
14625 TYPE_SMLALS
, TYPE_SMLALXY
, TYPE_SMLAWX
, TYPE_SMLAWY
, TYPE_SMLAXY
,
14626 TYPE_SMMLA
, TYPE_UMLAL
, TYPE_UMLALS
,TYPE_SMLSD
, TYPE_SMLSDX
, TYPE_SMLSLD
14629 for (i
= 0; i
< sizeof (mlatypes
) / sizeof (enum attr_type
); i
++)
14631 if (t1
== mlatypes
[i
])
14638 /* Check if there is a register dependency between a load and the insn
14639 for which we hold recog_data. */
/* MEMOP must be a SET; true iff its destination register overlaps any
   operand of the insn currently in recog_data.  NOTE(review): the return
   statements are among the lines lost by the extraction.  */
14642 dep_between_memop_and_curr (rtx memop
)
14647 gcc_assert (GET_CODE (memop
) == SET
);
14649 if (!REG_P (SET_DEST (memop
)))
14652 load_reg
= SET_DEST (memop
);
/* Operand 0 is the madd's destination; only source operands matter.  */
14653 for (opno
= 1; opno
< recog_data
.n_operands
; opno
++)
14655 rtx operand
= recog_data
.operand
[opno
];
14656 if (REG_P (operand
)
14657 && reg_overlap_mentioned_p (load_reg
, operand
))
14665 /* When working around the Cortex-A53 erratum 835769,
14666 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
14667 instruction and has a preceding memory instruction such that a NOP
14668 should be inserted between them. */
/* NOTE(review): fragmented extract — the early `return false` lines are
   among the missing lines; code kept byte-identical.  */
14671 aarch64_madd_needs_nop (rtx_insn
* insn
)
14673 enum attr_type attr_type
;
14677 if (!TARGET_FIX_ERR_A53_835769
)
14680 if (!INSN_P (insn
) || recog_memoized (insn
) < 0)
14683 attr_type
= get_attr_type (insn
);
14684 if (!is_madd_op (attr_type
))
14687 prev
= aarch64_prev_real_insn (insn
);
14688 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
14689 Restore recog state to INSN to avoid state corruption. */
14690 extract_constrain_insn_cached (insn
);
14692 if (!prev
|| !contains_mem_rtx_p (PATTERN (prev
)))
14695 body
= single_set (prev
);
14697 /* If the previous insn is a memory op and there is no dependency between
14698 it and the DImode madd, emit a NOP between them. If body is NULL then we
14699 have a complex memory operation, probably a load/store pair.
14700 Be conservative for now and emit a NOP. */
14701 if (GET_MODE (recog_data
.operand
[0]) == DImode
14702 && (!body
|| !dep_between_memop_and_curr (body
)))
14710 /* Implement FINAL_PRESCAN_INSN. */
/* Emits the erratum-835769 workaround NOP directly into the assembly
   stream just before a flagged multiply-accumulate.  Kept byte-identical.  */
14713 aarch64_final_prescan_insn (rtx_insn
*insn
)
14715 if (aarch64_madd_needs_nop (insn
))
14716 fprintf (asm_out_file
, "\tnop // between mem op and mult-accumulate\n");
14720 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
/* INDEX takes 5-bit signed immediates for both base and step: [-16, 15].  */
14724 aarch64_sve_index_immediate_p (rtx base_or_step
)
14726 return (CONST_INT_P (base_or_step
)
14727 && IN_RANGE (INTVAL (base_or_step
), -16, 15));
14730 /* Return true if X is a valid immediate for the SVE ADD and SUB
14731 instructions. Negate X first if NEGATE_P is true. */
/* NOTE(review): fragmented extract — the negate step and the shifted-form
   condition between the two final ranges are among the missing lines.
   Valid forms: an unsigned 8-bit value, or one shifted left by 8.  */
14734 aarch64_sve_arith_immediate_p (rtx x
, bool negate_p
)
14738 if (!const_vec_duplicate_p (x
, &elt
)
14739 || !CONST_INT_P (elt
))
14742 HOST_WIDE_INT val
= INTVAL (elt
);
/* Reduce to the element's width so negation wraps correctly.  */
14745 val
&= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x
)));
14748 return IN_RANGE (val
, 0, 0xff);
14749 return IN_RANGE (val
, 0, 0xff00);
14752 /* Return true if X is a valid immediate operand for an SVE logical
14753 instruction such as AND. */
/* Must be a duplicated CONST_INT that forms a valid bitmask immediate at
   the element width (same encoding as scalar AND/ORR/EOR immediates).  */
14756 aarch64_sve_bitmask_immediate_p (rtx x
)
14760 return (const_vec_duplicate_p (x
, &elt
)
14761 && CONST_INT_P (elt
)
14762 && aarch64_bitmask_imm (INTVAL (elt
),
14763 GET_MODE_INNER (GET_MODE (x
))));
14766 /* Return true if X is a valid immediate for the SVE DUP and CPY
/* NOTE(review): the condition selecting between the two final ranges
   (plain signed byte vs. byte shifted left by 8) is among the missing
   lines of this extract.  Kept byte-identical.  */
14770 aarch64_sve_dup_immediate_p (rtx x
)
14774 if (!const_vec_duplicate_p (x
, &elt
)
14775 || !CONST_INT_P (elt
))
14778 HOST_WIDE_INT val
= INTVAL (elt
);
14780 return IN_RANGE (val
, -0x80, 0x7f);
14781 return IN_RANGE (val
, -0x8000, 0x7f00);
14784 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
14785 SIGNED_P says whether the operand is signed rather than unsigned. */
/* Signed compares accept [-16, 15]; unsigned compares accept [0, 127].  */
14788 aarch64_sve_cmp_immediate_p (rtx x
, bool signed_p
)
14792 return (const_vec_duplicate_p (x
, &elt
)
14793 && CONST_INT_P (elt
)
14795 ? IN_RANGE (INTVAL (elt
), -16, 15)
14796 : IN_RANGE (INTVAL (elt
), 0, 127)));
14799 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
14800 instruction. Negate X first if NEGATE_P is true. */
/* Only the values 1.0 and 0.5 are encodable.  NOTE(review): the guards
   around the negate and the return statements are among the lines this
   extract lost; kept byte-identical.  */
14803 aarch64_sve_float_arith_immediate_p (rtx x
, bool negate_p
)
14808 if (!const_vec_duplicate_p (x
, &elt
)
14809 || GET_CODE (elt
) != CONST_DOUBLE
)
14812 r
= *CONST_DOUBLE_REAL_VALUE (elt
);
14815 r
= real_value_negate (&r
);
14817 if (real_equal (&r
, &dconst1
))
14819 if (real_equal (&r
, &dconsthalf
))
14824 /* Return true if X is a valid immediate operand for an SVE FMUL
/* Only 0.5 is accepted here (2.0 is also encodable but never generated).  */
14828 aarch64_sve_float_mul_immediate_p (rtx x
)
14832 /* GCC will never generate a multiply with an immediate of 2, so there is no
14833 point testing for it (even though it is a valid constant). */
14834 return (const_vec_duplicate_p (x
, &elt
)
14835 && GET_CODE (elt
) == CONST_DOUBLE
14836 && real_equal (CONST_DOUBLE_REAL_VALUE (elt
), &dconsthalf
));
14839 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
14840 for the Advanced SIMD operation described by WHICH and INSN. If INFO
14841 is nonnull, use it to describe valid immediates. */
/* NOTE(review): fragmented extract — the `return true` lines after each
   successful match and the null checks on INFO are among the missing
   lines; code kept byte-identical.  */
14843 aarch64_advsimd_valid_immediate_hs (unsigned int val32
,
14844 simd_immediate_info
*info
,
14845 enum simd_immediate_check which
,
14846 simd_immediate_info::insn_type insn
)
14848 /* Try a 4-byte immediate with LSL. */
14849 for (unsigned int shift
= 0; shift
< 32; shift
+= 8)
14850 if ((val32
& (0xff << shift
)) == val32
)
14853 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
14854 simd_immediate_info::LSL
, shift
);
14858 /* Try a 2-byte immediate with LSL. */
14859 unsigned int imm16
= val32
& 0xffff;
14860 if (imm16
== (val32
>> 16))
14861 for (unsigned int shift
= 0; shift
< 16; shift
+= 8)
14862 if ((imm16
& (0xff << shift
)) == imm16
)
14865 *info
= simd_immediate_info (HImode
, imm16
>> shift
, insn
,
14866 simd_immediate_info::LSL
, shift
);
14870 /* Try a 4-byte immediate with MSL, except for cases that MVN
14872 if (which
== AARCH64_CHECK_MOV
)
14873 for (unsigned int shift
= 8; shift
< 24; shift
+= 8)
14875 unsigned int low
= (1 << shift
) - 1;
14876 if (((val32
& (0xff << shift
)) | low
) == val32
)
14879 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
14880 simd_immediate_info::MSL
, shift
);
14888 /* Return true if replicating VAL64 is a valid immediate for the
14889 Advanced SIMD operation described by WHICH. If INFO is nonnull,
14890 use it to describe valid immediates. */
/* NOTE(review): fragmented extract — the `return true/false` lines are
   among the missing lines; kept byte-identical.  Tries, in order: 32-bit
   replication (MOVI/MVNI forms), a replicated byte, and the 64-bit
   bit-to-bytemask form where every byte is 0x00 or 0xff.  */
14892 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64
,
14893 simd_immediate_info
*info
,
14894 enum simd_immediate_check which
)
14896 unsigned int val32
= val64
& 0xffffffff;
14897 unsigned int val16
= val64
& 0xffff;
14898 unsigned int val8
= val64
& 0xff;
14900 if (val32
== (val64
>> 32))
14902 if ((which
& AARCH64_CHECK_ORR
) != 0
14903 && aarch64_advsimd_valid_immediate_hs (val32
, info
, which
,
14904 simd_immediate_info::MOV
))
14907 if ((which
& AARCH64_CHECK_BIC
) != 0
14908 && aarch64_advsimd_valid_immediate_hs (~val32
, info
, which
,
14909 simd_immediate_info::MVN
))
14912 /* Try using a replicated byte. */
14913 if (which
== AARCH64_CHECK_MOV
14914 && val16
== (val32
>> 16)
14915 && val8
== (val16
>> 8))
14918 *info
= simd_immediate_info (QImode
, val8
);
14923 /* Try using a bit-to-bytemask. */
14924 if (which
== AARCH64_CHECK_MOV
)
14927 for (i
= 0; i
< 64; i
+= 8)
14929 unsigned char byte
= (val64
>> i
) & 0xff;
14930 if (byte
!= 0 && byte
!= 0xff)
14936 *info
= simd_immediate_info (DImode
, val64
);
14943 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
14944 instruction. If INFO is nonnull, use it to describe valid immediates. */
/* NOTE(review): fragmented extract — the narrowing of MODE to SI/HI/QImode
   inside the replication checks and the `return true/false` lines are
   among the missing lines; kept byte-identical.  */
14947 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64
,
14948 simd_immediate_info
*info
)
14950 scalar_int_mode mode
= DImode
;
/* Narrow MODE for each level of replication that holds.  */
14951 unsigned int val32
= val64
& 0xffffffff;
14952 if (val32
== (val64
>> 32))
14955 unsigned int val16
= val32
& 0xffff;
14956 if (val16
== (val32
>> 16))
14959 unsigned int val8
= val16
& 0xff;
14960 if (val8
== (val16
>> 8))
14964 HOST_WIDE_INT val
= trunc_int_for_mode (val64
, mode
);
14965 if (IN_RANGE (val
, -0x80, 0x7f))
14967 /* DUP with no shift. */
14969 *info
= simd_immediate_info (mode
, val
);
14972 if ((val
& 0xff) == 0 && IN_RANGE (val
, -0x8000, 0x7f00))
14974 /* DUP with LSL #8. */
14976 *info
= simd_immediate_info (mode
, val
);
/* Fall back to the logical-immediate (DUPM) encoding.  */
14979 if (aarch64_bitmask_imm (val64
, mode
))
14983 *info
= simd_immediate_info (mode
, val
);
14989 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
14990 it to describe valid immediates. */
/* NOTE(review): fragmented extract — the `return true/false` lines are
   among the missing lines; kept byte-identical.  Accepts the all-false
   predicate, and partial PTRUEs that match a named SVE pattern.  */
14993 aarch64_sve_pred_valid_immediate (rtx x
, simd_immediate_info
*info
)
14995 if (x
== CONST0_RTX (GET_MODE (x
)))
14998 *info
= simd_immediate_info (DImode
, 0);
15002 /* Analyze the value as a VNx16BImode. This should be relatively
15003 efficient, since rtx_vector_builder has enough built-in capacity
15004 to store all VLA predicate constants without needing the heap. */
15005 rtx_vector_builder builder
;
15006 if (!aarch64_get_sve_pred_bits (builder
, x
))
15009 unsigned int elt_size
= aarch64_widest_sve_pred_elt_size (builder
);
15010 if (int vl
= aarch64_partial_ptrue_length (builder
, elt_size
))
15012 machine_mode mode
= aarch64_sve_pred_mode (elt_size
).require ();
15013 aarch64_svpattern pattern
= aarch64_svpattern_for_vl (mode
, vl
);
15014 if (pattern
!= AARCH64_NUM_SVPATTERNS
)
15018 scalar_int_mode int_mode
= aarch64_sve_element_int_mode (mode
);
15019 *info
= simd_immediate_info (int_mode
, pattern
);
15027 /* Return true if OP is a valid SIMD immediate for the operation
15028 described by WHICH. If INFO is nonnull, use it to describe valid
/* NOTE(review): fragmented extract — early `return false` lines, the
   declarations of BASE/STEP, and some guards are among the missing lines;
   kept byte-identical.  Dispatches predicates to the SVE-predicate helper,
   handles INDEX-style series and duplicated float constants specially,
   then flattens the constant to a repeating 64-bit value and defers to the
   SVE or Advanced SIMD immediate checkers.  */
15031 aarch64_simd_valid_immediate (rtx op
, simd_immediate_info
*info
,
15032 enum simd_immediate_check which
)
15034 machine_mode mode
= GET_MODE (op
);
15035 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
15036 if (vec_flags
== 0 || vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
15039 if (vec_flags
& VEC_SVE_PRED
)
15040 return aarch64_sve_pred_valid_immediate (op
, info
);
15042 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
15044 unsigned int n_elts
;
15045 if (GET_CODE (op
) == CONST_VECTOR
15046 && CONST_VECTOR_DUPLICATE_P (op
))
15047 n_elts
= CONST_VECTOR_NPATTERNS (op
);
15048 else if ((vec_flags
& VEC_SVE_DATA
)
15049 && const_vec_series_p (op
, &base
, &step
))
15051 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
);
15052 if (!aarch64_sve_index_immediate_p (base
)
15053 || !aarch64_sve_index_immediate_p (step
))
15057 *info
= simd_immediate_info (elt_mode
, base
, step
);
15060 else if (GET_CODE (op
) == CONST_VECTOR
15061 && CONST_VECTOR_NUNITS (op
).is_constant (&n_elts
))
15062 /* N_ELTS set above. */;
15066 scalar_float_mode elt_float_mode
;
15068 && is_a
<scalar_float_mode
> (elt_mode
, &elt_float_mode
))
15070 rtx elt
= CONST_VECTOR_ENCODED_ELT (op
, 0);
15071 if (aarch64_float_const_zero_rtx_p (elt
)
15072 || aarch64_float_const_representable_p (elt
))
15075 *info
= simd_immediate_info (elt_float_mode
, elt
);
15080 unsigned int elt_size
= GET_MODE_SIZE (elt_mode
);
15084 scalar_int_mode elt_int_mode
= int_mode_for_mode (elt_mode
).require ();
15086 /* Expand the vector constant out into a byte vector, with the least
15087 significant byte of the register first. */
15088 auto_vec
<unsigned char, 16> bytes
;
15089 bytes
.reserve (n_elts
* elt_size
);
15090 for (unsigned int i
= 0; i
< n_elts
; i
++)
15092 /* The vector is provided in gcc endian-neutral fashion.
15093 For aarch64_be Advanced SIMD, it must be laid out in the vector
15094 register in reverse order. */
15095 bool swap_p
= ((vec_flags
& VEC_ADVSIMD
) != 0 && BYTES_BIG_ENDIAN
);
15096 rtx elt
= CONST_VECTOR_ELT (op
, swap_p
? (n_elts
- 1 - i
) : i
);
15098 if (elt_mode
!= elt_int_mode
)
15099 elt
= gen_lowpart (elt_int_mode
, elt
);
15101 if (!CONST_INT_P (elt
))
15104 unsigned HOST_WIDE_INT elt_val
= INTVAL (elt
);
15105 for (unsigned int byte
= 0; byte
< elt_size
; byte
++)
15107 bytes
.quick_push (elt_val
& 0xff);
15108 elt_val
>>= BITS_PER_UNIT
;
15112 /* The immediate must repeat every eight bytes. */
15113 unsigned int nbytes
= bytes
.length ();
15114 for (unsigned i
= 8; i
< nbytes
; ++i
)
15115 if (bytes
[i
] != bytes
[i
- 8])
15118 /* Get the repeating 8-byte value as an integer. No endian correction
15119 is needed here because bytes is already in lsb-first order. */
15120 unsigned HOST_WIDE_INT val64
= 0;
15121 for (unsigned int i
= 0; i
< 8; i
++)
15122 val64
|= ((unsigned HOST_WIDE_INT
) bytes
[i
% nbytes
]
15123 << (i
* BITS_PER_UNIT
));
15125 if (vec_flags
& VEC_SVE_DATA
)
15126 return aarch64_sve_valid_immediate (val64
, info
);
15128 return aarch64_advsimd_valid_immediate (val64
, info
, which
);
15131 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
15132 has a step in the range of INDEX. Return the index expression if so,
15133 otherwise return null. */
/* NOTE(review): the `return step;` / `return NULL_RTX;` lines are among
   the missing lines of this extract; kept byte-identical.  */
15135 aarch64_check_zero_based_sve_index_immediate (rtx x
)
15138 if (const_vec_series_p (x
, &base
, &step
)
15139 && base
== const0_rtx
15140 && aarch64_sve_index_immediate_p (step
))
15145 /* Check of immediate shift constants are within range. */
/* Left shifts allow [0, width-1]; right shifts allow [1, width] (the
   selecting conditional is among the lines this extract lost).  */
15147 aarch64_simd_shift_imm_p (rtx x
, machine_mode mode
, bool left
)
15149 int bit_width
= GET_MODE_UNIT_SIZE (mode
) * BITS_PER_UNIT
;
15151 return aarch64_const_vec_all_same_in_range_p (x
, 0, bit_width
- 1);
15153 return aarch64_const_vec_all_same_in_range_p (x
, 1, bit_width
);
15156 /* Return the bitmask CONST_INT to select the bits required by a zero extract
15157 operation of width WIDTH at bit position POS. */
/* Builds ((1 << width) - 1) << pos as a CONST_INT.  */
15160 aarch64_mask_from_zextract_ops (rtx width
, rtx pos
)
15162 gcc_assert (CONST_INT_P (width
));
15163 gcc_assert (CONST_INT_P (pos
));
15165 unsigned HOST_WIDE_INT mask
15166 = ((unsigned HOST_WIDE_INT
) 1 << UINTVAL (width
)) - 1;
15167 return GEN_INT (mask
<< UINTVAL (pos
));
/* Return true if X is a legitimate operand for a MOV of mode MODE:
   HIGH of a valid symbol, CONST_INT, valid SIMD immediate for vector
   modes, constant-address SYMBOL_REF in DImode, SVE CNT immediate, or a
   tiny-absolute symbolic expression.  NOTE(review): fragmented extract —
   the `return true` lines after each accepted case are among the missing
   lines; kept byte-identical.  */
15171 aarch64_mov_operand_p (rtx x
, machine_mode mode
)
15173 if (GET_CODE (x
) == HIGH
15174 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
15177 if (CONST_INT_P (x
))
15180 if (VECTOR_MODE_P (GET_MODE (x
)))
15181 return aarch64_simd_valid_immediate (x
, NULL
);
15183 if (GET_CODE (x
) == SYMBOL_REF
&& mode
== DImode
&& CONSTANT_ADDRESS_P (x
))
15186 if (aarch64_sve_cnt_immediate_p (x
))
15189 return aarch64_classify_symbolic_expression (x
)
15190 == SYMBOL_TINY_ABSOLUTE
;
15193 /* Return a const_int vector of VAL. */
/* Duplicates VAL (truncated to the element mode) across all lanes of MODE.  */
15195 aarch64_simd_gen_const_vector_dup (machine_mode mode
, HOST_WIDE_INT val
)
15197 rtx c
= gen_int_mode (val
, GET_MODE_INNER (mode
));
15198 return gen_const_vec_duplicate (mode
, c
);
15201 /* Check OP is a legal scalar immediate for the MOVI instruction. */
/* Tests by duplicating OP across a 64-bit vector container and reusing
   the vector immediate checker.  */
15204 aarch64_simd_scalar_immediate_valid_for_move (rtx op
, scalar_int_mode mode
)
15206 machine_mode vmode
;
15208 vmode
= aarch64_simd_container_mode (mode
, 64);
15209 rtx op_v
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (op
));
15210 return aarch64_simd_valid_immediate (op_v
, NULL
);
15213 /* Construct and return a PARALLEL RTX vector with elements numbering the
15214 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
15215 the vector - from the perspective of the architecture. This does not
15216 line up with GCC's perspective on lane numbers, so we end up with
15217 different masks depending on our target endian-ness. The diagram
15218 below may help. We must draw the distinction when building masks
15219 which select one half of the vector. An instruction selecting
15220 architectural low-lanes for a big-endian target, must be described using
15221 a mask selecting GCC high-lanes.
15223 Big-Endian Little-Endian
15225 GCC 0 1 2 3 3 2 1 0
15226 | x | x | x | x | | x | x | x | x |
15227 Architecture 3 2 1 0 3 2 1 0
15229 Low Mask: { 2, 3 } { 0, 1 }
15230 High Mask: { 0, 1 } { 2, 3 }
15232 MODE Is the mode of the vector and NUNITS is the number of units in it. */
/* NOTE(review): fragmented extract — the declaration of LOW_BASE and the
   final `return t1;` are among the missing lines; kept byte-identical.  */
15235 aarch64_simd_vect_par_cnst_half (machine_mode mode
, int nunits
, bool high
)
15237 rtvec v
= rtvec_alloc (nunits
/ 2);
15238 int high_base
= nunits
/ 2;
/* On big-endian the architectural half is the opposite GCC half.  */
15244 if (BYTES_BIG_ENDIAN
)
15245 base
= high
? low_base
: high_base
;
15247 base
= high
? high_base
: low_base
;
15249 for (i
= 0; i
< nunits
/ 2; i
++)
15250 RTVEC_ELT (v
, i
) = GEN_INT (base
+ i
);
15252 t1
= gen_rtx_PARALLEL (mode
, v
);
15256 /* Check OP for validity as a PARALLEL RTX vector with elements
15257 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
15258 from the perspective of the architecture. See the diagram above
15259 aarch64_simd_vect_par_cnst_half for more details. */
15262 aarch64_simd_check_vect_par_cnst_half (rtx op
, machine_mode mode
,
15266 if (!VECTOR_MODE_P (mode
) || !GET_MODE_NUNITS (mode
).is_constant (&nelts
))
15269 rtx ideal
= aarch64_simd_vect_par_cnst_half (mode
, nelts
, high
);
15270 HOST_WIDE_INT count_op
= XVECLEN (op
, 0);
15271 HOST_WIDE_INT count_ideal
= XVECLEN (ideal
, 0);
15274 if (count_op
!= count_ideal
)
15277 for (i
= 0; i
< count_ideal
; i
++)
15279 rtx elt_op
= XVECEXP (op
, 0, i
);
15280 rtx elt_ideal
= XVECEXP (ideal
, 0, i
);
15282 if (!CONST_INT_P (elt_op
)
15283 || INTVAL (elt_ideal
) != INTVAL (elt_op
))
15289 /* Return a PARALLEL containing NELTS elements, with element I equal
15290 to BASE + I * STEP. */
15293 aarch64_gen_stepped_int_parallel (unsigned int nelts
, int base
, int step
)
15295 rtvec vec
= rtvec_alloc (nelts
);
15296 for (unsigned int i
= 0; i
< nelts
; ++i
)
15297 RTVEC_ELT (vec
, i
) = gen_int_mode (base
+ i
* step
, DImode
);
15298 return gen_rtx_PARALLEL (VOIDmode
, vec
);
15301 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
15302 series with step STEP. */
15305 aarch64_stepped_int_parallel_p (rtx op
, int step
)
15307 if (GET_CODE (op
) != PARALLEL
|| !CONST_INT_P (XVECEXP (op
, 0, 0)))
15310 unsigned HOST_WIDE_INT base
= UINTVAL (XVECEXP (op
, 0, 0));
15311 for (int i
= 1; i
< XVECLEN (op
, 0); ++i
)
15312 if (!CONST_INT_P (XVECEXP (op
, 0, i
))
15313 || UINTVAL (XVECEXP (op
, 0, i
)) != base
+ i
* step
)
15319 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
15320 HIGH (exclusive). */
15322 aarch64_simd_lane_bounds (rtx operand
, HOST_WIDE_INT low
, HOST_WIDE_INT high
,
15325 HOST_WIDE_INT lane
;
15326 gcc_assert (CONST_INT_P (operand
));
15327 lane
= INTVAL (operand
);
15329 if (lane
< low
|| lane
>= high
)
15332 error ("%Klane %wd out of range %wd - %wd", exp
, lane
, low
, high
- 1);
15334 error ("lane %wd out of range %wd - %wd", lane
, low
, high
- 1);
15338 /* Peform endian correction on lane number N, which indexes a vector
15339 of mode MODE, and return the result as an SImode rtx. */
15342 aarch64_endian_lane_rtx (machine_mode mode
, unsigned int n
)
15344 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode
), n
), SImode
);
15347 /* Return TRUE if OP is a valid vector addressing mode. */
15350 aarch64_simd_mem_operand_p (rtx op
)
15352 return MEM_P (op
) && (GET_CODE (XEXP (op
, 0)) == POST_INC
15353 || REG_P (XEXP (op
, 0)));
15356 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
15359 aarch64_sve_ld1r_operand_p (rtx op
)
15361 struct aarch64_address_info addr
;
15365 && is_a
<scalar_mode
> (GET_MODE (op
), &mode
)
15366 && aarch64_classify_address (&addr
, XEXP (op
, 0), mode
, false)
15367 && addr
.type
== ADDRESS_REG_IMM
15368 && offset_6bit_unsigned_scaled_p (mode
, addr
.const_offset
));
15371 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
15373 aarch64_sve_ld1rq_operand_p (rtx op
)
15375 struct aarch64_address_info addr
;
15376 scalar_mode elem_mode
= GET_MODE_INNER (GET_MODE (op
));
15378 || !aarch64_classify_address (&addr
, XEXP (op
, 0), elem_mode
, false))
15381 if (addr
.type
== ADDRESS_REG_IMM
)
15382 return offset_4bit_signed_scaled_p (TImode
, addr
.const_offset
);
15384 if (addr
.type
== ADDRESS_REG_REG
)
15385 return (1U << addr
.shift
) == GET_MODE_SIZE (elem_mode
);
15390 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
15391 The conditions for STR are the same. */
15393 aarch64_sve_ldr_operand_p (rtx op
)
15395 struct aarch64_address_info addr
;
15398 && aarch64_classify_address (&addr
, XEXP (op
, 0), GET_MODE (op
),
15399 false, ADDR_QUERY_ANY
)
15400 && addr
.type
== ADDRESS_REG_IMM
);
15403 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
15404 We need to be able to access the individual pieces, so the range
15405 is different from LD[234] and ST[234]. */
15407 aarch64_sve_struct_memory_operand_p (rtx op
)
15412 machine_mode mode
= GET_MODE (op
);
15413 struct aarch64_address_info addr
;
15414 if (!aarch64_classify_address (&addr
, XEXP (op
, 0), SVE_BYTE_MODE
, false,
15416 || addr
.type
!= ADDRESS_REG_IMM
)
15419 poly_int64 first
= addr
.const_offset
;
15420 poly_int64 last
= first
+ GET_MODE_SIZE (mode
) - BYTES_PER_SVE_VECTOR
;
15421 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, first
)
15422 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, last
));
15425 /* Emit a register copy from operand to operand, taking care not to
15426 early-clobber source registers in the process.
15428 COUNT is the number of components into which the copy needs to be
15431 aarch64_simd_emit_reg_reg_move (rtx
*operands
, machine_mode mode
,
15432 unsigned int count
)
15435 int rdest
= REGNO (operands
[0]);
15436 int rsrc
= REGNO (operands
[1]);
15438 if (!reg_overlap_mentioned_p (operands
[0], operands
[1])
15440 for (i
= 0; i
< count
; i
++)
15441 emit_move_insn (gen_rtx_REG (mode
, rdest
+ i
),
15442 gen_rtx_REG (mode
, rsrc
+ i
));
15444 for (i
= 0; i
< count
; i
++)
15445 emit_move_insn (gen_rtx_REG (mode
, rdest
+ count
- i
- 1),
15446 gen_rtx_REG (mode
, rsrc
+ count
- i
- 1));
15449 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
15450 one of VSTRUCT modes: OI, CI, or XI. */
15452 aarch64_simd_attr_length_rglist (machine_mode mode
)
15454 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
15455 return (GET_MODE_SIZE (mode
).to_constant () / UNITS_PER_VREG
) * 4;
15458 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
15459 alignment of a vector to 128 bits. SVE predicates have an alignment of
15461 static HOST_WIDE_INT
15462 aarch64_simd_vector_alignment (const_tree type
)
15464 if (TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
15465 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
15466 be set for non-predicate vectors of booleans. Modes are the most
15467 direct way we have of identifying real SVE predicate types. */
15468 return GET_MODE_CLASS (TYPE_MODE (type
)) == MODE_VECTOR_BOOL
? 16 : 128;
15469 return wi::umin (wi::to_wide (TYPE_SIZE (type
)), 128).to_uhwi ();
15472 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
15474 aarch64_vectorize_preferred_vector_alignment (const_tree type
)
15476 if (aarch64_sve_data_mode_p (TYPE_MODE (type
)))
15478 /* If the length of the vector is fixed, try to align to that length,
15479 otherwise don't try to align at all. */
15480 HOST_WIDE_INT result
;
15481 if (!BITS_PER_SVE_VECTOR
.is_constant (&result
))
15482 result
= TYPE_ALIGN (TREE_TYPE (type
));
15485 return TYPE_ALIGN (type
);
15488 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
15490 aarch64_simd_vector_alignment_reachable (const_tree type
, bool is_packed
)
15495 /* For fixed-length vectors, check that the vectorizer will aim for
15496 full-vector alignment. This isn't true for generic GCC vectors
15497 that are wider than the ABI maximum of 128 bits. */
15498 poly_uint64 preferred_alignment
=
15499 aarch64_vectorize_preferred_vector_alignment (type
);
15500 if (TREE_CODE (TYPE_SIZE (type
)) == INTEGER_CST
15501 && maybe_ne (wi::to_widest (TYPE_SIZE (type
)),
15502 preferred_alignment
))
15505 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
15509 /* Return true if the vector misalignment factor is supported by the
15512 aarch64_builtin_support_vector_misalignment (machine_mode mode
,
15513 const_tree type
, int misalignment
,
15516 if (TARGET_SIMD
&& STRICT_ALIGNMENT
)
15518 /* Return if movmisalign pattern is not supported for this mode. */
15519 if (optab_handler (movmisalign_optab
, mode
) == CODE_FOR_nothing
)
15522 /* Misalignment factor is unknown at compile time. */
15523 if (misalignment
== -1)
15526 return default_builtin_support_vector_misalignment (mode
, type
, misalignment
,
15530 /* If VALS is a vector constant that can be loaded into a register
15531 using DUP, generate instructions to do so and return an RTX to
15532 assign to the register. Otherwise return NULL_RTX. */
15534 aarch64_simd_dup_constant (rtx vals
)
15536 machine_mode mode
= GET_MODE (vals
);
15537 machine_mode inner_mode
= GET_MODE_INNER (mode
);
15540 if (!const_vec_duplicate_p (vals
, &x
))
15543 /* We can load this constant by using DUP and a constant in a
15544 single ARM register. This will be cheaper than a vector
15546 x
= copy_to_mode_reg (inner_mode
, x
);
15547 return gen_vec_duplicate (mode
, x
);
15551 /* Generate code to load VALS, which is a PARALLEL containing only
15552 constants (for vec_init) or CONST_VECTOR, efficiently into a
15553 register. Returns an RTX to copy into the register, or NULL_RTX
15554 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
15556 aarch64_simd_make_constant (rtx vals
)
15558 machine_mode mode
= GET_MODE (vals
);
15560 rtx const_vec
= NULL_RTX
;
15564 if (GET_CODE (vals
) == CONST_VECTOR
)
15566 else if (GET_CODE (vals
) == PARALLEL
)
15568 /* A CONST_VECTOR must contain only CONST_INTs and
15569 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
15570 Only store valid constants in a CONST_VECTOR. */
15571 int n_elts
= XVECLEN (vals
, 0);
15572 for (i
= 0; i
< n_elts
; ++i
)
15574 rtx x
= XVECEXP (vals
, 0, i
);
15575 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
15578 if (n_const
== n_elts
)
15579 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0));
15582 gcc_unreachable ();
15584 if (const_vec
!= NULL_RTX
15585 && aarch64_simd_valid_immediate (const_vec
, NULL
))
15586 /* Load using MOVI/MVNI. */
15588 else if ((const_dup
= aarch64_simd_dup_constant (vals
)) != NULL_RTX
)
15589 /* Loaded using DUP. */
15591 else if (const_vec
!= NULL_RTX
)
15592 /* Load from constant pool. We cannot take advantage of single-cycle
15593 LD1 because we need a PC-relative addressing mode. */
15596 /* A PARALLEL containing something not valid inside CONST_VECTOR.
15597 We cannot construct an initializer. */
15601 /* Expand a vector initialisation sequence, such that TARGET is
15602 initialised to contain VALS. */
15605 aarch64_expand_vector_init (rtx target
, rtx vals
)
15607 machine_mode mode
= GET_MODE (target
);
15608 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
15609 /* The number of vector elements. */
15610 int n_elts
= XVECLEN (vals
, 0);
15611 /* The number of vector elements which are not constant. */
15613 rtx any_const
= NULL_RTX
;
15614 /* The first element of vals. */
15615 rtx v0
= XVECEXP (vals
, 0, 0);
15616 bool all_same
= true;
15618 /* This is a special vec_init<M><N> where N is not an element mode but a
15619 vector mode with half the elements of M. We expect to find two entries
15620 of mode N in VALS and we must put their concatentation into TARGET. */
15621 if (XVECLEN (vals
, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals
, 0, 0))))
15623 gcc_assert (known_eq (GET_MODE_SIZE (mode
),
15624 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals
, 0, 0)))));
15625 rtx lo
= XVECEXP (vals
, 0, 0);
15626 rtx hi
= XVECEXP (vals
, 0, 1);
15627 machine_mode narrow_mode
= GET_MODE (lo
);
15628 gcc_assert (GET_MODE_INNER (narrow_mode
) == inner_mode
);
15629 gcc_assert (narrow_mode
== GET_MODE (hi
));
15631 /* When we want to concatenate a half-width vector with zeroes we can
15632 use the aarch64_combinez[_be] patterns. Just make sure that the
15633 zeroes are in the right half. */
15634 if (BYTES_BIG_ENDIAN
15635 && aarch64_simd_imm_zero (lo
, narrow_mode
)
15636 && general_operand (hi
, narrow_mode
))
15637 emit_insn (gen_aarch64_combinez_be (narrow_mode
, target
, hi
, lo
));
15638 else if (!BYTES_BIG_ENDIAN
15639 && aarch64_simd_imm_zero (hi
, narrow_mode
)
15640 && general_operand (lo
, narrow_mode
))
15641 emit_insn (gen_aarch64_combinez (narrow_mode
, target
, lo
, hi
));
15644 /* Else create the two half-width registers and combine them. */
15646 lo
= force_reg (GET_MODE (lo
), lo
);
15648 hi
= force_reg (GET_MODE (hi
), hi
);
15650 if (BYTES_BIG_ENDIAN
)
15651 std::swap (lo
, hi
);
15652 emit_insn (gen_aarch64_simd_combine (narrow_mode
, target
, lo
, hi
));
15657 /* Count the number of variable elements to initialise. */
15658 for (int i
= 0; i
< n_elts
; ++i
)
15660 rtx x
= XVECEXP (vals
, 0, i
);
15661 if (!(CONST_INT_P (x
) || CONST_DOUBLE_P (x
)))
15666 all_same
&= rtx_equal_p (x
, v0
);
15669 /* No variable elements, hand off to aarch64_simd_make_constant which knows
15670 how best to handle this. */
15673 rtx constant
= aarch64_simd_make_constant (vals
);
15674 if (constant
!= NULL_RTX
)
15676 emit_move_insn (target
, constant
);
15681 /* Splat a single non-constant element if we can. */
15684 rtx x
= copy_to_mode_reg (inner_mode
, v0
);
15685 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
15689 enum insn_code icode
= optab_handler (vec_set_optab
, mode
);
15690 gcc_assert (icode
!= CODE_FOR_nothing
);
15692 /* If there are only variable elements, try to optimize
15693 the insertion using dup for the most common element
15694 followed by insertions. */
15696 /* The algorithm will fill matches[*][0] with the earliest matching element,
15697 and matches[X][1] with the count of duplicate elements (if X is the
15698 earliest element which has duplicates). */
15700 if (n_var
== n_elts
&& n_elts
<= 16)
15702 int matches
[16][2] = {0};
15703 for (int i
= 0; i
< n_elts
; i
++)
15705 for (int j
= 0; j
<= i
; j
++)
15707 if (rtx_equal_p (XVECEXP (vals
, 0, i
), XVECEXP (vals
, 0, j
)))
15715 int maxelement
= 0;
15717 for (int i
= 0; i
< n_elts
; i
++)
15718 if (matches
[i
][1] > maxv
)
15721 maxv
= matches
[i
][1];
15724 /* Create a duplicate of the most common element, unless all elements
15725 are equally useless to us, in which case just immediately set the
15726 vector register using the first element. */
15730 /* For vectors of two 64-bit elements, we can do even better. */
15732 && (inner_mode
== E_DImode
15733 || inner_mode
== E_DFmode
))
15736 rtx x0
= XVECEXP (vals
, 0, 0);
15737 rtx x1
= XVECEXP (vals
, 0, 1);
15738 /* Combine can pick up this case, but handling it directly
15739 here leaves clearer RTL.
15741 This is load_pair_lanes<mode>, and also gives us a clean-up
15742 for store_pair_lanes<mode>. */
15743 if (memory_operand (x0
, inner_mode
)
15744 && memory_operand (x1
, inner_mode
)
15745 && !STRICT_ALIGNMENT
15746 && rtx_equal_p (XEXP (x1
, 0),
15747 plus_constant (Pmode
,
15749 GET_MODE_SIZE (inner_mode
))))
15752 if (inner_mode
== DFmode
)
15753 t
= gen_load_pair_lanesdf (target
, x0
, x1
);
15755 t
= gen_load_pair_lanesdi (target
, x0
, x1
);
15760 /* The subreg-move sequence below will move into lane zero of the
15761 vector register. For big-endian we want that position to hold
15762 the last element of VALS. */
15763 maxelement
= BYTES_BIG_ENDIAN
? n_elts
- 1 : 0;
15764 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
15765 aarch64_emit_move (target
, lowpart_subreg (mode
, x
, inner_mode
));
15769 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
15770 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
15773 /* Insert the rest. */
15774 for (int i
= 0; i
< n_elts
; i
++)
15776 rtx x
= XVECEXP (vals
, 0, i
);
15777 if (matches
[i
][0] == maxelement
)
15779 x
= copy_to_mode_reg (inner_mode
, x
);
15780 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
15785 /* Initialise a vector which is part-variable. We want to first try
15786 to build those lanes which are constant in the most efficient way we
15788 if (n_var
!= n_elts
)
15790 rtx copy
= copy_rtx (vals
);
15792 /* Load constant part of vector. We really don't care what goes into the
15793 parts we will overwrite, but we're more likely to be able to load the
15794 constant efficiently if it has fewer, larger, repeating parts
15795 (see aarch64_simd_valid_immediate). */
15796 for (int i
= 0; i
< n_elts
; i
++)
15798 rtx x
= XVECEXP (vals
, 0, i
);
15799 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
15801 rtx subst
= any_const
;
15802 for (int bit
= n_elts
/ 2; bit
> 0; bit
/= 2)
15804 /* Look in the copied vector, as more elements are const. */
15805 rtx test
= XVECEXP (copy
, 0, i
^ bit
);
15806 if (CONST_INT_P (test
) || CONST_DOUBLE_P (test
))
15812 XVECEXP (copy
, 0, i
) = subst
;
15814 aarch64_expand_vector_init (target
, copy
);
15817 /* Insert the variable lanes directly. */
15818 for (int i
= 0; i
< n_elts
; i
++)
15820 rtx x
= XVECEXP (vals
, 0, i
);
15821 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
15823 x
= copy_to_mode_reg (inner_mode
, x
);
15824 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
15828 /* Emit RTL corresponding to:
15829 insr TARGET, ELEM. */
15832 emit_insr (rtx target
, rtx elem
)
15834 machine_mode mode
= GET_MODE (target
);
15835 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
15836 elem
= force_reg (elem_mode
, elem
);
15838 insn_code icode
= optab_handler (vec_shl_insert_optab
, mode
);
15839 gcc_assert (icode
!= CODE_FOR_nothing
);
15840 emit_insn (GEN_FCN (icode
) (target
, target
, elem
));
15843 /* Subroutine of aarch64_sve_expand_vector_init for handling
15844 trailing constants.
15845 This function works as follows:
15846 (a) Create a new vector consisting of trailing constants.
15847 (b) Initialize TARGET with the constant vector using emit_move_insn.
15848 (c) Insert remaining elements in TARGET using insr.
15849 NELTS is the total number of elements in original vector while
15850 while NELTS_REQD is the number of elements that are actually
15853 ??? The heuristic used is to do above only if number of constants
15854 is at least half the total number of elements. May need fine tuning. */
15857 aarch64_sve_expand_vector_init_handle_trailing_constants
15858 (rtx target
, const rtx_vector_builder
&builder
, int nelts
, int nelts_reqd
)
15860 machine_mode mode
= GET_MODE (target
);
15861 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
15862 int n_trailing_constants
= 0;
15864 for (int i
= nelts_reqd
- 1;
15865 i
>= 0 && aarch64_legitimate_constant_p (elem_mode
, builder
.elt (i
));
15867 n_trailing_constants
++;
15869 if (n_trailing_constants
>= nelts_reqd
/ 2)
15871 rtx_vector_builder
v (mode
, 1, nelts
);
15872 for (int i
= 0; i
< nelts
; i
++)
15873 v
.quick_push (builder
.elt (i
+ nelts_reqd
- n_trailing_constants
));
15874 rtx const_vec
= v
.build ();
15875 emit_move_insn (target
, const_vec
);
15877 for (int i
= nelts_reqd
- n_trailing_constants
- 1; i
>= 0; i
--)
15878 emit_insr (target
, builder
.elt (i
));
15886 /* Subroutine of aarch64_sve_expand_vector_init.
15888 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
15889 (b) Skip trailing elements from BUILDER, which are the same as
15890 element NELTS_REQD - 1.
15891 (c) Insert earlier elements in reverse order in TARGET using insr. */
15894 aarch64_sve_expand_vector_init_insert_elems (rtx target
,
15895 const rtx_vector_builder
&builder
,
15898 machine_mode mode
= GET_MODE (target
);
15899 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
15901 struct expand_operand ops
[2];
15902 enum insn_code icode
= optab_handler (vec_duplicate_optab
, mode
);
15903 gcc_assert (icode
!= CODE_FOR_nothing
);
15905 create_output_operand (&ops
[0], target
, mode
);
15906 create_input_operand (&ops
[1], builder
.elt (nelts_reqd
- 1), elem_mode
);
15907 expand_insn (icode
, 2, ops
);
15909 int ndups
= builder
.count_dups (nelts_reqd
- 1, -1, -1);
15910 for (int i
= nelts_reqd
- ndups
- 1; i
>= 0; i
--)
15911 emit_insr (target
, builder
.elt (i
));
15914 /* Subroutine of aarch64_sve_expand_vector_init to handle case
15915 when all trailing elements of builder are same.
15916 This works as follows:
15917 (a) Use expand_insn interface to broadcast last vector element in TARGET.
15918 (b) Insert remaining elements in TARGET using insr.
15920 ??? The heuristic used is to do above if number of same trailing elements
15921 is at least 3/4 of total number of elements, loosely based on
15922 heuristic from mostly_zeros_p. May need fine-tuning. */
15925 aarch64_sve_expand_vector_init_handle_trailing_same_elem
15926 (rtx target
, const rtx_vector_builder
&builder
, int nelts_reqd
)
15928 int ndups
= builder
.count_dups (nelts_reqd
- 1, -1, -1);
15929 if (ndups
>= (3 * nelts_reqd
) / 4)
15931 aarch64_sve_expand_vector_init_insert_elems (target
, builder
,
15932 nelts_reqd
- ndups
+ 1);
15939 /* Initialize register TARGET from BUILDER. NELTS is the constant number
15940 of elements in BUILDER.
15942 The function tries to initialize TARGET from BUILDER if it fits one
15943 of the special cases outlined below.
15945 Failing that, the function divides BUILDER into two sub-vectors:
15946 v_even = even elements of BUILDER;
15947 v_odd = odd elements of BUILDER;
15949 and recursively calls itself with v_even and v_odd.
15951 if (recursive call succeeded for v_even or v_odd)
15952 TARGET = zip (v_even, v_odd)
15954 The function returns true if it managed to build TARGET from BUILDER
15955 with one of the special cases, false otherwise.
15957 Example: {a, 1, b, 2, c, 3, d, 4}
15959 The vector gets divided into:
15960 v_even = {a, b, c, d}
15961 v_odd = {1, 2, 3, 4}
15963 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
15964 initialize tmp2 from constant vector v_odd using emit_move_insn.
15966 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
15967 4 elements, so we construct tmp1 from v_even using insr:
15974 TARGET = zip (tmp1, tmp2)
15975 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
15978 aarch64_sve_expand_vector_init (rtx target
, const rtx_vector_builder
&builder
,
15979 int nelts
, int nelts_reqd
)
15981 machine_mode mode
= GET_MODE (target
);
15983 /* Case 1: Vector contains trailing constants. */
15985 if (aarch64_sve_expand_vector_init_handle_trailing_constants
15986 (target
, builder
, nelts
, nelts_reqd
))
15989 /* Case 2: Vector contains leading constants. */
15991 rtx_vector_builder
rev_builder (mode
, 1, nelts_reqd
);
15992 for (int i
= 0; i
< nelts_reqd
; i
++)
15993 rev_builder
.quick_push (builder
.elt (nelts_reqd
- i
- 1));
15994 rev_builder
.finalize ();
15996 if (aarch64_sve_expand_vector_init_handle_trailing_constants
15997 (target
, rev_builder
, nelts
, nelts_reqd
))
15999 emit_insn (gen_aarch64_sve_rev (mode
, target
, target
));
16003 /* Case 3: Vector contains trailing same element. */
16005 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16006 (target
, builder
, nelts_reqd
))
16009 /* Case 4: Vector contains leading same element. */
16011 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16012 (target
, rev_builder
, nelts_reqd
) && nelts_reqd
== nelts
)
16014 emit_insn (gen_aarch64_sve_rev (mode
, target
, target
));
16018 /* Avoid recursing below 4-elements.
16019 ??? The threshold 4 may need fine-tuning. */
16021 if (nelts_reqd
<= 4)
16024 rtx_vector_builder
v_even (mode
, 1, nelts
);
16025 rtx_vector_builder
v_odd (mode
, 1, nelts
);
16027 for (int i
= 0; i
< nelts
* 2; i
+= 2)
16029 v_even
.quick_push (builder
.elt (i
));
16030 v_odd
.quick_push (builder
.elt (i
+ 1));
16033 v_even
.finalize ();
16036 rtx tmp1
= gen_reg_rtx (mode
);
16037 bool did_even_p
= aarch64_sve_expand_vector_init (tmp1
, v_even
,
16038 nelts
, nelts_reqd
/ 2);
16040 rtx tmp2
= gen_reg_rtx (mode
);
16041 bool did_odd_p
= aarch64_sve_expand_vector_init (tmp2
, v_odd
,
16042 nelts
, nelts_reqd
/ 2);
16044 if (!did_even_p
&& !did_odd_p
)
16047 /* Initialize v_even and v_odd using INSR if it didn't match any of the
16048 special cases and zip v_even, v_odd. */
16051 aarch64_sve_expand_vector_init_insert_elems (tmp1
, v_even
, nelts_reqd
/ 2);
16054 aarch64_sve_expand_vector_init_insert_elems (tmp2
, v_odd
, nelts_reqd
/ 2);
16056 rtvec v
= gen_rtvec (2, tmp1
, tmp2
);
16057 emit_set_insn (target
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
16061 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
16064 aarch64_sve_expand_vector_init (rtx target
, rtx vals
)
16066 machine_mode mode
= GET_MODE (target
);
16067 int nelts
= XVECLEN (vals
, 0);
16069 rtx_vector_builder
v (mode
, 1, nelts
);
16070 for (int i
= 0; i
< nelts
; i
++)
16071 v
.quick_push (XVECEXP (vals
, 0, i
));
16074 /* If neither sub-vectors of v could be initialized specially,
16075 then use INSR to insert all elements from v into TARGET.
16076 ??? This might not be optimal for vectors with large
16077 initializers like 16-element or above.
16078 For nelts < 4, it probably isn't useful to handle specially. */
16081 || !aarch64_sve_expand_vector_init (target
, v
, nelts
, nelts
))
16082 aarch64_sve_expand_vector_init_insert_elems (target
, v
, nelts
);
16085 static unsigned HOST_WIDE_INT
16086 aarch64_shift_truncation_mask (machine_mode mode
)
16088 if (!SHIFT_COUNT_TRUNCATED
|| aarch64_vector_data_mode_p (mode
))
16090 return GET_MODE_UNIT_BITSIZE (mode
) - 1;
16093 /* Select a format to encode pointers in exception handling data. */
16095 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED
, int global
)
16098 switch (aarch64_cmodel
)
16100 case AARCH64_CMODEL_TINY
:
16101 case AARCH64_CMODEL_TINY_PIC
:
16102 case AARCH64_CMODEL_SMALL
:
16103 case AARCH64_CMODEL_SMALL_PIC
:
16104 case AARCH64_CMODEL_SMALL_SPIC
:
16105 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
16107 type
= DW_EH_PE_sdata4
;
16110 /* No assumptions here. 8-byte relocs required. */
16111 type
= DW_EH_PE_sdata8
;
16114 return (global
? DW_EH_PE_indirect
: 0) | DW_EH_PE_pcrel
| type
;
16117 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
16120 aarch64_asm_output_variant_pcs (FILE *stream
, const tree decl
, const char* name
)
16122 if (aarch64_simd_decl_p (decl
))
16124 fprintf (stream
, "\t.variant_pcs\t");
16125 assemble_name (stream
, name
);
16126 fprintf (stream
, "\n");
/* The last .arch and .tune assembly strings that we printed, used to
   avoid emitting redundant directives when consecutive functions share
   the same architecture or tuning target.  */
static std::string aarch64_last_printed_arch_string;
static std::string aarch64_last_printed_tune_string;
16134 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
16135 by the function fndecl. */
16138 aarch64_declare_function_name (FILE *stream
, const char* name
,
16141 tree target_parts
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
16143 struct cl_target_option
*targ_options
;
16145 targ_options
= TREE_TARGET_OPTION (target_parts
);
16147 targ_options
= TREE_TARGET_OPTION (target_option_current_node
);
16148 gcc_assert (targ_options
);
16150 const struct processor
*this_arch
16151 = aarch64_get_arch (targ_options
->x_explicit_arch
);
16153 uint64_t isa_flags
= targ_options
->x_aarch64_isa_flags
;
16154 std::string extension
16155 = aarch64_get_extension_string_for_isa_flags (isa_flags
,
16157 /* Only update the assembler .arch string if it is distinct from the last
16158 such string we printed. */
16159 std::string to_print
= this_arch
->name
+ extension
;
16160 if (to_print
!= aarch64_last_printed_arch_string
)
16162 asm_fprintf (asm_out_file
, "\t.arch %s\n", to_print
.c_str ());
16163 aarch64_last_printed_arch_string
= to_print
;
16166 /* Print the cpu name we're tuning for in the comments, might be
16167 useful to readers of the generated asm. Do it only when it changes
16168 from function to function and verbose assembly is requested. */
16169 const struct processor
*this_tune
16170 = aarch64_get_tune_cpu (targ_options
->x_explicit_tune_core
);
16172 if (flag_debug_asm
&& aarch64_last_printed_tune_string
!= this_tune
->name
)
16174 asm_fprintf (asm_out_file
, "\t" ASM_COMMENT_START
".tune %s\n",
16176 aarch64_last_printed_tune_string
= this_tune
->name
;
16179 aarch64_asm_output_variant_pcs (stream
, fndecl
, name
);
16181 /* Don't forget the type directive for ELF. */
16182 ASM_OUTPUT_TYPE_DIRECTIVE (stream
, name
, "function");
16183 ASM_OUTPUT_LABEL (stream
, name
);
16186 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
16189 aarch64_asm_output_alias (FILE *stream
, const tree decl
, const tree target
)
16191 const char *name
= XSTR (XEXP (DECL_RTL (decl
), 0), 0);
16192 const char *value
= IDENTIFIER_POINTER (target
);
16193 aarch64_asm_output_variant_pcs (stream
, decl
, name
);
16194 ASM_OUTPUT_DEF (stream
, name
, value
);
16197 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
16198 function symbol references. */
16201 aarch64_asm_output_external (FILE *stream
, tree decl
, const char* name
)
16203 default_elf_asm_output_external (stream
, decl
, name
);
16204 aarch64_asm_output_variant_pcs (stream
, decl
, name
);
16207 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
16208 Used to output the .cfi_b_key_frame directive when signing the current
16209 function with the B key. */
16212 aarch64_post_cfi_startproc (FILE *f
, tree ignored ATTRIBUTE_UNUSED
)
16214 if (cfun
->machine
->frame
.laid_out
&& aarch64_return_address_signing_enabled ()
16215 && aarch64_ra_sign_key
== AARCH64_KEY_B
)
16216 asm_fprintf (f
, "\t.cfi_b_key_frame\n");
16219 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
16222 aarch64_start_file (void)
16224 struct cl_target_option
*default_options
16225 = TREE_TARGET_OPTION (target_option_default_node
);
16227 const struct processor
*default_arch
16228 = aarch64_get_arch (default_options
->x_explicit_arch
);
16229 uint64_t default_isa_flags
= default_options
->x_aarch64_isa_flags
;
16230 std::string extension
16231 = aarch64_get_extension_string_for_isa_flags (default_isa_flags
,
16232 default_arch
->flags
);
16234 aarch64_last_printed_arch_string
= default_arch
->name
+ extension
;
16235 aarch64_last_printed_tune_string
= "";
16236 asm_fprintf (asm_out_file
, "\t.arch %s\n",
16237 aarch64_last_printed_arch_string
.c_str ());
16239 default_file_start ();
16242 /* Emit load exclusive. */
16245 aarch64_emit_load_exclusive (machine_mode mode
, rtx rval
,
16246 rtx mem
, rtx model_rtx
)
16248 emit_insn (gen_aarch64_load_exclusive (mode
, rval
, mem
, model_rtx
));
16251 /* Emit store exclusive. */
16254 aarch64_emit_store_exclusive (machine_mode mode
, rtx bval
,
16255 rtx rval
, rtx mem
, rtx model_rtx
)
16257 emit_insn (gen_aarch64_store_exclusive (mode
, bval
, rval
, mem
, model_rtx
));
16260 /* Mark the previous jump instruction as unlikely. */
16263 aarch64_emit_unlikely_jump (rtx insn
)
16265 rtx_insn
*jump
= emit_jump_insn (insn
);
16266 add_reg_br_prob_note (jump
, profile_probability::very_unlikely ());
16269 /* Expand a compare and swap pattern. */
16272 aarch64_expand_compare_and_swap (rtx operands
[])
16274 rtx bval
, rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
, x
, cc_reg
;
16275 machine_mode mode
, r_mode
;
16277 bval
= operands
[0];
16278 rval
= operands
[1];
16280 oldval
= operands
[3];
16281 newval
= operands
[4];
16282 is_weak
= operands
[5];
16283 mod_s
= operands
[6];
16284 mod_f
= operands
[7];
16285 mode
= GET_MODE (mem
);
16287 /* Normally the succ memory model must be stronger than fail, but in the
16288 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
16289 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
16290 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f
)))
16291 && is_mm_release (memmodel_from_int (INTVAL (mod_s
))))
16292 mod_s
= GEN_INT (MEMMODEL_ACQ_REL
);
16295 if (mode
== QImode
|| mode
== HImode
)
16298 rval
= gen_reg_rtx (r_mode
);
16303 /* The CAS insn requires oldval and rval overlap, but we need to
16304 have a copy of oldval saved across the operation to tell if
16305 the operation is successful. */
16306 if (reg_overlap_mentioned_p (rval
, oldval
))
16307 rval
= copy_to_mode_reg (r_mode
, oldval
);
16309 emit_move_insn (rval
, gen_lowpart (r_mode
, oldval
));
16311 emit_insn (gen_aarch64_compare_and_swap_lse (mode
, rval
, mem
,
16313 cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
16317 /* The oldval predicate varies by mode. Test it and force to reg. */
16318 insn_code code
= code_for_aarch64_compare_and_swap (mode
);
16319 if (!insn_data
[code
].operand
[2].predicate (oldval
, mode
))
16320 oldval
= force_reg (mode
, oldval
);
16322 emit_insn (GEN_FCN (code
) (rval
, mem
, oldval
, newval
,
16323 is_weak
, mod_s
, mod_f
));
16324 cc_reg
= gen_rtx_REG (CCmode
, CC_REGNUM
);
16327 if (r_mode
!= mode
)
16328 rval
= gen_lowpart (mode
, rval
);
16329 emit_move_insn (operands
[1], rval
);
16331 x
= gen_rtx_EQ (SImode
, cc_reg
, const0_rtx
);
16332 emit_insn (gen_rtx_SET (bval
, x
));
16335 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
16336 sequence implementing an atomic operation. */
16339 aarch64_emit_post_barrier (enum memmodel model
)
16341 const enum memmodel base_model
= memmodel_base (model
);
16343 if (is_mm_sync (model
)
16344 && (base_model
== MEMMODEL_ACQUIRE
16345 || base_model
== MEMMODEL_ACQ_REL
16346 || base_model
== MEMMODEL_SEQ_CST
))
16348 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST
)));
16352 /* Split a compare and swap pattern. */
16355 aarch64_split_compare_and_swap (rtx operands
[])
16357 rtx rval
, mem
, oldval
, newval
, scratch
;
16360 rtx_code_label
*label1
, *label2
;
16362 enum memmodel model
;
16365 rval
= operands
[0];
16367 oldval
= operands
[2];
16368 newval
= operands
[3];
16369 is_weak
= (operands
[4] != const0_rtx
);
16370 model_rtx
= operands
[5];
16371 scratch
= operands
[7];
16372 mode
= GET_MODE (mem
);
16373 model
= memmodel_from_int (INTVAL (model_rtx
));
16375 /* When OLDVAL is zero and we want the strong version we can emit a tighter
16378 LD[A]XR rval, [mem]
16380 ST[L]XR scratch, newval, [mem]
16381 CBNZ scratch, .label1
16384 bool strong_zero_p
= !is_weak
&& oldval
== const0_rtx
;
16389 label1
= gen_label_rtx ();
16390 emit_label (label1
);
16392 label2
= gen_label_rtx ();
16394 /* The initial load can be relaxed for a __sync operation since a final
16395 barrier will be emitted to stop code hoisting. */
16396 if (is_mm_sync (model
))
16397 aarch64_emit_load_exclusive (mode
, rval
, mem
,
16398 GEN_INT (MEMMODEL_RELAXED
));
16400 aarch64_emit_load_exclusive (mode
, rval
, mem
, model_rtx
);
16404 if (aarch64_track_speculation
)
16406 /* Emit an explicit compare instruction, so that we can correctly
16407 track the condition codes. */
16408 rtx cc_reg
= aarch64_gen_compare_reg (NE
, rval
, const0_rtx
);
16409 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
16412 x
= gen_rtx_NE (VOIDmode
, rval
, const0_rtx
);
16414 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
16415 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
16416 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
16420 cond
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
16421 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
16422 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
16423 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
16424 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
16427 aarch64_emit_store_exclusive (mode
, scratch
, mem
, newval
, model_rtx
);
16431 if (aarch64_track_speculation
)
16433 /* Emit an explicit compare instruction, so that we can correctly
16434 track the condition codes. */
16435 rtx cc_reg
= aarch64_gen_compare_reg (NE
, scratch
, const0_rtx
);
16436 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
16439 x
= gen_rtx_NE (VOIDmode
, scratch
, const0_rtx
);
16441 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
16442 gen_rtx_LABEL_REF (Pmode
, label1
), pc_rtx
);
16443 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
16447 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
16448 x
= gen_rtx_COMPARE (CCmode
, scratch
, const0_rtx
);
16449 emit_insn (gen_rtx_SET (cond
, x
));
16452 emit_label (label2
);
16453 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
16454 to set the condition flags. If this is not used it will be removed by
16458 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
16459 x
= gen_rtx_COMPARE (CCmode
, rval
, const0_rtx
);
16460 emit_insn (gen_rtx_SET (cond
, x
));
16462 /* Emit any final barrier needed for a __sync operation. */
16463 if (is_mm_sync (model
))
16464 aarch64_emit_post_barrier (model
);
16467 /* Split an atomic operation. */
16470 aarch64_split_atomic_op (enum rtx_code code
, rtx old_out
, rtx new_out
, rtx mem
,
16471 rtx value
, rtx model_rtx
, rtx cond
)
16473 machine_mode mode
= GET_MODE (mem
);
16474 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
16475 const enum memmodel model
= memmodel_from_int (INTVAL (model_rtx
));
16476 const bool is_sync
= is_mm_sync (model
);
16477 rtx_code_label
*label
;
16480 /* Split the atomic operation into a sequence. */
16481 label
= gen_label_rtx ();
16482 emit_label (label
);
16485 new_out
= gen_lowpart (wmode
, new_out
);
16487 old_out
= gen_lowpart (wmode
, old_out
);
16490 value
= simplify_gen_subreg (wmode
, value
, mode
, 0);
16492 /* The initial load can be relaxed for a __sync operation since a final
16493 barrier will be emitted to stop code hoisting. */
16495 aarch64_emit_load_exclusive (mode
, old_out
, mem
,
16496 GEN_INT (MEMMODEL_RELAXED
));
16498 aarch64_emit_load_exclusive (mode
, old_out
, mem
, model_rtx
);
16507 x
= gen_rtx_AND (wmode
, old_out
, value
);
16508 emit_insn (gen_rtx_SET (new_out
, x
));
16509 x
= gen_rtx_NOT (wmode
, new_out
);
16510 emit_insn (gen_rtx_SET (new_out
, x
));
16514 if (CONST_INT_P (value
))
16516 value
= GEN_INT (-INTVAL (value
));
16519 /* Fall through. */
16522 x
= gen_rtx_fmt_ee (code
, wmode
, old_out
, value
);
16523 emit_insn (gen_rtx_SET (new_out
, x
));
16527 aarch64_emit_store_exclusive (mode
, cond
, mem
,
16528 gen_lowpart (mode
, new_out
), model_rtx
);
16530 if (aarch64_track_speculation
)
16532 /* Emit an explicit compare instruction, so that we can correctly
16533 track the condition codes. */
16534 rtx cc_reg
= aarch64_gen_compare_reg (NE
, cond
, const0_rtx
);
16535 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
16538 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
16540 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
16541 gen_rtx_LABEL_REF (Pmode
, label
), pc_rtx
);
16542 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
16544 /* Emit any final barrier needed for a __sync operation. */
16546 aarch64_emit_post_barrier (model
);
16550 aarch64_init_libfuncs (void)
16552 /* Half-precision float operations. The compiler handles all operations
16553 with NULL libfuncs by converting to SFmode. */
16556 set_conv_libfunc (trunc_optab
, HFmode
, SFmode
, "__gnu_f2h_ieee");
16557 set_conv_libfunc (sext_optab
, SFmode
, HFmode
, "__gnu_h2f_ieee");
16560 set_optab_libfunc (add_optab
, HFmode
, NULL
);
16561 set_optab_libfunc (sdiv_optab
, HFmode
, NULL
);
16562 set_optab_libfunc (smul_optab
, HFmode
, NULL
);
16563 set_optab_libfunc (neg_optab
, HFmode
, NULL
);
16564 set_optab_libfunc (sub_optab
, HFmode
, NULL
);
16567 set_optab_libfunc (eq_optab
, HFmode
, NULL
);
16568 set_optab_libfunc (ne_optab
, HFmode
, NULL
);
16569 set_optab_libfunc (lt_optab
, HFmode
, NULL
);
16570 set_optab_libfunc (le_optab
, HFmode
, NULL
);
16571 set_optab_libfunc (ge_optab
, HFmode
, NULL
);
16572 set_optab_libfunc (gt_optab
, HFmode
, NULL
);
16573 set_optab_libfunc (unord_optab
, HFmode
, NULL
);
16576 /* Target hook for c_mode_for_suffix. */
16577 static machine_mode
16578 aarch64_c_mode_for_suffix (char suffix
)
16586 /* We can only represent floating point constants which will fit in
16587 "quarter-precision" values. These values are characterised by
16588 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
16591 (-1)^s * (n/16) * 2^r
16594 's' is the sign bit.
16595 'n' is an integer in the range 16 <= n <= 31.
16596 'r' is an integer in the range -3 <= r <= 4. */
16598 /* Return true iff X can be represented by a quarter-precision
16599 floating point immediate operand X. Note, we cannot represent 0.0. */
16601 aarch64_float_const_representable_p (rtx x
)
16603 /* This represents our current view of how many bits
16604 make up the mantissa. */
16605 int point_pos
= 2 * HOST_BITS_PER_WIDE_INT
- 1;
16607 unsigned HOST_WIDE_INT mantissa
, mask
;
16608 REAL_VALUE_TYPE r
, m
;
16611 if (!CONST_DOUBLE_P (x
))
16614 if (GET_MODE (x
) == VOIDmode
16615 || (GET_MODE (x
) == HFmode
&& !TARGET_FP_F16INST
))
16618 r
= *CONST_DOUBLE_REAL_VALUE (x
);
16620 /* We cannot represent infinities, NaNs or +/-zero. We won't
16621 know if we have +zero until we analyse the mantissa, but we
16622 can reject the other invalid values. */
16623 if (REAL_VALUE_ISINF (r
) || REAL_VALUE_ISNAN (r
)
16624 || REAL_VALUE_MINUS_ZERO (r
))
16627 /* Extract exponent. */
16628 r
= real_value_abs (&r
);
16629 exponent
= REAL_EXP (&r
);
16631 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
16632 highest (sign) bit, with a fixed binary point at bit point_pos.
16633 m1 holds the low part of the mantissa, m2 the high part.
16634 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
16635 bits for the mantissa, this can fail (low bits will be lost). */
16636 real_ldexp (&m
, &r
, point_pos
- exponent
);
16637 wide_int w
= real_to_integer (&m
, &fail
, HOST_BITS_PER_WIDE_INT
* 2);
16639 /* If the low part of the mantissa has bits set we cannot represent
16641 if (w
.ulow () != 0)
16643 /* We have rejected the lower HOST_WIDE_INT, so update our
16644 understanding of how many bits lie in the mantissa and
16645 look only at the high HOST_WIDE_INT. */
16646 mantissa
= w
.elt (1);
16647 point_pos
-= HOST_BITS_PER_WIDE_INT
;
16649 /* We can only represent values with a mantissa of the form 1.xxxx. */
16650 mask
= ((unsigned HOST_WIDE_INT
)1 << (point_pos
- 5)) - 1;
16651 if ((mantissa
& mask
) != 0)
16654 /* Having filtered unrepresentable values, we may now remove all
16655 but the highest 5 bits. */
16656 mantissa
>>= point_pos
- 5;
16658 /* We cannot represent the value 0.0, so reject it. This is handled
16663 /* Then, as bit 4 is always set, we can mask it off, leaving
16664 the mantissa in the range [0, 15]. */
16665 mantissa
&= ~(1 << 4);
16666 gcc_assert (mantissa
<= 15);
16668 /* GCC internally does not use IEEE754-like encoding (where normalized
16669 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
16670 Our mantissa values are shifted 4 places to the left relative to
16671 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
16672 by 5 places to correct for GCC's representation. */
16673 exponent
= 5 - exponent
;
16675 return (exponent
>= 0 && exponent
<= 7);
16678 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
16679 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
16680 output MOVI/MVNI, ORR or BIC immediate. */
16682 aarch64_output_simd_mov_immediate (rtx const_vector
, unsigned width
,
16683 enum simd_immediate_check which
)
16686 static char templ
[40];
16687 const char *mnemonic
;
16688 const char *shift_op
;
16689 unsigned int lane_count
= 0;
16692 struct simd_immediate_info info
;
16694 /* This will return true to show const_vector is legal for use as either
16695 a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
16696 It will also update INFO to show how the immediate should be generated.
16697 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
16698 is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
, which
);
16699 gcc_assert (is_valid
);
16701 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
16702 lane_count
= width
/ GET_MODE_BITSIZE (info
.elt_mode
);
16704 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
16706 gcc_assert (info
.insn
== simd_immediate_info::MOV
16707 && info
.u
.mov
.shift
== 0);
16708 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
16709 move immediate path. */
16710 if (aarch64_float_const_zero_rtx_p (info
.u
.mov
.value
))
16711 info
.u
.mov
.value
= GEN_INT (0);
16714 const unsigned int buf_size
= 20;
16715 char float_buf
[buf_size
] = {'\0'};
16716 real_to_decimal_for_mode (float_buf
,
16717 CONST_DOUBLE_REAL_VALUE (info
.u
.mov
.value
),
16718 buf_size
, buf_size
, 1, info
.elt_mode
);
16720 if (lane_count
== 1)
16721 snprintf (templ
, sizeof (templ
), "fmov\t%%d0, %s", float_buf
);
16723 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%d%c, %s",
16724 lane_count
, element_char
, float_buf
);
16729 gcc_assert (CONST_INT_P (info
.u
.mov
.value
));
16731 if (which
== AARCH64_CHECK_MOV
)
16733 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "mvni" : "movi";
16734 shift_op
= (info
.u
.mov
.modifier
== simd_immediate_info::MSL
16736 if (lane_count
== 1)
16737 snprintf (templ
, sizeof (templ
), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX
,
16738 mnemonic
, UINTVAL (info
.u
.mov
.value
));
16739 else if (info
.u
.mov
.shift
)
16740 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
16741 HOST_WIDE_INT_PRINT_HEX
", %s %d", mnemonic
, lane_count
,
16742 element_char
, UINTVAL (info
.u
.mov
.value
), shift_op
,
16745 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
16746 HOST_WIDE_INT_PRINT_HEX
, mnemonic
, lane_count
,
16747 element_char
, UINTVAL (info
.u
.mov
.value
));
16751 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
16752 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "bic" : "orr";
16753 if (info
.u
.mov
.shift
)
16754 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
16755 HOST_WIDE_INT_PRINT_DEC
", %s #%d", mnemonic
, lane_count
,
16756 element_char
, UINTVAL (info
.u
.mov
.value
), "lsl",
16759 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
16760 HOST_WIDE_INT_PRINT_DEC
, mnemonic
, lane_count
,
16761 element_char
, UINTVAL (info
.u
.mov
.value
));
16767 aarch64_output_scalar_simd_mov_immediate (rtx immediate
, scalar_int_mode mode
)
16770 /* If a floating point number was passed and we desire to use it in an
16771 integer mode do the conversion to integer. */
16772 if (CONST_DOUBLE_P (immediate
) && GET_MODE_CLASS (mode
) == MODE_INT
)
16774 unsigned HOST_WIDE_INT ival
;
16775 if (!aarch64_reinterpret_float_as_int (immediate
, &ival
))
16776 gcc_unreachable ();
16777 immediate
= gen_int_mode (ival
, mode
);
16780 machine_mode vmode
;
16781 /* use a 64 bit mode for everything except for DI/DF mode, where we use
16782 a 128 bit vector mode. */
16783 int width
= GET_MODE_BITSIZE (mode
) == 64 ? 128 : 64;
16785 vmode
= aarch64_simd_container_mode (mode
, width
);
16786 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (immediate
));
16787 return aarch64_output_simd_mov_immediate (v_op
, width
);
16790 /* Return the output string to use for moving immediate CONST_VECTOR
16791 into an SVE register. */
16794 aarch64_output_sve_mov_immediate (rtx const_vector
)
16796 static char templ
[40];
16797 struct simd_immediate_info info
;
16800 bool is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
);
16801 gcc_assert (is_valid
);
16803 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
16805 machine_mode vec_mode
= GET_MODE (const_vector
);
16806 if (aarch64_sve_pred_mode_p (vec_mode
))
16808 static char buf
[sizeof ("ptrue\t%0.N, vlNNNNN")];
16809 if (info
.insn
== simd_immediate_info::MOV
)
16811 gcc_assert (info
.u
.mov
.value
== const0_rtx
);
16812 snprintf (buf
, sizeof (buf
), "pfalse\t%%0.b");
16816 gcc_assert (info
.insn
== simd_immediate_info::PTRUE
);
16817 unsigned int total_bytes
;
16818 if (info
.u
.pattern
== AARCH64_SV_ALL
16819 && BYTES_PER_SVE_VECTOR
.is_constant (&total_bytes
))
16820 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, vl%d", element_char
,
16821 total_bytes
/ GET_MODE_SIZE (info
.elt_mode
));
16823 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, %s", element_char
,
16824 svpattern_token (info
.u
.pattern
));
16829 if (info
.insn
== simd_immediate_info::INDEX
)
16831 snprintf (templ
, sizeof (templ
), "index\t%%0.%c, #"
16832 HOST_WIDE_INT_PRINT_DEC
", #" HOST_WIDE_INT_PRINT_DEC
,
16833 element_char
, INTVAL (info
.u
.index
.base
),
16834 INTVAL (info
.u
.index
.step
));
16838 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
16840 if (aarch64_float_const_zero_rtx_p (info
.u
.mov
.value
))
16841 info
.u
.mov
.value
= GEN_INT (0);
16844 const int buf_size
= 20;
16845 char float_buf
[buf_size
] = {};
16846 real_to_decimal_for_mode (float_buf
,
16847 CONST_DOUBLE_REAL_VALUE (info
.u
.mov
.value
),
16848 buf_size
, buf_size
, 1, info
.elt_mode
);
16850 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%c, #%s",
16851 element_char
, float_buf
);
16856 snprintf (templ
, sizeof (templ
), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC
,
16857 element_char
, INTVAL (info
.u
.mov
.value
));
16861 /* Split operands into moves from op[1] + op[2] into op[0]. */
16864 aarch64_split_combinev16qi (rtx operands
[3])
16866 unsigned int dest
= REGNO (operands
[0]);
16867 unsigned int src1
= REGNO (operands
[1]);
16868 unsigned int src2
= REGNO (operands
[2]);
16869 machine_mode halfmode
= GET_MODE (operands
[1]);
16870 unsigned int halfregs
= REG_NREGS (operands
[1]);
16871 rtx destlo
, desthi
;
16873 gcc_assert (halfmode
== V16QImode
);
16875 if (src1
== dest
&& src2
== dest
+ halfregs
)
16877 /* No-op move. Can't split to nothing; emit something. */
16878 emit_note (NOTE_INSN_DELETED
);
16882 /* Preserve register attributes for variable tracking. */
16883 destlo
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
, 0);
16884 desthi
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
+ halfregs
,
16885 GET_MODE_SIZE (halfmode
));
16887 /* Special case of reversed high/low parts. */
16888 if (reg_overlap_mentioned_p (operands
[2], destlo
)
16889 && reg_overlap_mentioned_p (operands
[1], desthi
))
16891 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
16892 emit_insn (gen_xorv16qi3 (operands
[2], operands
[1], operands
[2]));
16893 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
16895 else if (!reg_overlap_mentioned_p (operands
[2], destlo
))
16897 /* Try to avoid unnecessary moves if part of the result
16898 is in the right place already. */
16900 emit_move_insn (destlo
, operands
[1]);
16901 if (src2
!= dest
+ halfregs
)
16902 emit_move_insn (desthi
, operands
[2]);
16906 if (src2
!= dest
+ halfregs
)
16907 emit_move_insn (desthi
, operands
[2]);
16909 emit_move_insn (destlo
, operands
[1]);
16913 /* vec_perm support. */
16915 struct expand_vec_perm_d
16917 rtx target
, op0
, op1
;
16918 vec_perm_indices perm
;
16919 machine_mode vmode
;
16920 unsigned int vec_flags
;
16925 /* Generate a variable permutation. */
16928 aarch64_expand_vec_perm_1 (rtx target
, rtx op0
, rtx op1
, rtx sel
)
16930 machine_mode vmode
= GET_MODE (target
);
16931 bool one_vector_p
= rtx_equal_p (op0
, op1
);
16933 gcc_checking_assert (vmode
== V8QImode
|| vmode
== V16QImode
);
16934 gcc_checking_assert (GET_MODE (op0
) == vmode
);
16935 gcc_checking_assert (GET_MODE (op1
) == vmode
);
16936 gcc_checking_assert (GET_MODE (sel
) == vmode
);
16937 gcc_checking_assert (TARGET_SIMD
);
16941 if (vmode
== V8QImode
)
16943 /* Expand the argument to a V16QI mode by duplicating it. */
16944 rtx pair
= gen_reg_rtx (V16QImode
);
16945 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op0
));
16946 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
16950 emit_insn (gen_aarch64_tbl1v16qi (target
, op0
, sel
));
16957 if (vmode
== V8QImode
)
16959 pair
= gen_reg_rtx (V16QImode
);
16960 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op1
));
16961 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
16965 pair
= gen_reg_rtx (OImode
);
16966 emit_insn (gen_aarch64_combinev16qi (pair
, op0
, op1
));
16967 emit_insn (gen_aarch64_tbl2v16qi (target
, pair
, sel
));
16972 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
16973 NELT is the number of elements in the vector. */
16976 aarch64_expand_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
,
16979 machine_mode vmode
= GET_MODE (target
);
16980 bool one_vector_p
= rtx_equal_p (op0
, op1
);
16983 /* The TBL instruction does not use a modulo index, so we must take care
16984 of that ourselves. */
16985 mask
= aarch64_simd_gen_const_vector_dup (vmode
,
16986 one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
16987 sel
= expand_simple_binop (vmode
, AND
, sel
, mask
, NULL
, 0, OPTAB_LIB_WIDEN
);
16989 /* For big-endian, we also need to reverse the index within the vector
16990 (but not which vector). */
16991 if (BYTES_BIG_ENDIAN
)
16993 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
16995 mask
= aarch64_simd_gen_const_vector_dup (vmode
, nelt
- 1);
16996 sel
= expand_simple_binop (vmode
, XOR
, sel
, mask
,
16997 NULL
, 0, OPTAB_LIB_WIDEN
);
16999 aarch64_expand_vec_perm_1 (target
, op0
, op1
, sel
);
17002 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
17005 emit_unspec2 (rtx target
, int code
, rtx op0
, rtx op1
)
17007 emit_insn (gen_rtx_SET (target
,
17008 gen_rtx_UNSPEC (GET_MODE (target
),
17009 gen_rtvec (2, op0
, op1
), code
)));
17012 /* Expand an SVE vec_perm with the given operands. */
17015 aarch64_expand_sve_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
)
17017 machine_mode data_mode
= GET_MODE (target
);
17018 machine_mode sel_mode
= GET_MODE (sel
);
17019 /* Enforced by the pattern condition. */
17020 int nunits
= GET_MODE_NUNITS (sel_mode
).to_constant ();
17022 /* Note: vec_perm indices are supposed to wrap when they go beyond the
17023 size of the two value vectors, i.e. the upper bits of the indices
17024 are effectively ignored. SVE TBL instead produces 0 for any
17025 out-of-range indices, so we need to modulo all the vec_perm indices
17026 to ensure they are all in range. */
17027 rtx sel_reg
= force_reg (sel_mode
, sel
);
17029 /* Check if the sel only references the first values vector. */
17030 if (GET_CODE (sel
) == CONST_VECTOR
17031 && aarch64_const_vec_all_in_range_p (sel
, 0, nunits
- 1))
17033 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_reg
);
17037 /* Check if the two values vectors are the same. */
17038 if (rtx_equal_p (op0
, op1
))
17040 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
, nunits
- 1);
17041 rtx sel_mod
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
17042 NULL
, 0, OPTAB_DIRECT
);
17043 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_mod
);
17047 /* Run TBL on for each value vector and combine the results. */
17049 rtx res0
= gen_reg_rtx (data_mode
);
17050 rtx res1
= gen_reg_rtx (data_mode
);
17051 rtx neg_num_elems
= aarch64_simd_gen_const_vector_dup (sel_mode
, -nunits
);
17052 if (GET_CODE (sel
) != CONST_VECTOR
17053 || !aarch64_const_vec_all_in_range_p (sel
, 0, 2 * nunits
- 1))
17055 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
,
17057 sel_reg
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
17058 NULL
, 0, OPTAB_DIRECT
);
17060 emit_unspec2 (res0
, UNSPEC_TBL
, op0
, sel_reg
);
17061 rtx sel_sub
= expand_simple_binop (sel_mode
, PLUS
, sel_reg
, neg_num_elems
,
17062 NULL
, 0, OPTAB_DIRECT
);
17063 emit_unspec2 (res1
, UNSPEC_TBL
, op1
, sel_sub
);
17064 if (GET_MODE_CLASS (data_mode
) == MODE_VECTOR_INT
)
17065 emit_insn (gen_rtx_SET (target
, gen_rtx_IOR (data_mode
, res0
, res1
)));
17067 emit_unspec2 (target
, UNSPEC_IORF
, res0
, res1
);
17070 /* Recognize patterns suitable for the TRN instructions. */
17072 aarch64_evpc_trn (struct expand_vec_perm_d
*d
)
17075 poly_uint64 nelt
= d
->perm
.length ();
17076 rtx out
, in0
, in1
, x
;
17077 machine_mode vmode
= d
->vmode
;
17079 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
17082 /* Note that these are little-endian tests.
17083 We correct for big-endian later. */
17084 if (!d
->perm
[0].is_constant (&odd
)
17085 || (odd
!= 0 && odd
!= 1)
17086 || !d
->perm
.series_p (0, 2, odd
, 2)
17087 || !d
->perm
.series_p (1, 2, nelt
+ odd
, 2))
17096 /* We don't need a big-endian lane correction for SVE; see the comment
17097 at the head of aarch64-sve.md for details. */
17098 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
17100 x
= in0
, in0
= in1
, in1
= x
;
17105 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
17106 odd
? UNSPEC_TRN2
: UNSPEC_TRN1
));
17110 /* Recognize patterns suitable for the UZP instructions. */
17112 aarch64_evpc_uzp (struct expand_vec_perm_d
*d
)
17115 rtx out
, in0
, in1
, x
;
17116 machine_mode vmode
= d
->vmode
;
17118 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
17121 /* Note that these are little-endian tests.
17122 We correct for big-endian later. */
17123 if (!d
->perm
[0].is_constant (&odd
)
17124 || (odd
!= 0 && odd
!= 1)
17125 || !d
->perm
.series_p (0, 1, odd
, 2))
17134 /* We don't need a big-endian lane correction for SVE; see the comment
17135 at the head of aarch64-sve.md for details. */
17136 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
17138 x
= in0
, in0
= in1
, in1
= x
;
17143 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
17144 odd
? UNSPEC_UZP2
: UNSPEC_UZP1
));
17148 /* Recognize patterns suitable for the ZIP instructions. */
17150 aarch64_evpc_zip (struct expand_vec_perm_d
*d
)
17153 poly_uint64 nelt
= d
->perm
.length ();
17154 rtx out
, in0
, in1
, x
;
17155 machine_mode vmode
= d
->vmode
;
17157 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
17160 /* Note that these are little-endian tests.
17161 We correct for big-endian later. */
17162 poly_uint64 first
= d
->perm
[0];
17163 if ((maybe_ne (first
, 0U) && maybe_ne (first
* 2, nelt
))
17164 || !d
->perm
.series_p (0, 2, first
, 1)
17165 || !d
->perm
.series_p (1, 2, first
+ nelt
, 1))
17167 high
= maybe_ne (first
, 0U);
17175 /* We don't need a big-endian lane correction for SVE; see the comment
17176 at the head of aarch64-sve.md for details. */
17177 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
17179 x
= in0
, in0
= in1
, in1
= x
;
17184 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
17185 high
? UNSPEC_ZIP2
: UNSPEC_ZIP1
));
17189 /* Recognize patterns for the EXT insn. */
17192 aarch64_evpc_ext (struct expand_vec_perm_d
*d
)
17194 HOST_WIDE_INT location
;
17197 /* The first element always refers to the first vector.
17198 Check if the extracted indices are increasing by one. */
17199 if (d
->vec_flags
== VEC_SVE_PRED
17200 || !d
->perm
[0].is_constant (&location
)
17201 || !d
->perm
.series_p (0, 1, location
, 1))
17208 /* The case where (location == 0) is a no-op for both big- and little-endian,
17209 and is removed by the mid-end at optimization levels -O1 and higher.
17211 We don't need a big-endian lane correction for SVE; see the comment
17212 at the head of aarch64-sve.md for details. */
17213 if (BYTES_BIG_ENDIAN
&& location
!= 0 && d
->vec_flags
== VEC_ADVSIMD
)
17215 /* After setup, we want the high elements of the first vector (stored
17216 at the LSB end of the register), and the low elements of the second
17217 vector (stored at the MSB end of the register). So swap. */
17218 std::swap (d
->op0
, d
->op1
);
17219 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
17220 to_constant () is safe since this is restricted to Advanced SIMD
17222 location
= d
->perm
.length ().to_constant () - location
;
17225 offset
= GEN_INT (location
);
17226 emit_set_insn (d
->target
,
17227 gen_rtx_UNSPEC (d
->vmode
,
17228 gen_rtvec (3, d
->op0
, d
->op1
, offset
),
17233 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
17234 within each 64-bit, 32-bit or 16-bit granule. */
17237 aarch64_evpc_rev_local (struct expand_vec_perm_d
*d
)
17239 HOST_WIDE_INT diff
;
17240 unsigned int i
, size
, unspec
;
17241 machine_mode pred_mode
;
17243 if (d
->vec_flags
== VEC_SVE_PRED
17244 || !d
->one_vector_p
17245 || !d
->perm
[0].is_constant (&diff
))
17248 size
= (diff
+ 1) * GET_MODE_UNIT_SIZE (d
->vmode
);
17251 unspec
= UNSPEC_REV64
;
17252 pred_mode
= VNx2BImode
;
17254 else if (size
== 4)
17256 unspec
= UNSPEC_REV32
;
17257 pred_mode
= VNx4BImode
;
17259 else if (size
== 2)
17261 unspec
= UNSPEC_REV16
;
17262 pred_mode
= VNx8BImode
;
17267 unsigned int step
= diff
+ 1;
17268 for (i
= 0; i
< step
; ++i
)
17269 if (!d
->perm
.series_p (i
, step
, diff
- i
, step
))
17276 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), unspec
);
17277 if (d
->vec_flags
== VEC_SVE_DATA
)
17279 rtx pred
= aarch64_ptrue_reg (pred_mode
);
17280 src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (2, pred
, src
),
17281 UNSPEC_MERGE_PTRUE
);
17283 emit_set_insn (d
->target
, src
);
17287 /* Recognize patterns for the REV insn, which reverses elements within
17291 aarch64_evpc_rev_global (struct expand_vec_perm_d
*d
)
17293 poly_uint64 nelt
= d
->perm
.length ();
17295 if (!d
->one_vector_p
|| d
->vec_flags
!= VEC_SVE_DATA
)
17298 if (!d
->perm
.series_p (0, 1, nelt
- 1, -1))
17305 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), UNSPEC_REV
);
17306 emit_set_insn (d
->target
, src
);
17311 aarch64_evpc_dup (struct expand_vec_perm_d
*d
)
17313 rtx out
= d
->target
;
17316 machine_mode vmode
= d
->vmode
;
17319 if (d
->vec_flags
== VEC_SVE_PRED
17320 || d
->perm
.encoding ().encoded_nelts () != 1
17321 || !d
->perm
[0].is_constant (&elt
))
17324 if (d
->vec_flags
== VEC_SVE_DATA
&& elt
>= 64 * GET_MODE_UNIT_SIZE (vmode
))
17331 /* The generic preparation in aarch64_expand_vec_perm_const_1
17332 swaps the operand order and the permute indices if it finds
17333 d->perm[0] to be in the second operand. Thus, we can always
17334 use d->op0 and need not do any extra arithmetic to get the
17335 correct lane number. */
17337 lane
= GEN_INT (elt
); /* The pattern corrects for big-endian. */
17339 rtx parallel
= gen_rtx_PARALLEL (vmode
, gen_rtvec (1, lane
));
17340 rtx select
= gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode
), in0
, parallel
);
17341 emit_set_insn (out
, gen_rtx_VEC_DUPLICATE (vmode
, select
));
17346 aarch64_evpc_tbl (struct expand_vec_perm_d
*d
)
17348 rtx rperm
[MAX_COMPILE_TIME_VEC_BYTES
], sel
;
17349 machine_mode vmode
= d
->vmode
;
17351 /* Make sure that the indices are constant. */
17352 unsigned int encoded_nelts
= d
->perm
.encoding ().encoded_nelts ();
17353 for (unsigned int i
= 0; i
< encoded_nelts
; ++i
)
17354 if (!d
->perm
[i
].is_constant ())
17360 /* Generic code will try constant permutation twice. Once with the
17361 original mode and again with the elements lowered to QImode.
17362 So wait and don't do the selector expansion ourselves. */
17363 if (vmode
!= V8QImode
&& vmode
!= V16QImode
)
17366 /* to_constant is safe since this routine is specific to Advanced SIMD
17368 unsigned int nelt
= d
->perm
.length ().to_constant ();
17369 for (unsigned int i
= 0; i
< nelt
; ++i
)
17370 /* If big-endian and two vectors we end up with a weird mixed-endian
17371 mode on NEON. Reverse the index within each word but not the word
17372 itself. to_constant is safe because we checked is_constant above. */
17373 rperm
[i
] = GEN_INT (BYTES_BIG_ENDIAN
17374 ? d
->perm
[i
].to_constant () ^ (nelt
- 1)
17375 : d
->perm
[i
].to_constant ());
17377 sel
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
17378 sel
= force_reg (vmode
, sel
);
17380 aarch64_expand_vec_perm_1 (d
->target
, d
->op0
, d
->op1
, sel
);
17384 /* Try to implement D using an SVE TBL instruction. */
/* NOTE(review): fragment — "testing_p" early exits and the enclosing
   braces are elided in this listing.  */
17387 aarch64_evpc_sve_tbl (struct expand_vec_perm_d
*d
)
17389 unsigned HOST_WIDE_INT nelt
;
17391 /* Permuting two variable-length vectors could overflow the
17393 if (!d
->one_vector_p
&& !d
->perm
.length ().is_constant (&nelt
))
17399 machine_mode sel_mode
= mode_for_int_vector (d
->vmode
).require ();
17400 rtx sel
= vec_perm_indices_to_rtx (sel_mode
, d
->perm
);
/* Single-input permutes use an unspec TBL directly; two-input permutes
   go through the general SVE permute expander.  */
17401 if (d
->one_vector_p
)
17402 emit_unspec2 (d
->target
, UNSPEC_TBL
, d
->op0
, force_reg (sel_mode
, sel
));
17404 aarch64_expand_sve_vec_perm (d
->target
, d
->op0
, d
->op1
, sel
);
/* NOTE(review): fragment of aarch64_expand_vec_perm_const_1 — dispatcher
   that tries the specialised permute expanders in turn and falls back to
   TBL.  "return true" bodies of the else-if chain are elided here.  */
17409 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d
*d
)
17411 /* The pattern matching functions above are written to look for a small
17412 number to begin the sequence (0, 1, N/2). If we begin with an index
17413 from the second operand, we can swap the operands. */
17414 poly_int64 nelt
= d
->perm
.length ();
17415 if (known_ge (d
->perm
[0], nelt
))
17417 d
->perm
.rotate_inputs (1);
17418 std::swap (d
->op0
, d
->op1
);
17421 if ((d
->vec_flags
== VEC_ADVSIMD
17422 || d
->vec_flags
== VEC_SVE_DATA
17423 || d
->vec_flags
== VEC_SVE_PRED
)
17424 && known_gt (nelt
, 1))
/* Try the cheap, pattern-specific expanders first.  */
17426 if (aarch64_evpc_rev_local (d
))
17428 else if (aarch64_evpc_rev_global (d
))
17430 else if (aarch64_evpc_ext (d
))
17432 else if (aarch64_evpc_dup (d
))
17434 else if (aarch64_evpc_zip (d
))
17436 else if (aarch64_evpc_uzp (d
))
17438 else if (aarch64_evpc_trn (d
))
/* Fall back to a general table-based permute.  */
17440 if (d
->vec_flags
== VEC_SVE_DATA
)
17441 return aarch64_evpc_sve_tbl (d
);
17442 else if (d
->vec_flags
== VEC_ADVSIMD
)
17443 return aarch64_evpc_tbl (d
);
17448 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
/* NOTE(review): fragment — assignments of d.vmode/op0/op1/target and the
   testing_p re-expansion path are partially elided in this listing.  */
17451 aarch64_vectorize_vec_perm_const (machine_mode vmode
, rtx target
, rtx op0
,
17452 rtx op1
, const vec_perm_indices
&sel
)
17454 struct expand_vec_perm_d d
;
17456 /* Check whether the mask can be applied to a single vector. */
17457 if (sel
.ninputs () == 1
17458 || (op0
&& rtx_equal_p (op0
, op1
)))
17459 d
.one_vector_p
= true;
17460 else if (sel
.all_from_input_p (0))
17462 d
.one_vector_p
= true;
17465 else if (sel
.all_from_input_p (1))
17467 d
.one_vector_p
= true;
17471 d
.one_vector_p
= false;
17473 d
.perm
.new_vector (sel
.encoding (), d
.one_vector_p
? 1 : 2,
17474 sel
.nelts_per_input ());
17476 d
.vec_flags
= aarch64_classify_vector_mode (d
.vmode
);
/* A null TARGET means we are only testing whether the permute is
   representable, not emitting code.  */
17480 d
.testing_p
= !target
;
17483 return aarch64_expand_vec_perm_const_1 (&d
);
/* When testing, assert that no instructions were actually emitted.  */
17485 rtx_insn
*last
= get_last_insn ();
17486 bool ret
= aarch64_expand_vec_perm_const_1 (&d
);
17487 gcc_assert (last
== get_last_insn ());
17492 /* Generate a byte permute mask for a register of mode MODE,
17493 which has NUNITS units. */
/* NOTE(review): big-endian only (asserted below); builds a V16QI constant
   that reverses the bytes within each element.  */
17496 aarch64_reverse_mask (machine_mode mode
, unsigned int nunits
)
17498 /* We have to reverse each vector because we dont have
17499 a permuted load that can reverse-load according to ABI rules. */
17501 rtvec v
= rtvec_alloc (16);
17503 unsigned int usize
= GET_MODE_UNIT_SIZE (mode
);
17505 gcc_assert (BYTES_BIG_ENDIAN
);
17506 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode
));
/* Element i, byte j gets the mirrored byte index within element i.  */
17508 for (i
= 0; i
< nunits
; i
++)
17509 for (j
= 0; j
< usize
; j
++)
17510 RTVEC_ELT (v
, i
* usize
+ j
) = GEN_INT ((i
+ 1) * usize
- 1 - j
);
17511 mask
= gen_rtx_CONST_VECTOR (V16QImode
, v
);
17512 return force_reg (V16QImode
, mask
);
17515 /* Return true if X is a valid second operand for the SVE instruction
17516 that implements integer comparison OP_CODE. */
/* NOTE(review): fragment — the switch over OP_CODE that selects the
   signed (false) vs unsigned (true) immediate check is elided here.  */
17519 aarch64_sve_cmp_operand_p (rtx_code op_code
, rtx x
)
17521 if (register_operand (x
, VOIDmode
))
17530 return aarch64_sve_cmp_immediate_p (x
, false);
17537 return aarch64_sve_cmp_immediate_p (x
, true);
17539 gcc_unreachable ();
17543 /* Use predicated SVE instructions to implement the equivalent of:
17547 given that PTRUE is an all-true predicate of the appropriate mode. */
17550 aarch64_emit_sve_ptrue_op (rtx target
, rtx ptrue
, rtx op
)
/* Wrap OP in an UNSPEC_MERGE_PTRUE governed by PTRUE and record OP as the
   REG_EQUAL value so later passes can still see the plain comparison.  */
17552 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (target
),
17553 gen_rtvec (2, ptrue
, op
),
17554 UNSPEC_MERGE_PTRUE
);
17555 rtx_insn
*insn
= emit_set_insn (target
, unspec
);
17556 set_unique_reg_note (insn
, REG_EQUAL
, copy_rtx (op
));
17559 /* Likewise, but also clobber the condition codes. */
/* Same as aarch64_emit_sve_ptrue_op, but the emitted pattern also
   clobbers CC (gen_set_clobber_cc_nzc).  */
17562 aarch64_emit_sve_ptrue_op_cc (rtx target
, rtx ptrue
, rtx op
)
17564 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (target
),
17565 gen_rtvec (2, ptrue
, op
),
17566 UNSPEC_MERGE_PTRUE
);
17567 rtx_insn
*insn
= emit_insn (gen_set_clobber_cc_nzc (target
, unspec
));
17568 set_unique_reg_note (insn
, REG_EQUAL
, copy_rtx (op
));
17571 /* Return the UNSPEC_COND_* code for comparison CODE. */
/* NOTE(review): fragment — the switch's case labels (NE, EQ, LT, GT, LE,
   GE, presumably) are elided; only the return statements survive.  */
17573 static unsigned int
17574 aarch64_unspec_cond_code (rtx_code code
)
17579 return UNSPEC_COND_FCMNE
;
17581 return UNSPEC_COND_FCMEQ
;
17583 return UNSPEC_COND_FCMLT
;
17585 return UNSPEC_COND_FCMGT
;
17587 return UNSPEC_COND_FCMLE
;
17589 return UNSPEC_COND_FCMGE
;
17591 gcc_unreachable ();
/* NOTE(review): leading half of this comment block is elided; it
   documents emitting:  */
17597 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
17599 where <X> is the operation associated with comparison CODE. This form
17600 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
17601 semantics, such as when PRED might not be all-true and when comparing
17602 inactive lanes could have side effects. */
17605 aarch64_emit_sve_predicated_cond (rtx target
, rtx_code code
,
17606 rtx pred
, rtx op0
, rtx op1
)
17608 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (pred
),
17609 gen_rtvec (3, pred
, op0
, op1
),
17610 aarch64_unspec_cond_code (code
));
17611 emit_set_insn (target
, unspec
);
17614 /* Expand an SVE integer comparison using the SVE equivalent of:
17616 (set TARGET (CODE OP0 OP1)). */
17619 aarch64_expand_sve_vec_cmp_int (rtx target
, rtx_code code
, rtx op0
, rtx op1
)
17621 machine_mode pred_mode
= GET_MODE (target
);
17622 machine_mode data_mode
= GET_MODE (op0
);
/* Legitimise OP1: if it is not a valid immediate for this comparison,
   load it into a register.  */
17624 if (!aarch64_sve_cmp_operand_p (code
, op1
))
17625 op1
= force_reg (data_mode
, op1
);
17627 rtx ptrue
= aarch64_ptrue_reg (pred_mode
);
17628 rtx cond
= gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
);
17629 aarch64_emit_sve_ptrue_op_cc (target
, ptrue
, cond
);
17632 /* Emit the SVE equivalent of:
17634 (set TMP1 (CODE1 OP0 OP1))
17635 (set TMP2 (CODE2 OP0 OP1))
17636 (set TARGET (ior:PRED_MODE TMP1 TMP2))
17638 PTRUE is an all-true predicate with the same mode as TARGET. */
17641 aarch64_emit_sve_or_conds (rtx target
, rtx_code code1
, rtx_code code2
,
17642 rtx ptrue
, rtx op0
, rtx op1
)
17644 machine_mode pred_mode
= GET_MODE (ptrue
);
/* Evaluate each comparison into its own fresh predicate register.  */
17645 rtx tmp1
= gen_reg_rtx (pred_mode
);
17646 aarch64_emit_sve_ptrue_op (tmp1
, ptrue
,
17647 gen_rtx_fmt_ee (code1
, pred_mode
, op0
, op1
));
17648 rtx tmp2
= gen_reg_rtx (pred_mode
);
17649 aarch64_emit_sve_ptrue_op (tmp2
, ptrue
,
17650 gen_rtx_fmt_ee (code2
, pred_mode
, op0
, op1
));
/* OR the two predicate results into TARGET.  */
17651 aarch64_emit_binop (target
, ior_optab
, tmp1
, tmp2
);
17654 /* Emit the SVE equivalent of:
17656 (set TMP (CODE OP0 OP1))
17657 (set TARGET (not TMP))
17659 PTRUE is an all-true predicate with the same mode as TARGET. */
/* NOTE(review): the OP0/OP1 parameters on the signature's second line
   are elided from this listing.  */
17662 aarch64_emit_sve_inverted_cond (rtx target
, rtx ptrue
, rtx_code code
,
17665 machine_mode pred_mode
= GET_MODE (ptrue
);
17666 rtx tmp
= gen_reg_rtx (pred_mode
);
17667 aarch64_emit_sve_ptrue_op (tmp
, ptrue
,
17668 gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
));
17669 aarch64_emit_unop (target
, one_cmpl_optab
, tmp
);
17672 /* Expand an SVE floating-point comparison using the SVE equivalent of:
17674 (set TARGET (CODE OP0 OP1))
17676 If CAN_INVERT_P is true, the caller can also handle inverted results;
17677 return true if the result is in fact inverted. */
/* NOTE(review): heavily elided fragment — the switch over CODE that
   selects among the paths below (native, LTGT, UNEQ, UNORDERED/ORDERED,
   inverse) is missing, as are the returns and braces.  */
17680 aarch64_expand_sve_vec_cmp_float (rtx target
, rtx_code code
,
17681 rtx op0
, rtx op1
, bool can_invert_p
)
17683 machine_mode pred_mode
= GET_MODE (target
);
17684 machine_mode data_mode
= GET_MODE (op0
);
17686 rtx ptrue
= aarch64_ptrue_reg (pred_mode
);
17690 /* UNORDERED has no immediate form. */
17691 op1
= force_reg (data_mode
, op1
);
17700 /* There is native support for the comparison. */
17701 rtx cond
= gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
);
17702 aarch64_emit_sve_ptrue_op (target
, ptrue
, cond
);
17707 /* This is a trapping operation (LT or GT). */
17708 aarch64_emit_sve_or_conds (target
, LT
, GT
, ptrue
, op0
, op1
);
17712 if (!flag_trapping_math
)
17714 /* This would trap for signaling NaNs. */
17715 op1
= force_reg (data_mode
, op1
);
17716 aarch64_emit_sve_or_conds (target
, UNORDERED
, EQ
, ptrue
, op0
, op1
);
17724 if (flag_trapping_math
)
17726 /* Work out which elements are ordered. */
17727 rtx ordered
= gen_reg_rtx (pred_mode
);
17728 op1
= force_reg (data_mode
, op1
);
17729 aarch64_emit_sve_inverted_cond (ordered
, ptrue
, UNORDERED
, op0
, op1
);
17731 /* Test the opposite condition for the ordered elements,
17732 then invert the result. */
17736 code
= reverse_condition_maybe_unordered (code
);
/* If the caller accepts inverted results, emit directly; otherwise
   compute into TMP and complement into TARGET.  */
17739 aarch64_emit_sve_predicated_cond (target
, code
,
17740 ordered
, op0
, op1
);
17743 rtx tmp
= gen_reg_rtx (pred_mode
);
17744 aarch64_emit_sve_predicated_cond (tmp
, code
, ordered
, op0
, op1
);
17745 aarch64_emit_unop (target
, one_cmpl_optab
, tmp
);
17751 /* ORDERED has no immediate form. */
17752 op1
= force_reg (data_mode
, op1
);
17756 gcc_unreachable ();
17759 /* There is native support for the inverse comparison. */
17760 code
= reverse_condition_maybe_unordered (code
);
17763 rtx cond
= gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
);
17764 aarch64_emit_sve_ptrue_op (target
, ptrue
, cond
);
17767 aarch64_emit_sve_inverted_cond (target
, ptrue
, code
, op0
, op1
);
17771 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
17772 of the data being selected and CMP_MODE is the mode of the values being
/* NOTE(review): end of the comment and the OPS parameter on the
   signature's continuation line are elided in this listing.  */
17776 aarch64_expand_sve_vcond (machine_mode data_mode
, machine_mode cmp_mode
,
17779 machine_mode pred_mode
17780 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode
),
17781 GET_MODE_SIZE (cmp_mode
)).require ();
17782 rtx pred
= gen_reg_rtx (pred_mode
);
17783 if (FLOAT_MODE_P (cmp_mode
))
/* A true return means the FP comparison was inverted, so swap the
   selected values to compensate.  */
17785 if (aarch64_expand_sve_vec_cmp_float (pred
, GET_CODE (ops
[3]),
17786 ops
[4], ops
[5], true))
17787 std::swap (ops
[1], ops
[2]);
17790 aarch64_expand_sve_vec_cmp_int (pred
, GET_CODE (ops
[3]), ops
[4], ops
[5]);
/* Select between ops[1] and ops[2] under PRED via UNSPEC_SEL.  */
17792 rtvec vec
= gen_rtvec (3, pred
, ops
[1], ops
[2]);
17793 emit_set_insn (ops
[0], gen_rtx_UNSPEC (data_mode
, vec
, UNSPEC_SEL
));
17796 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
17797 true. However due to issues with register allocation it is preferable
17798 to avoid tieing integer scalar and FP scalar modes. Executing integer
17799 operations in general registers is better than treating them as scalar
17800 vector operations. This reduces latency and avoids redundant int<->FP
17801 moves. So tie modes if they are either the same class, or vector modes
17802 with other vector modes, vector structs or any scalar mode. */
/* NOTE(review): fragment — the "return true"/"return false" lines are
   elided; only the three conditions survive.  */
17805 aarch64_modes_tieable_p (machine_mode mode1
, machine_mode mode2
)
17807 if (GET_MODE_CLASS (mode1
) == GET_MODE_CLASS (mode2
))
17810 /* We specifically want to allow elements of "structure" modes to
17811 be tieable to the structure. This more general condition allows
17812 other rarer situations too. The reason we don't extend this to
17813 predicate modes is that there are no predicate structure modes
17814 nor any specific instructions for extracting part of a predicate
17816 if (aarch64_vector_data_mode_p (mode1
)
17817 && aarch64_vector_data_mode_p (mode2
))
17820 /* Also allow any scalar modes with vectors. */
17821 if (aarch64_vector_mode_supported_p (mode1
)
17822 || aarch64_vector_mode_supported_p (mode2
))
17828 /* Return a new RTX holding the result of moving POINTER forward by
/* NOTE(review): fragment — "AMOUNT bytes." tail of the comment and the
   final argument of adjust_automodify_address are elided.  */
17832 aarch64_move_pointer (rtx pointer
, poly_int64 amount
)
17834 rtx next
= plus_constant (Pmode
, XEXP (pointer
, 0), amount
);
17836 return adjust_automodify_address (pointer
, GET_MODE (pointer
),
17840 /* Return a new RTX holding the result of moving POINTER forward by the
17841 size of the mode it points to. */
17844 aarch64_progress_pointer (rtx pointer
)
/* Delegates to aarch64_move_pointer with the pointee mode's size.  */
17846 return aarch64_move_pointer (pointer
, GET_MODE_SIZE (GET_MODE (pointer
)));
17849 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
/* NOTE(review): fragment — the MODE parameter on the signature's
   continuation line is elided in this listing.  */
17853 aarch64_copy_one_block_and_progress_pointers (rtx
*src
, rtx
*dst
,
17856 rtx reg
= gen_reg_rtx (mode
);
17858 /* "Cast" the pointers to the correct mode. */
17859 *src
= adjust_address (*src
, mode
, 0);
17860 *dst
= adjust_address (*dst
, mode
, 0);
17861 /* Emit the memcpy. */
17862 emit_move_insn (reg
, *src
);
17863 emit_move_insn (*dst
, reg
);
17864 /* Move the pointers forward. */
17865 *src
= aarch64_progress_pointer (*src
);
17866 *dst
= aarch64_progress_pointer (*dst
);
17869 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
17870 we succeed, otherwise return false. */
/* NOTE(review): elided fragment — the copy loop structure, early returns
   and several declarations (n, base, mode_bits) are missing from this
   listing.  operands[0]=dst MEM, [1]=src MEM, [2]=byte count.  */
17873 aarch64_expand_cpymem (rtx
*operands
)
17876 rtx dst
= operands
[0];
17877 rtx src
= operands
[1];
17879 machine_mode cur_mode
= BLKmode
, next_mode
;
17880 bool speed_p
= !optimize_function_for_size_p (cfun
);
17882 /* When optimizing for size, give a better estimate of the length of a
17883 memcpy call, but use the default otherwise. Moves larger than 8 bytes
17884 will always require an even number of instructions to do now. And each
17885 operation requires both a load+store, so devide the max number by 2. */
17886 int max_num_moves
= (speed_p
? 16 : AARCH64_CALL_RATIO
) / 2;
17888 /* We can't do anything smart if the amount to copy is not constant. */
17889 if (!CONST_INT_P (operands
[2]))
17892 n
= INTVAL (operands
[2]);
17894 /* Try to keep the number of instructions low. For all cases we will do at
17895 most two moves for the residual amount, since we'll always overlap the
17897 if (((n
/ 16) + (n
% 16 ? 2 : 0)) > max_num_moves
)
/* Rebase both MEMs on registers so we can step through them.  */
17900 base
= copy_to_mode_reg (Pmode
, XEXP (dst
, 0));
17901 dst
= adjust_automodify_address (dst
, VOIDmode
, base
, 0);
17903 base
= copy_to_mode_reg (Pmode
, XEXP (src
, 0));
17904 src
= adjust_automodify_address (src
, VOIDmode
, base
, 0);
17906 /* Convert n to bits to make the rest of the code simpler. */
17907 n
= n
* BITS_PER_UNIT
;
17909 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
17910 larger than TImode, but we should not use them for loads/stores here. */
17911 const int copy_limit
= GET_MODE_BITSIZE (TImode
);
17915 /* Find the largest mode in which to do the copy in without over reading
17917 opt_scalar_int_mode mode_iter
;
17918 FOR_EACH_MODE_IN_CLASS (mode_iter
, MODE_INT
)
17919 if (GET_MODE_BITSIZE (mode_iter
.require ()) <= MIN (n
, copy_limit
))
17920 cur_mode
= mode_iter
.require ();
17922 gcc_assert (cur_mode
!= BLKmode
);
17924 mode_bits
= GET_MODE_BITSIZE (cur_mode
).to_constant ();
17925 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, cur_mode
);
17929 /* Do certain trailing copies as overlapping if it's going to be
17930 cheaper. i.e. less instructions to do so. For instance doing a 15
17931 byte copy it's more efficient to do two overlapping 8 byte copies than
17933 if (n
> 0 && n
<= 8 * BITS_PER_UNIT
)
/* Back both pointers up so the final move ends exactly at the tail.  */
17935 next_mode
= smallest_mode_for_size (n
, MODE_INT
);
17936 int n_bits
= GET_MODE_BITSIZE (next_mode
).to_constant ();
17937 src
= aarch64_move_pointer (src
, (n
- n_bits
) / BITS_PER_UNIT
);
17938 dst
= aarch64_move_pointer (dst
, (n
- n_bits
) / BITS_PER_UNIT
);
17946 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
17947 SImode stores. Handle the case when the constant has identical
17948 bottom and top halves. This is beneficial when the two stores can be
17949 merged into an STP and we avoid synthesising potentially expensive
17950 immediates twice. Return true if such a split is possible. */
/* NOTE(review): fragment — returns and the speed-path condition's first
   half (around line 17985) are partially elided.  */
17953 aarch64_split_dimode_const_store (rtx dst
, rtx src
)
17955 rtx lo
= gen_lowpart (SImode
, src
);
17956 rtx hi
= gen_highpart_mode (SImode
, DImode
, src
);
17958 bool size_p
= optimize_function_for_size_p (cfun
);
/* Only profitable when both halves are identical.  */
17960 if (!rtx_equal_p (lo
, hi
))
17963 unsigned int orig_cost
17964 = aarch64_internal_mov_immediate (NULL_RTX
, src
, false, DImode
);
17965 unsigned int lo_cost
17966 = aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, SImode
);
17968 /* We want to transform:
17970 MOVK x1, 0x140, lsl 16
17971 MOVK x1, 0xc0da, lsl 32
17972 MOVK x1, 0x140, lsl 48
17976 MOVK w1, 0x140, lsl 16
17978 So we want to perform this only when we save two instructions
17979 or more. When optimizing for size, however, accept any code size
17981 if (size_p
&& orig_cost
<= lo_cost
)
17985 && (orig_cost
<= lo_cost
+ 1))
17988 rtx mem_lo
= adjust_address (dst
, SImode
, 0);
/* The low half must be a valid STP-pair address.  */
17989 if (!aarch64_mem_pair_operand (mem_lo
, SImode
))
17992 rtx tmp_reg
= gen_reg_rtx (SImode
);
17993 aarch64_expand_mov_immediate (tmp_reg
, lo
);
17994 rtx mem_hi
= aarch64_move_pointer (mem_lo
, GET_MODE_SIZE (SImode
));
17995 /* Don't emit an explicit store pair as this may not be always profitable.
17996 Let the sched-fusion logic decide whether to merge them. */
17997 emit_move_insn (mem_lo
, tmp_reg
);
17998 emit_move_insn (mem_hi
, tmp_reg
);
18003 /* Generate RTL for a conditional branch with rtx comparison CODE in
18004 mode CC_MODE. The destination of the unlikely conditional branch
/* NOTE(review): fragment — the LABEL_REF parameter and the pc_rtx
   else-arm of the IF_THEN_ELSE are elided in this listing.  */
18008 aarch64_gen_unlikely_cbranch (enum rtx_code code
, machine_mode cc_mode
,
18012 x
= gen_rtx_fmt_ee (code
, VOIDmode
,
18013 gen_rtx_REG (cc_mode
, CC_REGNUM
),
18016 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
18017 gen_rtx_LABEL_REF (VOIDmode
, label_ref
),
18019 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
18022 /* Generate DImode scratch registers for 128-bit (TImode) addition.
18024 OP1 represents the TImode destination operand 1
18025 OP2 represents the TImode destination operand 2
18026 LOW_DEST represents the low half (DImode) of TImode operand 0
18027 LOW_IN1 represents the low half (DImode) of TImode operand 1
18028 LOW_IN2 represents the low half (DImode) of TImode operand 2
18029 HIGH_DEST represents the high half (DImode) of TImode operand 0
18030 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18031 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
/* NOTE(review): the final HIGH_IN2 parameter line of the signature is
   elided in this listing.  */
18034 aarch64_addti_scratch_regs (rtx op1
, rtx op2
, rtx
*low_dest
,
18035 rtx
*low_in1
, rtx
*low_in2
,
18036 rtx
*high_dest
, rtx
*high_in1
,
18039 *low_dest
= gen_reg_rtx (DImode
);
18040 *low_in1
= gen_lowpart (DImode
, op1
);
18041 *low_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
18042 subreg_lowpart_offset (DImode
, TImode
));
18043 *high_dest
= gen_reg_rtx (DImode
);
18044 *high_in1
= gen_highpart (DImode
, op1
);
18045 *high_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
18046 subreg_highpart_offset (DImode
, TImode
));
18049 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
18051 This function differs from 'arch64_addti_scratch_regs' in that
18052 OP1 can be an immediate constant (zero). We must call
18053 subreg_highpart_offset with DImode and TImode arguments, otherwise
18054 VOIDmode will be used for the const_int which generates an internal
18055 error from subreg_size_highpart_offset which does not expect a size of zero.
18057 OP1 represents the TImode destination operand 1
18058 OP2 represents the TImode destination operand 2
18059 LOW_DEST represents the low half (DImode) of TImode operand 0
18060 LOW_IN1 represents the low half (DImode) of TImode operand 1
18061 LOW_IN2 represents the low half (DImode) of TImode operand 2
18062 HIGH_DEST represents the high half (DImode) of TImode operand 0
18063 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18064 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
/* NOTE(review): the final HIGH_IN2 parameter line of the signature is
   elided in this listing.  */
18068 aarch64_subvti_scratch_regs (rtx op1
, rtx op2
, rtx
*low_dest
,
18069 rtx
*low_in1
, rtx
*low_in2
,
18070 rtx
*high_dest
, rtx
*high_in1
,
18073 *low_dest
= gen_reg_rtx (DImode
);
18074 *low_in1
= simplify_gen_subreg (DImode
, op1
, TImode
,
18075 subreg_lowpart_offset (DImode
, TImode
));
18077 *low_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
18078 subreg_lowpart_offset (DImode
, TImode
));
18079 *high_dest
= gen_reg_rtx (DImode
);
18081 *high_in1
= simplify_gen_subreg (DImode
, op1
, TImode
,
18082 subreg_highpart_offset (DImode
, TImode
));
18083 *high_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
18084 subreg_highpart_offset (DImode
, TImode
));
18087 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
18089 OP0 represents the TImode destination operand 0
18090 LOW_DEST represents the low half (DImode) of TImode operand 0
18091 LOW_IN1 represents the low half (DImode) of TImode operand 1
18092 LOW_IN2 represents the low half (DImode) of TImode operand 2
18093 HIGH_DEST represents the high half (DImode) of TImode operand 0
18094 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18095 HIGH_IN2 represents the high half (DImode) of TImode operand 2
18096 UNSIGNED_P is true if the operation is being performed on unsigned
/* NOTE(review): fragment — the if/else structure selecting between the
   unsigned (carry-clobber) and signed (overflow) variants is elided.  */
18099 aarch64_expand_subvti (rtx op0
, rtx low_dest
, rtx low_in1
,
18100 rtx low_in2
, rtx high_dest
, rtx high_in1
,
18101 rtx high_in2
, bool unsigned_p
)
/* A zero low half lets us skip the low-part subtraction entirely.  */
18103 if (low_in2
== const0_rtx
)
18105 low_dest
= low_in1
;
18106 high_in2
= force_reg (DImode
, high_in2
);
18108 emit_insn (gen_subdi3_compare1 (high_dest
, high_in1
, high_in2
));
18110 emit_insn (gen_subvdi_insn (high_dest
, high_in1
, high_in2
));
18114 if (CONST_INT_P (low_in2
))
18116 high_in2
= force_reg (DImode
, high_in2
);
18117 emit_insn (gen_subdi3_compare1_imm (low_dest
, low_in1
, low_in2
,
18118 GEN_INT (-INTVAL (low_in2
))));
18121 emit_insn (gen_subdi3_compare1 (low_dest
, low_in1
, low_in2
));
18124 emit_insn (gen_usubdi3_carryinC (high_dest
, high_in1
, high_in2
));
18126 emit_insn (gen_subdi3_carryinV (high_dest
, high_in1
, high_in2
));
/* Write both DImode halves back into the TImode destination.  */
18129 emit_move_insn (gen_lowpart (DImode
, op0
), low_dest
);
18130 emit_move_insn (gen_highpart (DImode
, op0
), high_dest
);
18134 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
/* NOTE(review): fragment — the condition choosing 1<<29 vs 1<<36
   (presumably ILP32 vs LP64) is elided.  */
18136 static unsigned HOST_WIDE_INT
18137 aarch64_asan_shadow_offset (void)
18140 return (HOST_WIDE_INT_1
<< 29);
18142 return (HOST_WIDE_INT_1
<< 36);
/* NOTE(review): fragment of aarch64_gen_ccmp_first (TARGET_GEN_CCMP_FIRST
   hook) — emits the first compare of a conditional-compare chain.  The
   switch over the operand mode that selects ICODE and the sequence
   start/end bookkeeping are elided.  Returns the CC-register comparison
   rtx for the next link in the chain.  */
18146 aarch64_gen_ccmp_first (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
,
18147 int code
, tree treeop0
, tree treeop1
)
18149 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
18151 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
18153 struct expand_operand ops
[4];
18156 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
18158 op_mode
= GET_MODE (op0
);
18159 if (op_mode
== VOIDmode
)
18160 op_mode
= GET_MODE (op1
);
18168 icode
= CODE_FOR_cmpsi
;
18173 icode
= CODE_FOR_cmpdi
;
18178 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
18179 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpesf
: CODE_FOR_fcmpsf
;
18184 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
18185 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpedf
: CODE_FOR_fcmpdf
;
18193 op0
= prepare_operand (icode
, op0
, 0, op_mode
, cmp_mode
, unsignedp
);
18194 op1
= prepare_operand (icode
, op1
, 1, op_mode
, cmp_mode
, unsignedp
);
18200 *prep_seq
= get_insns ();
18203 create_fixed_operand (&ops
[0], op0
);
18204 create_fixed_operand (&ops
[1], op1
);
18207 if (!maybe_expand_insn (icode
, 2, ops
))
18212 *gen_seq
= get_insns ();
18215 return gen_rtx_fmt_ee ((rtx_code
) code
, cc_mode
,
18216 gen_rtx_REG (cc_mode
, CC_REGNUM
), const0_rtx
);
/* NOTE(review): fragment of aarch64_gen_ccmp_next (TARGET_GEN_CCMP_NEXT
   hook) — emits a subsequent conditional compare (CCMP/FCCMP) predicated
   on PREV.  The mode switch selecting ICODE and sequence bookkeeping are
   elided, as in aarch64_gen_ccmp_first.  */
18220 aarch64_gen_ccmp_next (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
, rtx prev
,
18221 int cmp_code
, tree treeop0
, tree treeop1
, int bit_code
)
18223 rtx op0
, op1
, target
;
18224 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
18225 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
18227 struct expand_operand ops
[6];
18230 push_to_sequence (*prep_seq
);
18231 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
18233 op_mode
= GET_MODE (op0
);
18234 if (op_mode
== VOIDmode
)
18235 op_mode
= GET_MODE (op1
);
18243 icode
= CODE_FOR_ccmpsi
;
18248 icode
= CODE_FOR_ccmpdi
;
18253 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
18254 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpesf
: CODE_FOR_fccmpsf
;
18259 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
18260 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpedf
: CODE_FOR_fccmpdf
;
18268 op0
= prepare_operand (icode
, op0
, 2, op_mode
, cmp_mode
, unsignedp
);
18269 op1
= prepare_operand (icode
, op1
, 3, op_mode
, cmp_mode
, unsignedp
);
18275 *prep_seq
= get_insns ();
18278 target
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
18279 aarch64_cond
= aarch64_get_condition_code_1 (cc_mode
, (rtx_code
) cmp_code
);
/* For an OR chain, invert the previous comparison and its AArch64
   condition so the ccmp short-circuits correctly.  */
18281 if (bit_code
!= AND
)
18283 prev
= gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev
),
18284 GET_MODE (XEXP (prev
, 0))),
18285 VOIDmode
, XEXP (prev
, 0), const0_rtx
);
18286 aarch64_cond
= AARCH64_INVERSE_CONDITION_CODE (aarch64_cond
);
18289 create_fixed_operand (&ops
[0], XEXP (prev
, 0));
18290 create_fixed_operand (&ops
[1], target
);
18291 create_fixed_operand (&ops
[2], op0
);
18292 create_fixed_operand (&ops
[3], op1
);
18293 create_fixed_operand (&ops
[4], prev
);
18294 create_fixed_operand (&ops
[5], GEN_INT (aarch64_cond
));
18296 push_to_sequence (*gen_seq
);
18297 if (!maybe_expand_insn (icode
, 6, ops
))
18303 *gen_seq
= get_insns ();
18306 return gen_rtx_fmt_ee ((rtx_code
) cmp_code
, VOIDmode
, target
, const0_rtx
);
/* Register the two functions above as the conditional-compare hooks.  */
18309 #undef TARGET_GEN_CCMP_FIRST
18310 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
18312 #undef TARGET_GEN_CCMP_NEXT
18313 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
18315 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
18316 instruction fusion of some sort. */
18319 aarch64_macro_fusion_p (void)
/* True when the current tuning enables at least one fusion pair.  */
18321 return aarch64_tune_params
.fusible_ops
!= AARCH64_FUSE_NOTHING
;
18325 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
18326 should be kept together during scheduling. */
/* NOTE(review): heavily elided fragment — the "return true/false" lines
   inside each fusion case and several braces are missing from this
   listing.  Each guarded section recognises one fusible pair.  */
18329 aarch_macro_fusion_pair_p (rtx_insn
*prev
, rtx_insn
*curr
)
18332 rtx prev_set
= single_set (prev
);
18333 rtx curr_set
= single_set (curr
);
18334 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
18335 bool simple_sets_p
= prev_set
&& curr_set
&& !any_condjump_p (curr
);
18337 if (!aarch64_macro_fusion_p ())
18340 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK
))
18342 /* We are trying to match:
18343 prev (mov) == (set (reg r0) (const_int imm16))
18344 curr (movk) == (set (zero_extract (reg r0)
18347 (const_int imm16_1)) */
18349 set_dest
= SET_DEST (curr_set
);
18351 if (GET_CODE (set_dest
) == ZERO_EXTRACT
18352 && CONST_INT_P (SET_SRC (curr_set
))
18353 && CONST_INT_P (SET_SRC (prev_set
))
18354 && CONST_INT_P (XEXP (set_dest
, 2))
18355 && INTVAL (XEXP (set_dest
, 2)) == 16
18356 && REG_P (XEXP (set_dest
, 0))
18357 && REG_P (SET_DEST (prev_set
))
18358 && REGNO (XEXP (set_dest
, 0)) == REGNO (SET_DEST (prev_set
)))
18364 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD
))
18367 /* We're trying to match:
18368 prev (adrp) == (set (reg r1)
18369 (high (symbol_ref ("SYM"))))
18370 curr (add) == (set (reg r0)
18372 (symbol_ref ("SYM"))))
18373 Note that r0 need not necessarily be the same as r1, especially
18374 during pre-regalloc scheduling. */
18376 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
18377 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
18379 if (GET_CODE (SET_SRC (curr_set
)) == LO_SUM
18380 && REG_P (XEXP (SET_SRC (curr_set
), 0))
18381 && REGNO (XEXP (SET_SRC (curr_set
), 0))
18382 == REGNO (SET_DEST (prev_set
))
18383 && rtx_equal_p (XEXP (SET_SRC (prev_set
), 0),
18384 XEXP (SET_SRC (curr_set
), 1)))
18389 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK
))
18392 /* We're trying to match:
18393 prev (movk) == (set (zero_extract (reg r0)
18396 (const_int imm16_1))
18397 curr (movk) == (set (zero_extract (reg r0)
18400 (const_int imm16_2)) */
18402 if (GET_CODE (SET_DEST (prev_set
)) == ZERO_EXTRACT
18403 && GET_CODE (SET_DEST (curr_set
)) == ZERO_EXTRACT
18404 && REG_P (XEXP (SET_DEST (prev_set
), 0))
18405 && REG_P (XEXP (SET_DEST (curr_set
), 0))
18406 && REGNO (XEXP (SET_DEST (prev_set
), 0))
18407 == REGNO (XEXP (SET_DEST (curr_set
), 0))
18408 && CONST_INT_P (XEXP (SET_DEST (prev_set
), 2))
18409 && CONST_INT_P (XEXP (SET_DEST (curr_set
), 2))
18410 && INTVAL (XEXP (SET_DEST (prev_set
), 2)) == 32
18411 && INTVAL (XEXP (SET_DEST (curr_set
), 2)) == 48
18412 && CONST_INT_P (SET_SRC (prev_set
))
18413 && CONST_INT_P (SET_SRC (curr_set
)))
18417 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR
))
18419 /* We're trying to match:
18420 prev (adrp) == (set (reg r0)
18421 (high (symbol_ref ("SYM"))))
18422 curr (ldr) == (set (reg r1)
18423 (mem (lo_sum (reg r0)
18424 (symbol_ref ("SYM")))))
18426 curr (ldr) == (set (reg r1)
18429 (symbol_ref ("SYM")))))) */
18430 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
18431 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
18433 rtx curr_src
= SET_SRC (curr_set
);
/* Look through a possible zero-extend of the load.  */
18435 if (GET_CODE (curr_src
) == ZERO_EXTEND
)
18436 curr_src
= XEXP (curr_src
, 0);
18438 if (MEM_P (curr_src
) && GET_CODE (XEXP (curr_src
, 0)) == LO_SUM
18439 && REG_P (XEXP (XEXP (curr_src
, 0), 0))
18440 && REGNO (XEXP (XEXP (curr_src
, 0), 0))
18441 == REGNO (SET_DEST (prev_set
))
18442 && rtx_equal_p (XEXP (XEXP (curr_src
, 0), 1),
18443 XEXP (SET_SRC (prev_set
), 0)))
18448 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH
)
18449 && any_condjump_p (curr
))
18451 unsigned int condreg1
, condreg2
;
18453 aarch64_fixed_condition_code_regs (&condreg1
, &condreg2
);
18454 cc_reg_1
= gen_rtx_REG (CCmode
, condreg1
);
18456 if (reg_referenced_p (cc_reg_1
, PATTERN (curr
))
18458 && modified_in_p (cc_reg_1
, prev
))
18460 enum attr_type prev_type
= get_attr_type (prev
);
18462 /* FIXME: this misses some which is considered simple arthematic
18463 instructions for ThunderX. Simple shifts are missed here. */
18464 if (prev_type
== TYPE_ALUS_SREG
18465 || prev_type
== TYPE_ALUS_IMM
18466 || prev_type
== TYPE_LOGICS_REG
18467 || prev_type
== TYPE_LOGICS_IMM
)
18474 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH
)
18475 && any_condjump_p (curr
))
18477 /* We're trying to match:
18478 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
18479 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
18481 (label_ref ("SYM"))
18483 if (SET_DEST (curr_set
) == (pc_rtx
)
18484 && GET_CODE (SET_SRC (curr_set
)) == IF_THEN_ELSE
18485 && REG_P (XEXP (XEXP (SET_SRC (curr_set
), 0), 0))
18486 && REG_P (SET_DEST (prev_set
))
18487 && REGNO (SET_DEST (prev_set
))
18488 == REGNO (XEXP (XEXP (SET_SRC (curr_set
), 0), 0)))
18490 /* Fuse ALU operations followed by conditional branch instruction. */
18491 switch (get_attr_type (prev
))
18494 case TYPE_ALU_SREG
:
18497 case TYPE_ADCS_REG
:
18498 case TYPE_ADCS_IMM
:
18499 case TYPE_LOGIC_REG
:
18500 case TYPE_LOGIC_IMM
:
18504 case TYPE_SHIFT_REG
:
18505 case TYPE_SHIFT_IMM
:
18520 /* Return true iff the instruction fusion described by OP is enabled. */
18523 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op
)
/* OP is a bit in the tuning structure's fusible_ops bitmask.  */
18525 return (aarch64_tune_params
.fusible_ops
& op
) != 0;
18528 /* If MEM is in the form of [base+offset], extract the two parts
18529 of address and set to BASE and OFFSET, otherwise return false
18530 after clearing BASE and OFFSET. */
/* NOTE(review): fragment — the plain-REG case setting *base and the
   return statements are partially elided.  */
18533 extract_base_offset_in_addr (rtx mem
, rtx
*base
, rtx
*offset
)
18537 gcc_assert (MEM_P (mem
));
18539 addr
= XEXP (mem
, 0);
/* Bare register address: offset is zero.  */
18544 *offset
= const0_rtx
;
18548 if (GET_CODE (addr
) == PLUS
18549 && REG_P (XEXP (addr
, 0)) && CONST_INT_P (XEXP (addr
, 1)))
18551 *base
= XEXP (addr
, 0);
18552 *offset
= XEXP (addr
, 1);
/* Unsupported address form: clear the outputs.  */
18557 *offset
= NULL_RTX
;
18563 enum sched_fusion_type
18565 SCHED_FUSION_NONE
= 0,
18566 SCHED_FUSION_LD_SIGN_EXTEND
,
18567 SCHED_FUSION_LD_ZERO_EXTEND
,
18573 /* If INSN is a load or store of address in the form of [base+offset],
18574 extract the two parts and set to BASE and OFFSET. Return scheduling
18575 fusion type this INSN is. */
/* NOTE(review): fragment — SET_SRC assignment, braces and the final
   "return fusion" are elided from this listing.  */
18577 static enum sched_fusion_type
18578 fusion_load_store (rtx_insn
*insn
, rtx
*base
, rtx
*offset
)
18581 enum sched_fusion_type fusion
= SCHED_FUSION_LD
;
18583 gcc_assert (INSN_P (insn
));
18584 x
= PATTERN (insn
);
18585 if (GET_CODE (x
) != SET
)
18586 return SCHED_FUSION_NONE
;
18589 dest
= SET_DEST (x
);
18591 machine_mode dest_mode
= GET_MODE (dest
);
18593 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode
))
18594 return SCHED_FUSION_NONE
;
/* Classify sign/zero-extending SImode loads specially; only plain
   SImode MEM sources qualify.  */
18596 if (GET_CODE (src
) == SIGN_EXTEND
)
18598 fusion
= SCHED_FUSION_LD_SIGN_EXTEND
;
18599 src
= XEXP (src
, 0);
18600 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
18601 return SCHED_FUSION_NONE
;
18603 else if (GET_CODE (src
) == ZERO_EXTEND
)
18605 fusion
= SCHED_FUSION_LD_ZERO_EXTEND
;
18606 src
= XEXP (src
, 0);
18607 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
18608 return SCHED_FUSION_NONE
;
18611 if (GET_CODE (src
) == MEM
&& REG_P (dest
))
18612 extract_base_offset_in_addr (src
, base
, offset
);
18613 else if (GET_CODE (dest
) == MEM
&& (REG_P (src
) || src
== const0_rtx
))
18615 fusion
= SCHED_FUSION_ST
;
18616 extract_base_offset_in_addr (dest
, base
, offset
);
18619 return SCHED_FUSION_NONE
;
18621 if (*base
== NULL_RTX
|| *offset
== NULL_RTX
)
18622 fusion
= SCHED_FUSION_NONE
;
18627 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
18629 Currently we only support to fuse ldr or str instructions, so FUSION_PRI
18630 and PRI are only calculated for these instructions. For other instruction,
18631 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
18632 type instruction fusion can be added by returning different priorities.
18634 It's important that irrelevant instructions get the largest FUSION_PRI. */
18637 aarch64_sched_fusion_priority (rtx_insn
*insn
, int max_pri
,
18638 int *fusion_pri
, int *pri
)
18642 enum sched_fusion_type fusion
;
18644 gcc_assert (INSN_P (insn
));
18647 fusion
= fusion_load_store (insn
, &base
, &offset
);
18648 if (fusion
== SCHED_FUSION_NONE
)
18655 /* Set FUSION_PRI according to fusion type and base register. */
18656 *fusion_pri
= tmp
- fusion
* FIRST_PSEUDO_REGISTER
- REGNO (base
);
18658 /* Calculate PRI. */
18661 /* INSN with smaller offset goes first. */
18662 off_val
= (int)(INTVAL (offset
));
18664 tmp
-= (off_val
& 0xfffff);
18666 tmp
+= ((- off_val
) & 0xfffff);
18672 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
18673 Adjust priority of sha1h instructions so they are scheduled before
18674 other SHA1 instructions. */
18677 aarch64_sched_adjust_priority (rtx_insn
*insn
, int priority
)
18679 rtx x
= PATTERN (insn
);
18681 if (GET_CODE (x
) == SET
)
18685 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_SHA1H
)
18686 return priority
+ 10;
18692 /* Given OPERANDS of consecutive load/store, check if we can merge
18693 them into ldp/stp. LOAD is true if they are load instructions.
18694 MODE is the mode of memory operands. */
18697 aarch64_operands_ok_for_ldpstp (rtx
*operands
, bool load
,
18700 HOST_WIDE_INT offval_1
, offval_2
, msize
;
18701 enum reg_class rclass_1
, rclass_2
;
18702 rtx mem_1
, mem_2
, reg_1
, reg_2
, base_1
, base_2
, offset_1
, offset_2
;
18706 mem_1
= operands
[1];
18707 mem_2
= operands
[3];
18708 reg_1
= operands
[0];
18709 reg_2
= operands
[2];
18710 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
));
18711 if (REGNO (reg_1
) == REGNO (reg_2
))
18716 mem_1
= operands
[0];
18717 mem_2
= operands
[2];
18718 reg_1
= operands
[1];
18719 reg_2
= operands
[3];
18722 /* The mems cannot be volatile. */
18723 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
))
18726 /* If we have SImode and slow unaligned ldp,
18727 check the alignment to be at least 8 byte. */
18729 && (aarch64_tune_params
.extra_tuning_flags
18730 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
18732 && MEM_ALIGN (mem_1
) < 8 * BITS_PER_UNIT
)
18735 /* Check if the addresses are in the form of [base+offset]. */
18736 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
18737 if (base_1
== NULL_RTX
|| offset_1
== NULL_RTX
)
18739 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
18740 if (base_2
== NULL_RTX
|| offset_2
== NULL_RTX
)
18743 /* Check if the bases are same. */
18744 if (!rtx_equal_p (base_1
, base_2
))
18747 /* The operands must be of the same size. */
18748 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1
)),
18749 GET_MODE_SIZE (GET_MODE (mem_2
))));
18751 offval_1
= INTVAL (offset_1
);
18752 offval_2
= INTVAL (offset_2
);
18753 /* We should only be trying this for fixed-sized modes. There is no
18754 SVE LDP/STP instruction. */
18755 msize
= GET_MODE_SIZE (mode
).to_constant ();
18756 /* Check if the offsets are consecutive. */
18757 if (offval_1
!= (offval_2
+ msize
) && offval_2
!= (offval_1
+ msize
))
18760 /* Check if the addresses are clobbered by load. */
18763 if (reg_mentioned_p (reg_1
, mem_1
))
18766 /* In increasing order, the last load can clobber the address. */
18767 if (offval_1
> offval_2
&& reg_mentioned_p (reg_2
, mem_2
))
18771 /* One of the memory accesses must be a mempair operand.
18772 If it is not the first one, they need to be swapped by the
18774 if (!aarch64_mem_pair_operand (mem_1
, GET_MODE (mem_1
))
18775 && !aarch64_mem_pair_operand (mem_2
, GET_MODE (mem_2
)))
18778 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
18779 rclass_1
= FP_REGS
;
18781 rclass_1
= GENERAL_REGS
;
18783 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
18784 rclass_2
= FP_REGS
;
18786 rclass_2
= GENERAL_REGS
;
18788 /* Check if the registers are of same class. */
18789 if (rclass_1
!= rclass_2
)
18795 /* Given OPERANDS of consecutive load/store that can be merged,
18796 swap them if they are not in ascending order. */
18798 aarch64_swap_ldrstr_operands (rtx
* operands
, bool load
)
18800 rtx mem_1
, mem_2
, base_1
, base_2
, offset_1
, offset_2
;
18801 HOST_WIDE_INT offval_1
, offval_2
;
18805 mem_1
= operands
[1];
18806 mem_2
= operands
[3];
18810 mem_1
= operands
[0];
18811 mem_2
= operands
[2];
18814 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
18815 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
18817 offval_1
= INTVAL (offset_1
);
18818 offval_2
= INTVAL (offset_2
);
18820 if (offval_1
> offval_2
)
18822 /* Irrespective of whether this is a load or a store,
18823 we do the same swap. */
18824 std::swap (operands
[0], operands
[2]);
18825 std::swap (operands
[1], operands
[3]);
18829 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
18830 comparison between the two. */
18832 aarch64_host_wide_int_compare (const void *x
, const void *y
)
18834 return wi::cmps (* ((const HOST_WIDE_INT
*) x
),
18835 * ((const HOST_WIDE_INT
*) y
));
18838 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
18839 other pointing to a REG rtx containing an offset, compare the offsets
18844 1 iff offset (X) > offset (Y)
18845 0 iff offset (X) == offset (Y)
18846 -1 iff offset (X) < offset (Y) */
18848 aarch64_ldrstr_offset_compare (const void *x
, const void *y
)
18850 const rtx
* operands_1
= (const rtx
*) x
;
18851 const rtx
* operands_2
= (const rtx
*) y
;
18852 rtx mem_1
, mem_2
, base
, offset_1
, offset_2
;
18854 if (MEM_P (operands_1
[0]))
18855 mem_1
= operands_1
[0];
18857 mem_1
= operands_1
[1];
18859 if (MEM_P (operands_2
[0]))
18860 mem_2
= operands_2
[0];
18862 mem_2
= operands_2
[1];
18864 /* Extract the offsets. */
18865 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
18866 extract_base_offset_in_addr (mem_2
, &base
, &offset_2
);
18868 gcc_assert (offset_1
!= NULL_RTX
&& offset_2
!= NULL_RTX
);
18870 return wi::cmps (INTVAL (offset_1
), INTVAL (offset_2
));
18873 /* Given OPERANDS of consecutive load/store, check if we can merge
18874 them into ldp/stp by adjusting the offset. LOAD is true if they
18875 are load instructions. MODE is the mode of memory operands.
18877 Given below consecutive stores:
18879 str w1, [xb, 0x100]
18880 str w1, [xb, 0x104]
18881 str w1, [xb, 0x108]
18882 str w1, [xb, 0x10c]
18884 Though the offsets are out of the range supported by stp, we can
18885 still pair them after adjusting the offset, like:
18887 add scratch, xb, 0x100
18888 stp w1, w1, [scratch]
18889 stp w1, w1, [scratch, 0x8]
18891 The peephole patterns detecting this opportunity should guarantee
18892 the scratch register is avaliable. */
18895 aarch64_operands_adjust_ok_for_ldpstp (rtx
*operands
, bool load
,
18898 const int num_insns
= 4;
18899 enum reg_class rclass
;
18900 HOST_WIDE_INT offvals
[num_insns
], msize
;
18901 rtx mem
[num_insns
], reg
[num_insns
], base
[num_insns
], offset
[num_insns
];
18905 for (int i
= 0; i
< num_insns
; i
++)
18907 reg
[i
] = operands
[2 * i
];
18908 mem
[i
] = operands
[2 * i
+ 1];
18910 gcc_assert (REG_P (reg
[i
]));
18913 /* Do not attempt to merge the loads if the loads clobber each other. */
18914 for (int i
= 0; i
< 8; i
+= 2)
18915 for (int j
= i
+ 2; j
< 8; j
+= 2)
18916 if (reg_overlap_mentioned_p (operands
[i
], operands
[j
]))
18920 for (int i
= 0; i
< num_insns
; i
++)
18922 mem
[i
] = operands
[2 * i
];
18923 reg
[i
] = operands
[2 * i
+ 1];
18926 /* Skip if memory operand is by itself valid for ldp/stp. */
18927 if (!MEM_P (mem
[0]) || aarch64_mem_pair_operand (mem
[0], mode
))
18930 for (int i
= 0; i
< num_insns
; i
++)
18932 /* The mems cannot be volatile. */
18933 if (MEM_VOLATILE_P (mem
[i
]))
18936 /* Check if the addresses are in the form of [base+offset]. */
18937 extract_base_offset_in_addr (mem
[i
], base
+ i
, offset
+ i
);
18938 if (base
[i
] == NULL_RTX
|| offset
[i
] == NULL_RTX
)
18942 /* Check if the registers are of same class. */
18943 rclass
= REG_P (reg
[0]) && FP_REGNUM_P (REGNO (reg
[0]))
18944 ? FP_REGS
: GENERAL_REGS
;
18946 for (int i
= 1; i
< num_insns
; i
++)
18947 if (REG_P (reg
[i
]) && FP_REGNUM_P (REGNO (reg
[i
])))
18949 if (rclass
!= FP_REGS
)
18954 if (rclass
!= GENERAL_REGS
)
18958 /* Only the last register in the order in which they occur
18959 may be clobbered by the load. */
18960 if (rclass
== GENERAL_REGS
&& load
)
18961 for (int i
= 0; i
< num_insns
- 1; i
++)
18962 if (reg_mentioned_p (reg
[i
], mem
[i
]))
18965 /* Check if the bases are same. */
18966 for (int i
= 0; i
< num_insns
- 1; i
++)
18967 if (!rtx_equal_p (base
[i
], base
[i
+ 1]))
18970 for (int i
= 0; i
< num_insns
; i
++)
18971 offvals
[i
] = INTVAL (offset
[i
]);
18973 msize
= GET_MODE_SIZE (mode
);
18975 /* Check if the offsets can be put in the right order to do a ldp/stp. */
18976 qsort (offvals
, num_insns
, sizeof (HOST_WIDE_INT
),
18977 aarch64_host_wide_int_compare
);
18979 if (!(offvals
[1] == offvals
[0] + msize
18980 && offvals
[3] == offvals
[2] + msize
))
18983 /* Check that offsets are within range of each other. The ldp/stp
18984 instructions have 7 bit immediate offsets, so use 0x80. */
18985 if (offvals
[2] - offvals
[0] >= msize
* 0x80)
18988 /* The offsets must be aligned with respect to each other. */
18989 if (offvals
[0] % msize
!= offvals
[2] % msize
)
18992 /* If we have SImode and slow unaligned ldp,
18993 check the alignment to be at least 8 byte. */
18995 && (aarch64_tune_params
.extra_tuning_flags
18996 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
18998 && MEM_ALIGN (mem
[0]) < 8 * BITS_PER_UNIT
)
19004 /* Given OPERANDS of consecutive load/store, this function pairs them
19005 into LDP/STP after adjusting the offset. It depends on the fact
19006 that the operands can be sorted so the offsets are correct for STP.
19007 MODE is the mode of memory operands. CODE is the rtl operator
19008 which should be applied to all memory operands, it's SIGN_EXTEND,
19009 ZERO_EXTEND or UNKNOWN. */
19012 aarch64_gen_adjusted_ldpstp (rtx
*operands
, bool load
,
19013 scalar_mode mode
, RTX_CODE code
)
19015 rtx base
, offset_1
, offset_3
, t1
, t2
;
19016 rtx mem_1
, mem_2
, mem_3
, mem_4
;
19017 rtx temp_operands
[8];
19018 HOST_WIDE_INT off_val_1
, off_val_3
, base_off
, new_off_1
, new_off_3
,
19019 stp_off_upper_limit
, stp_off_lower_limit
, msize
;
19021 /* We make changes on a copy as we may still bail out. */
19022 for (int i
= 0; i
< 8; i
++)
19023 temp_operands
[i
] = operands
[i
];
19025 /* Sort the operands. */
19026 qsort (temp_operands
, 4, 2 * sizeof (rtx
*), aarch64_ldrstr_offset_compare
);
19028 /* Copy the memory operands so that if we have to bail for some
19029 reason the original addresses are unchanged. */
19032 mem_1
= copy_rtx (temp_operands
[1]);
19033 mem_2
= copy_rtx (temp_operands
[3]);
19034 mem_3
= copy_rtx (temp_operands
[5]);
19035 mem_4
= copy_rtx (temp_operands
[7]);
19039 mem_1
= copy_rtx (temp_operands
[0]);
19040 mem_2
= copy_rtx (temp_operands
[2]);
19041 mem_3
= copy_rtx (temp_operands
[4]);
19042 mem_4
= copy_rtx (temp_operands
[6]);
19043 gcc_assert (code
== UNKNOWN
);
19046 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
19047 extract_base_offset_in_addr (mem_3
, &base
, &offset_3
);
19048 gcc_assert (base
!= NULL_RTX
&& offset_1
!= NULL_RTX
19049 && offset_3
!= NULL_RTX
);
19051 /* Adjust offset so it can fit in LDP/STP instruction. */
19052 msize
= GET_MODE_SIZE (mode
);
19053 stp_off_upper_limit
= msize
* (0x40 - 1);
19054 stp_off_lower_limit
= - msize
* 0x40;
19056 off_val_1
= INTVAL (offset_1
);
19057 off_val_3
= INTVAL (offset_3
);
19059 /* The base offset is optimally half way between the two STP/LDP offsets. */
19061 base_off
= (off_val_1
+ off_val_3
) / 2;
19063 /* However, due to issues with negative LDP/STP offset generation for
19064 larger modes, for DF, DI and vector modes. we must not use negative
19065 addresses smaller than 9 signed unadjusted bits can store. This
19066 provides the most range in this case. */
19067 base_off
= off_val_1
;
19069 /* Adjust the base so that it is aligned with the addresses but still
19071 if (base_off
% msize
!= off_val_1
% msize
)
19072 /* Fix the offset, bearing in mind we want to make it bigger not
19074 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
19075 else if (msize
<= 4)
19076 /* The negative range of LDP/STP is one larger than the positive range. */
19079 /* Check if base offset is too big or too small. We can attempt to resolve
19080 this issue by setting it to the maximum value and seeing if the offsets
19082 if (base_off
>= 0x1000)
19084 base_off
= 0x1000 - 1;
19085 /* We must still make sure that the base offset is aligned with respect
19086 to the address. But it may may not be made any bigger. */
19087 base_off
-= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
19090 /* Likewise for the case where the base is too small. */
19091 if (base_off
<= -0x1000)
19093 base_off
= -0x1000 + 1;
19094 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
19097 /* Offset of the first STP/LDP. */
19098 new_off_1
= off_val_1
- base_off
;
19100 /* Offset of the second STP/LDP. */
19101 new_off_3
= off_val_3
- base_off
;
19103 /* The offsets must be within the range of the LDP/STP instructions. */
19104 if (new_off_1
> stp_off_upper_limit
|| new_off_1
< stp_off_lower_limit
19105 || new_off_3
> stp_off_upper_limit
|| new_off_3
< stp_off_lower_limit
)
19108 replace_equiv_address_nv (mem_1
, plus_constant (Pmode
, operands
[8],
19110 replace_equiv_address_nv (mem_2
, plus_constant (Pmode
, operands
[8],
19111 new_off_1
+ msize
), true);
19112 replace_equiv_address_nv (mem_3
, plus_constant (Pmode
, operands
[8],
19114 replace_equiv_address_nv (mem_4
, plus_constant (Pmode
, operands
[8],
19115 new_off_3
+ msize
), true);
19117 if (!aarch64_mem_pair_operand (mem_1
, mode
)
19118 || !aarch64_mem_pair_operand (mem_3
, mode
))
19121 if (code
== ZERO_EXTEND
)
19123 mem_1
= gen_rtx_ZERO_EXTEND (DImode
, mem_1
);
19124 mem_2
= gen_rtx_ZERO_EXTEND (DImode
, mem_2
);
19125 mem_3
= gen_rtx_ZERO_EXTEND (DImode
, mem_3
);
19126 mem_4
= gen_rtx_ZERO_EXTEND (DImode
, mem_4
);
19128 else if (code
== SIGN_EXTEND
)
19130 mem_1
= gen_rtx_SIGN_EXTEND (DImode
, mem_1
);
19131 mem_2
= gen_rtx_SIGN_EXTEND (DImode
, mem_2
);
19132 mem_3
= gen_rtx_SIGN_EXTEND (DImode
, mem_3
);
19133 mem_4
= gen_rtx_SIGN_EXTEND (DImode
, mem_4
);
19138 operands
[0] = temp_operands
[0];
19139 operands
[1] = mem_1
;
19140 operands
[2] = temp_operands
[2];
19141 operands
[3] = mem_2
;
19142 operands
[4] = temp_operands
[4];
19143 operands
[5] = mem_3
;
19144 operands
[6] = temp_operands
[6];
19145 operands
[7] = mem_4
;
19149 operands
[0] = mem_1
;
19150 operands
[1] = temp_operands
[1];
19151 operands
[2] = mem_2
;
19152 operands
[3] = temp_operands
[3];
19153 operands
[4] = mem_3
;
19154 operands
[5] = temp_operands
[5];
19155 operands
[6] = mem_4
;
19156 operands
[7] = temp_operands
[7];
19159 /* Emit adjusting instruction. */
19160 emit_insn (gen_rtx_SET (operands
[8], plus_constant (DImode
, base
, base_off
)));
19161 /* Emit ldp/stp instructions. */
19162 t1
= gen_rtx_SET (operands
[0], operands
[1]);
19163 t2
= gen_rtx_SET (operands
[2], operands
[3]);
19164 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
19165 t1
= gen_rtx_SET (operands
[4], operands
[5]);
19166 t2
= gen_rtx_SET (operands
[6], operands
[7]);
19167 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
/* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.  Assume for now that
   it isn't worth branching around empty masked ops (including masked
   stores).  */

static bool
aarch64_empty_mask_is_expensive (unsigned)
{
  return false;
}
19181 /* Return 1 if pseudo register should be created and used to hold
19182 GOT address for PIC code. */
19185 aarch64_use_pseudo_pic_reg (void)
19187 return aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
;
19190 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
19193 aarch64_unspec_may_trap_p (const_rtx x
, unsigned flags
)
19195 switch (XINT (x
, 1))
19197 case UNSPEC_GOTSMALLPIC
:
19198 case UNSPEC_GOTSMALLPIC28K
:
19199 case UNSPEC_GOTTINYPIC
:
19205 return default_unspec_may_trap_p (x
, flags
);
19209 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
19210 return the log2 of that value. Otherwise return -1. */
19213 aarch64_fpconst_pow_of_2 (rtx x
)
19215 const REAL_VALUE_TYPE
*r
;
19217 if (!CONST_DOUBLE_P (x
))
19220 r
= CONST_DOUBLE_REAL_VALUE (x
);
19222 if (REAL_VALUE_NEGATIVE (*r
)
19223 || REAL_VALUE_ISNAN (*r
)
19224 || REAL_VALUE_ISINF (*r
)
19225 || !real_isinteger (r
, DFmode
))
19228 return exact_log2 (real_to_integer (r
));
19231 /* If X is a vector of equal CONST_DOUBLE values and that value is
19232 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
19235 aarch64_vec_fpconst_pow_of_2 (rtx x
)
19238 if (GET_CODE (x
) != CONST_VECTOR
19239 || !CONST_VECTOR_NUNITS (x
).is_constant (&nelts
))
19242 if (GET_MODE_CLASS (GET_MODE (x
)) != MODE_VECTOR_FLOAT
)
19245 int firstval
= aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, 0));
19249 for (int i
= 1; i
< nelts
; i
++)
19250 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, i
)) != firstval
)
19256 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
19259 __fp16 always promotes through this hook.
19260 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
19261 through the generic excess precision logic rather than here. */
19264 aarch64_promoted_type (const_tree t
)
19266 if (SCALAR_FLOAT_TYPE_P (t
)
19267 && TYPE_MAIN_VARIANT (t
) == aarch64_fp16_type_node
)
19268 return float_type_node
;
19273 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
19276 aarch64_optab_supported_p (int op
, machine_mode mode1
, machine_mode
,
19277 optimization_type opt_type
)
19282 return opt_type
== OPTIMIZE_FOR_SPEED
&& use_rsqrt_p (mode1
);
19289 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
19291 static unsigned int
19292 aarch64_dwarf_poly_indeterminate_value (unsigned int i
, unsigned int *factor
,
19295 /* Polynomial invariant 1 == (VG / 2) - 1. */
19296 gcc_assert (i
== 1);
19299 return AARCH64_DWARF_VG
;
19302 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
19303 if MODE is HFmode, and punt to the generic implementation otherwise. */
19306 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode
)
19308 return (mode
== HFmode
19310 : default_libgcc_floating_mode_supported_p (mode
));
19313 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
19314 if MODE is HFmode, and punt to the generic implementation otherwise. */
19317 aarch64_scalar_mode_supported_p (scalar_mode mode
)
19319 return (mode
== HFmode
19321 : default_scalar_mode_supported_p (mode
));
19324 /* Set the value of FLT_EVAL_METHOD.
19325 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
19327 0: evaluate all operations and constants, whose semantic type has at
19328 most the range and precision of type float, to the range and
19329 precision of float; evaluate all other operations and constants to
19330 the range and precision of the semantic type;
19332 N, where _FloatN is a supported interchange floating type
19333 evaluate all operations and constants, whose semantic type has at
19334 most the range and precision of _FloatN type, to the range and
19335 precision of the _FloatN type; evaluate all other operations and
19336 constants to the range and precision of the semantic type;
19338 If we have the ARMv8.2-A extensions then we support _Float16 in native
19339 precision, so we should set this to 16. Otherwise, we support the type,
19340 but want to evaluate expressions in float precision, so set this to
19343 static enum flt_eval_method
19344 aarch64_excess_precision (enum excess_precision_type type
)
19348 case EXCESS_PRECISION_TYPE_FAST
:
19349 case EXCESS_PRECISION_TYPE_STANDARD
:
19350 /* We can calculate either in 16-bit range and precision or
19351 32-bit range and precision. Make that decision based on whether
19352 we have native support for the ARMv8.2-A 16-bit floating-point
19353 instructions or not. */
19354 return (TARGET_FP_F16INST
19355 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
19356 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
);
19357 case EXCESS_PRECISION_TYPE_IMPLICIT
:
19358 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
;
19360 gcc_unreachable ();
19362 return FLT_EVAL_METHOD_UNPREDICTABLE
;
19365 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
19366 scheduled for speculative execution. Reject the long-running division
19367 and square-root instructions. */
19370 aarch64_sched_can_speculate_insn (rtx_insn
*insn
)
19372 switch (get_attr_type (insn
))
19380 case TYPE_NEON_FP_SQRT_S
:
19381 case TYPE_NEON_FP_SQRT_D
:
19382 case TYPE_NEON_FP_SQRT_S_Q
:
19383 case TYPE_NEON_FP_SQRT_D_Q
:
19384 case TYPE_NEON_FP_DIV_S
:
19385 case TYPE_NEON_FP_DIV_D
:
19386 case TYPE_NEON_FP_DIV_S_Q
:
19387 case TYPE_NEON_FP_DIV_D_Q
:
19394 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
19397 aarch64_compute_pressure_classes (reg_class
*classes
)
19400 classes
[i
++] = GENERAL_REGS
;
19401 classes
[i
++] = FP_REGS
;
19402 /* PR_REGS isn't a useful pressure class because many predicate pseudo
19403 registers need to go in PR_LO_REGS at some point during their
19404 lifetime. Splitting it into two halves has the effect of making
19405 all predicates count against PR_LO_REGS, so that we try whenever
19406 possible to restrict the number of live predicates to 8. This
19407 greatly reduces the amount of spilling in certain loops. */
19408 classes
[i
++] = PR_LO_REGS
;
19409 classes
[i
++] = PR_HI_REGS
;
19413 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
19416 aarch64_can_change_mode_class (machine_mode from
,
19417 machine_mode to
, reg_class_t
)
19419 if (BYTES_BIG_ENDIAN
)
19421 bool from_sve_p
= aarch64_sve_data_mode_p (from
);
19422 bool to_sve_p
= aarch64_sve_data_mode_p (to
);
19424 /* Don't allow changes between SVE data modes and non-SVE modes.
19425 See the comment at the head of aarch64-sve.md for details. */
19426 if (from_sve_p
!= to_sve_p
)
19429 /* Don't allow changes in element size: lane 0 of the new vector
19430 would not then be lane 0 of the old vector. See the comment
19431 above aarch64_maybe_expand_sve_subreg_move for a more detailed
19434 In the worst case, this forces a register to be spilled in
19435 one mode and reloaded in the other, which handles the
19436 endianness correctly. */
19437 if (from_sve_p
&& GET_MODE_UNIT_SIZE (from
) != GET_MODE_UNIT_SIZE (to
))
19443 /* Implement TARGET_EARLY_REMAT_MODES. */
19446 aarch64_select_early_remat_modes (sbitmap modes
)
19448 /* SVE values are not normally live across a call, so it should be
19449 worth doing early rematerialization even in VL-specific mode. */
19450 for (int i
= 0; i
< NUM_MACHINE_MODES
; ++i
)
19452 machine_mode mode
= (machine_mode
) i
;
19453 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
19454 if (vec_flags
& VEC_ANY_SVE
)
19455 bitmap_set_bit (modes
, i
);
19459 /* Override the default target speculation_safe_value. */
19461 aarch64_speculation_safe_value (machine_mode mode
,
19462 rtx result
, rtx val
, rtx failval
)
19464 /* Maybe we should warn if falling back to hard barriers. They are
19465 likely to be noticably more expensive than the alternative below. */
19466 if (!aarch64_track_speculation
)
19467 return default_speculation_safe_value (mode
, result
, val
, failval
);
19470 val
= copy_to_mode_reg (mode
, val
);
19472 if (!aarch64_reg_or_zero (failval
, mode
))
19473 failval
= copy_to_mode_reg (mode
, failval
);
19475 emit_insn (gen_despeculate_copy (mode
, result
, val
, failval
));
19479 /* Implement TARGET_ESTIMATED_POLY_VALUE.
19480 Look into the tuning structure for an estimate.
19481 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
19482 Advanced SIMD 128 bits. */
19484 static HOST_WIDE_INT
19485 aarch64_estimated_poly_value (poly_int64 val
)
19487 enum aarch64_sve_vector_bits_enum width_source
19488 = aarch64_tune_params
.sve_width
;
19490 /* If we still don't have an estimate, use the default. */
19491 if (width_source
== SVE_SCALABLE
)
19492 return default_estimated_poly_value (val
);
19494 HOST_WIDE_INT over_128
= width_source
- 128;
19495 return val
.coeffs
[0] + val
.coeffs
[1] * over_128
/ 128;
19499 /* Return true for types that could be supported as SIMD return or
19503 supported_simd_type (tree t
)
19505 if (SCALAR_FLOAT_TYPE_P (t
) || INTEGRAL_TYPE_P (t
) || POINTER_TYPE_P (t
))
19507 HOST_WIDE_INT s
= tree_to_shwi (TYPE_SIZE_UNIT (t
));
19508 return s
== 1 || s
== 2 || s
== 4 || s
== 8;
19513 /* Return true for types that currently are supported as SIMD return
19514 or argument types. */
19517 currently_supported_simd_type (tree t
, tree b
)
19519 if (COMPLEX_FLOAT_TYPE_P (t
))
19522 if (TYPE_SIZE (t
) != TYPE_SIZE (b
))
19525 return supported_simd_type (t
);
19528 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
19531 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node
*node
,
19532 struct cgraph_simd_clone
*clonei
,
19533 tree base_type
, int num
)
19535 tree t
, ret_type
, arg_type
;
19536 unsigned int elt_bits
, vec_bits
, count
;
19541 if (clonei
->simdlen
19542 && (clonei
->simdlen
< 2
19543 || clonei
->simdlen
> 1024
19544 || (clonei
->simdlen
& (clonei
->simdlen
- 1)) != 0))
19546 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
19547 "unsupported simdlen %d", clonei
->simdlen
);
19551 ret_type
= TREE_TYPE (TREE_TYPE (node
->decl
));
19552 if (TREE_CODE (ret_type
) != VOID_TYPE
19553 && !currently_supported_simd_type (ret_type
, base_type
))
19555 if (TYPE_SIZE (ret_type
) != TYPE_SIZE (base_type
))
19556 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
19557 "GCC does not currently support mixed size types "
19558 "for %<simd%> functions");
19559 else if (supported_simd_type (ret_type
))
19560 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
19561 "GCC does not currently support return type %qT "
19562 "for %<simd%> functions", ret_type
);
19564 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
19565 "unsupported return type %qT for %<simd%> functions",
19570 for (t
= DECL_ARGUMENTS (node
->decl
); t
; t
= DECL_CHAIN (t
))
19572 arg_type
= TREE_TYPE (t
);
19574 if (!currently_supported_simd_type (arg_type
, base_type
))
19576 if (TYPE_SIZE (arg_type
) != TYPE_SIZE (base_type
))
19577 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
19578 "GCC does not currently support mixed size types "
19579 "for %<simd%> functions");
19581 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
19582 "GCC does not currently support argument type %qT "
19583 "for %<simd%> functions", arg_type
);
19588 clonei
->vecsize_mangle
= 'n';
19589 clonei
->mask_mode
= VOIDmode
;
19590 elt_bits
= GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type
));
19591 if (clonei
->simdlen
== 0)
19594 vec_bits
= (num
== 0 ? 64 : 128);
19595 clonei
->simdlen
= vec_bits
/ elt_bits
;
19600 vec_bits
= clonei
->simdlen
* elt_bits
;
19601 if (vec_bits
!= 64 && vec_bits
!= 128)
19603 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
19604 "GCC does not currently support simdlen %d for type %qT",
19605 clonei
->simdlen
, base_type
);
19609 clonei
->vecsize_int
= vec_bits
;
19610 clonei
->vecsize_float
= vec_bits
;
19614 /* Implement TARGET_SIMD_CLONE_ADJUST. */
19617 aarch64_simd_clone_adjust (struct cgraph_node
*node
)
19619 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
19620 use the correct ABI. */
19622 tree t
= TREE_TYPE (node
->decl
);
19623 TYPE_ATTRIBUTES (t
) = make_attribute ("aarch64_vector_pcs", "default",
19624 TYPE_ATTRIBUTES (t
));
19627 /* Implement TARGET_SIMD_CLONE_USABLE. */
19630 aarch64_simd_clone_usable (struct cgraph_node
*node
)
19632 switch (node
->simdclone
->vecsize_mangle
)
19639 gcc_unreachable ();
19643 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
19646 aarch64_comp_type_attributes (const_tree type1
, const_tree type2
)
19648 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1
))
19649 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2
)))
19654 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
19656 static const char *
19657 aarch64_get_multilib_abi_name (void)
19659 if (TARGET_BIG_END
)
19660 return TARGET_ILP32
? "aarch64_be_ilp32" : "aarch64_be";
19661 return TARGET_ILP32
? "aarch64_ilp32" : "aarch64";
19664 /* Implement TARGET_STACK_PROTECT_GUARD. In case of a
19665 global variable based guard use the default else
19666 return a null tree. */
19668 aarch64_stack_protect_guard (void)
19670 if (aarch64_stack_protector_guard
== SSP_GLOBAL
)
19671 return default_stack_protect_guard ();
19676 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
19677 section at the end if needed. */
19678 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
19679 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
19680 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
19682 aarch64_file_end_indicate_exec_stack ()
19684 file_end_indicate_exec_stack ();
19686 unsigned feature_1_and
= 0;
19687 if (aarch64_bti_enabled ())
19688 feature_1_and
|= GNU_PROPERTY_AARCH64_FEATURE_1_BTI
;
19690 if (aarch64_ra_sign_scope
!= AARCH64_FUNCTION_NONE
)
19691 feature_1_and
|= GNU_PROPERTY_AARCH64_FEATURE_1_PAC
;
19695 /* Generate .note.gnu.property section. */
19696 switch_to_section (get_section (".note.gnu.property",
19697 SECTION_NOTYPE
, NULL
));
19699 /* PT_NOTE header: namesz, descsz, type.
19700 namesz = 4 ("GNU\0")
19701 descsz = 16 (Size of the program property array)
19702 [(12 + padding) * Number of array elements]
19703 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
19704 assemble_align (POINTER_SIZE
);
19705 assemble_integer (GEN_INT (4), 4, 32, 1);
19706 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES
)), 4, 32, 1);
19707 assemble_integer (GEN_INT (5), 4, 32, 1);
19709 /* PT_NOTE name. */
19710 assemble_string ("GNU", 4);
19712 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
19713 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
19715 data = feature_1_and. */
19716 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND
), 4, 32, 1);
19717 assemble_integer (GEN_INT (4), 4, 32, 1);
19718 assemble_integer (GEN_INT (feature_1_and
), 4, 32, 1);
19720 /* Pad the size of the note to the required alignment. */
19721 assemble_align (POINTER_SIZE
);
19724 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
19725 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
19726 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
/* Target-specific selftests.  */

#if CHECKING_P

namespace selftest {

/* Selftest for the RTL loader.
   Verify that the RTL loader copes with a dump from
   print_rtx_function.  This is essentially just a test that class
   function_reader can handle a real dump, but it also verifies
   that lookup_reg_by_dump_name correctly handles hard regs.
   The presence of hard reg names in the dump means that the test is
   target-specific, hence it is in this file.  */

static void
aarch64_test_loading_full_dump ()
{
  rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));

  ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));

  rtx_insn *insn_1 = get_insn_by_uid (1);
  ASSERT_EQ (NOTE, GET_CODE (insn_1));

  rtx_insn *insn_15 = get_insn_by_uid (15);
  ASSERT_EQ (INSN, GET_CODE (insn_15));
  ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));

  /* Verify crtl->return_rtx.  */
  ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
  ASSERT_EQ (0, REGNO (crtl->return_rtx));
  ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
}

/* Run all target-specific selftests.  */

static void
aarch64_run_selftests (void)
{
  aarch64_test_loading_full_dump ();
}

} // namespace selftest

#endif /* #if CHECKING_P */
19774 #undef TARGET_STACK_PROTECT_GUARD
19775 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
19777 #undef TARGET_ADDRESS_COST
19778 #define TARGET_ADDRESS_COST aarch64_address_cost
19780 /* This hook will determines whether unnamed bitfields affect the alignment
19781 of the containing structure. The hook returns true if the structure
19782 should inherit the alignment requirements of an unnamed bitfield's
19784 #undef TARGET_ALIGN_ANON_BITFIELD
19785 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
19787 #undef TARGET_ASM_ALIGNED_DI_OP
19788 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
19790 #undef TARGET_ASM_ALIGNED_HI_OP
19791 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
19793 #undef TARGET_ASM_ALIGNED_SI_OP
19794 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
19796 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
19797 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
19798 hook_bool_const_tree_hwi_hwi_const_tree_true
19800 #undef TARGET_ASM_FILE_START
19801 #define TARGET_ASM_FILE_START aarch64_start_file
19803 #undef TARGET_ASM_OUTPUT_MI_THUNK
19804 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
19806 #undef TARGET_ASM_SELECT_RTX_SECTION
19807 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
19809 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
19810 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
19812 #undef TARGET_BUILD_BUILTIN_VA_LIST
19813 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
19815 #undef TARGET_CALLEE_COPIES
19816 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
19818 #undef TARGET_CAN_ELIMINATE
19819 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
19821 #undef TARGET_CAN_INLINE_P
19822 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
19824 #undef TARGET_CANNOT_FORCE_CONST_MEM
19825 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
19827 #undef TARGET_CASE_VALUES_THRESHOLD
19828 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
19830 #undef TARGET_CONDITIONAL_REGISTER_USAGE
19831 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
19833 /* Only the least significant bit is used for initialization guard
19835 #undef TARGET_CXX_GUARD_MASK_BIT
19836 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
19838 #undef TARGET_C_MODE_FOR_SUFFIX
19839 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
19841 #ifdef TARGET_BIG_ENDIAN_DEFAULT
19842 #undef TARGET_DEFAULT_TARGET_FLAGS
19843 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
19846 #undef TARGET_CLASS_MAX_NREGS
19847 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
19849 #undef TARGET_BUILTIN_DECL
19850 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
19852 #undef TARGET_BUILTIN_RECIPROCAL
19853 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
19855 #undef TARGET_C_EXCESS_PRECISION
19856 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
19858 #undef TARGET_EXPAND_BUILTIN
19859 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
19861 #undef TARGET_EXPAND_BUILTIN_VA_START
19862 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
19864 #undef TARGET_FOLD_BUILTIN
19865 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
19867 #undef TARGET_FUNCTION_ARG
19868 #define TARGET_FUNCTION_ARG aarch64_function_arg
19870 #undef TARGET_FUNCTION_ARG_ADVANCE
19871 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
19873 #undef TARGET_FUNCTION_ARG_BOUNDARY
19874 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
19876 #undef TARGET_FUNCTION_ARG_PADDING
19877 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
19879 #undef TARGET_GET_RAW_RESULT_MODE
19880 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
19881 #undef TARGET_GET_RAW_ARG_MODE
19882 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
19884 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
19885 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
19887 #undef TARGET_FUNCTION_VALUE
19888 #define TARGET_FUNCTION_VALUE aarch64_function_value
19890 #undef TARGET_FUNCTION_VALUE_REGNO_P
19891 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
19893 #undef TARGET_GIMPLE_FOLD_BUILTIN
19894 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
19896 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
19897 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
19899 #undef TARGET_INIT_BUILTINS
19900 #define TARGET_INIT_BUILTINS aarch64_init_builtins
19902 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
19903 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
19904 aarch64_ira_change_pseudo_allocno_class
19906 #undef TARGET_LEGITIMATE_ADDRESS_P
19907 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
19909 #undef TARGET_LEGITIMATE_CONSTANT_P
19910 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
19912 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
19913 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
19914 aarch64_legitimize_address_displacement
19916 #undef TARGET_LIBGCC_CMP_RETURN_MODE
19917 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
19919 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
19920 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
19921 aarch64_libgcc_floating_mode_supported_p
19923 #undef TARGET_MANGLE_TYPE
19924 #define TARGET_MANGLE_TYPE aarch64_mangle_type
19926 #undef TARGET_MEMORY_MOVE_COST
19927 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
19929 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
19930 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
19932 #undef TARGET_MUST_PASS_IN_STACK
19933 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
19935 /* This target hook should return true if accesses to volatile bitfields
19936 should use the narrowest mode possible. It should return false if these
19937 accesses should use the bitfield container type. */
19938 #undef TARGET_NARROW_VOLATILE_BITFIELD
19939 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
19941 #undef TARGET_OPTION_OVERRIDE
19942 #define TARGET_OPTION_OVERRIDE aarch64_override_options
19944 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
19945 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
19946 aarch64_override_options_after_change
19948 #undef TARGET_OPTION_SAVE
19949 #define TARGET_OPTION_SAVE aarch64_option_save
19951 #undef TARGET_OPTION_RESTORE
19952 #define TARGET_OPTION_RESTORE aarch64_option_restore
19954 #undef TARGET_OPTION_PRINT
19955 #define TARGET_OPTION_PRINT aarch64_option_print
19957 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
19958 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
19960 #undef TARGET_SET_CURRENT_FUNCTION
19961 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
19963 #undef TARGET_PASS_BY_REFERENCE
19964 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
19966 #undef TARGET_PREFERRED_RELOAD_CLASS
19967 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
19969 #undef TARGET_SCHED_REASSOCIATION_WIDTH
19970 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
19972 #undef TARGET_PROMOTED_TYPE
19973 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
19975 #undef TARGET_SECONDARY_RELOAD
19976 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
19978 #undef TARGET_SHIFT_TRUNCATION_MASK
19979 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
19981 #undef TARGET_SETUP_INCOMING_VARARGS
19982 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
19984 #undef TARGET_STRUCT_VALUE_RTX
19985 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
19987 #undef TARGET_REGISTER_MOVE_COST
19988 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
19990 #undef TARGET_RETURN_IN_MEMORY
19991 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
19993 #undef TARGET_RETURN_IN_MSB
19994 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
19996 #undef TARGET_RTX_COSTS
19997 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
19999 #undef TARGET_SCALAR_MODE_SUPPORTED_P
20000 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
20002 #undef TARGET_SCHED_ISSUE_RATE
20003 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
20005 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
20006 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
20007 aarch64_sched_first_cycle_multipass_dfa_lookahead
20009 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
20010 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
20011 aarch64_first_cycle_multipass_dfa_lookahead_guard
20013 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
20014 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
20015 aarch64_get_separate_components
20017 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
20018 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
20019 aarch64_components_for_bb
20021 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
20022 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
20023 aarch64_disqualify_components
20025 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
20026 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
20027 aarch64_emit_prologue_components
20029 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
20030 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
20031 aarch64_emit_epilogue_components
20033 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
20034 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
20035 aarch64_set_handled_components
20037 #undef TARGET_TRAMPOLINE_INIT
20038 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
20040 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
20041 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
20043 #undef TARGET_VECTOR_MODE_SUPPORTED_P
20044 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
20046 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
20047 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
20048 aarch64_builtin_support_vector_misalignment
20050 #undef TARGET_ARRAY_MODE
20051 #define TARGET_ARRAY_MODE aarch64_array_mode
20053 #undef TARGET_ARRAY_MODE_SUPPORTED_P
20054 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
20056 #undef TARGET_VECTORIZE_ADD_STMT_COST
20057 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
20059 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
20060 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
20061 aarch64_builtin_vectorization_cost
20063 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
20064 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
20066 #undef TARGET_VECTORIZE_BUILTINS
20067 #define TARGET_VECTORIZE_BUILTINS
20069 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
20070 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
20071 aarch64_builtin_vectorized_function
20073 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
20074 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
20075 aarch64_autovectorize_vector_sizes
20077 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
20078 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
20079 aarch64_atomic_assign_expand_fenv
20081 /* Section anchor support. */
20083 #undef TARGET_MIN_ANCHOR_OFFSET
20084 #define TARGET_MIN_ANCHOR_OFFSET -256
20086 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
20087 byte offset; we can do much more for larger data types, but have no way
20088 to determine the size of the access. We assume accesses are aligned. */
20089 #undef TARGET_MAX_ANCHOR_OFFSET
20090 #define TARGET_MAX_ANCHOR_OFFSET 4095
20092 #undef TARGET_VECTOR_ALIGNMENT
20093 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
20095 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
20096 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
20097 aarch64_vectorize_preferred_vector_alignment
20098 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
20099 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
20100 aarch64_simd_vector_alignment_reachable
20102 /* vec_perm support. */
20104 #undef TARGET_VECTORIZE_VEC_PERM_CONST
20105 #define TARGET_VECTORIZE_VEC_PERM_CONST \
20106 aarch64_vectorize_vec_perm_const
20108 #undef TARGET_VECTORIZE_GET_MASK_MODE
20109 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
20110 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
20111 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
20112 aarch64_empty_mask_is_expensive
20113 #undef TARGET_PREFERRED_ELSE_VALUE
20114 #define TARGET_PREFERRED_ELSE_VALUE \
20115 aarch64_preferred_else_value
20117 #undef TARGET_INIT_LIBFUNCS
20118 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
20120 #undef TARGET_FIXED_CONDITION_CODE_REGS
20121 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
20123 #undef TARGET_FLAGS_REGNUM
20124 #define TARGET_FLAGS_REGNUM CC_REGNUM
20126 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
20127 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
20129 #undef TARGET_ASAN_SHADOW_OFFSET
20130 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
20132 #undef TARGET_LEGITIMIZE_ADDRESS
20133 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
20135 #undef TARGET_SCHED_CAN_SPECULATE_INSN
20136 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
20138 #undef TARGET_CAN_USE_DOLOOP_P
20139 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
20141 #undef TARGET_SCHED_ADJUST_PRIORITY
20142 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
20144 #undef TARGET_SCHED_MACRO_FUSION_P
20145 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
20147 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
20148 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
20150 #undef TARGET_SCHED_FUSION_PRIORITY
20151 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
20153 #undef TARGET_UNSPEC_MAY_TRAP_P
20154 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
20156 #undef TARGET_USE_PSEUDO_PIC_REG
20157 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
20159 #undef TARGET_PRINT_OPERAND
20160 #define TARGET_PRINT_OPERAND aarch64_print_operand
20162 #undef TARGET_PRINT_OPERAND_ADDRESS
20163 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
20165 #undef TARGET_OPTAB_SUPPORTED_P
20166 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
20168 #undef TARGET_OMIT_STRUCT_RETURN_REG
20169 #define TARGET_OMIT_STRUCT_RETURN_REG true
20171 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
20172 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
20173 aarch64_dwarf_poly_indeterminate_value
20175 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
20176 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
20177 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
20179 #undef TARGET_HARD_REGNO_NREGS
20180 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
20181 #undef TARGET_HARD_REGNO_MODE_OK
20182 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
20184 #undef TARGET_MODES_TIEABLE_P
20185 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
20187 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
20188 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
20189 aarch64_hard_regno_call_part_clobbered
20191 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
20192 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
20193 aarch64_remove_extra_call_preserved_regs
20195 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
20196 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
20197 aarch64_return_call_with_max_clobbers
20199 #undef TARGET_CONSTANT_ALIGNMENT
20200 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
20202 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
20203 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
20204 aarch64_stack_clash_protection_alloca_probe_range
20206 #undef TARGET_COMPUTE_PRESSURE_CLASSES
20207 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
20209 #undef TARGET_CAN_CHANGE_MODE_CLASS
20210 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
20212 #undef TARGET_SELECT_EARLY_REMAT_MODES
20213 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
20215 #undef TARGET_SPECULATION_SAFE_VALUE
20216 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
20218 #undef TARGET_ESTIMATED_POLY_VALUE
20219 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
20221 #undef TARGET_ATTRIBUTE_TABLE
20222 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
20224 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
20225 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
20226 aarch64_simd_clone_compute_vecsize_and_simdlen
20228 #undef TARGET_SIMD_CLONE_ADJUST
20229 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
20231 #undef TARGET_SIMD_CLONE_USABLE
20232 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
20234 #undef TARGET_COMP_TYPE_ATTRIBUTES
20235 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
20237 #undef TARGET_GET_MULTILIB_ABI_NAME
20238 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
20241 #undef TARGET_RUN_TARGET_SELFTESTS
20242 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
20243 #endif /* #if CHECKING_P */
20245 #undef TARGET_ASM_POST_CFI_STARTPROC
20246 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
20248 struct gcc_target targetm
= TARGET_INITIALIZER
;
20250 #include "gt-aarch64.h"